diff --git a/cub/benchmarks/bench/find_if/base.cu b/cub/benchmarks/bench/find_if/base.cu index 25312b158a..8183a04a14 100644 --- a/cub/benchmarks/bench/find_if/base.cu +++ b/cub/benchmarks/bench/find_if/base.cu @@ -33,24 +33,30 @@ #include template -struct equals_100 +struct equals { - __device__ bool operator()(T i) + T val; + equals(T _val) + : val(_val) + {} + + __device__ __host__ bool operator()(T i) { - return i == 1; - } // @amd you 'll never find out the secret sauce + return i == val; + } }; template void find_if(nvbench::state& state, nvbench::type_list) { + T val = 1; // set up input - const auto elements = state.get_int64("Elements"); - const auto common_prefix = state.get_float64("CommonPrefixRatio"); - const auto same_elements = elements * common_prefix; + const auto elements = state.get_int64("Elements"); + const auto common_prefix = state.get_float64("MismatchAt"); + const auto mismatch_point = elements * common_prefix; thrust::device_vector dinput(elements, 0); - thrust::fill(dinput.begin() + same_elements, dinput.end(), 1); + thrust::fill(dinput.begin() + mismatch_point, dinput.end(), val); thrust::device_vector d_result(1); /// @@ -58,7 +64,13 @@ void find_if(nvbench::state& state, nvbench::type_list) size_t temp_storage_bytes{}; cub::DeviceFind::FindIf( - d_temp_storage, temp_storage_bytes, dinput.begin(), d_result.begin(), equals_100{}, dinput.size(), 0); + d_temp_storage, + temp_storage_bytes, + thrust::raw_pointer_cast(dinput.data()), + thrust::raw_pointer_cast(d_result.data()), + equals{val}, + dinput.size(), + 0); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); @@ -67,61 +79,14 @@ void find_if(nvbench::state& state, nvbench::type_list) cub::DeviceFind::FindIf( d_temp_storage, temp_storage_bytes, - dinput.begin(), - d_result.begin(), - equals_100{}, + thrust::raw_pointer_cast(dinput.data()), + thrust::raw_pointer_cast(d_result.data()), + equals{val}, dinput.size(), launch.get_stream()); }); } -NVBENCH_BENCH_TYPES(find_if, NVBENCH_TYPE_AXES(nvbench::type_list /*integral_types*/)) - .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) - .add_float64_axis("CommonPrefixRatio", std::vector{1.0, 0.5, 0.0}); - -////////////////////////////////////////////////////// -template -void thrust_find_if(nvbench::state& state, nvbench::type_list) -{ - // set up input - const auto elements = state.get_int64("Elements"); - const auto common_prefix = state.get_float64("CommonPrefixRatio"); - const auto same_elements = elements * common_prefix; - - thrust::device_vector dinput(elements, 0); - thrust::fill(dinput.begin() + same_elements, dinput.end(), 1); - /// - - caching_allocator_t alloc; - thrust::find_if(policy(alloc), dinput.begin(), dinput.end(), equals_100{}); - - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - thrust::find_if(policy(alloc, launch), dinput.begin(), dinput.end(), equals_100{}); - }); -} -NVBENCH_BENCH_TYPES(thrust_find_if, NVBENCH_TYPE_AXES(nvbench::type_list /*integral_types*/)) - .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) - .add_float64_axis("CommonPrefixRatio", std::vector{1.0, 0.5, 0.0}); - -////////////////////////////////////////////////////// -template -void thrust_count_if(nvbench::state& state, nvbench::type_list) -{ - // set up input - const auto elements = state.get_int64("Elements"); - const auto common_prefix = state.get_float64("CommonPrefixRatio"); - const auto same_elements = elements * common_prefix; - thrust::device_vector dinput(elements, 0); - thrust::fill(dinput.begin() + same_elements, dinput.end(), 1); - /// - - caching_allocator_t alloc; - thrust::count_if(policy(alloc), dinput.begin(), dinput.end(), equals_100{}); - - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - thrust::count_if(policy(alloc, launch), dinput.begin(), dinput.end(), equals_100{}); - }); -} -NVBENCH_BENCH_TYPES(thrust_count_if, NVBENCH_TYPE_AXES(nvbench::type_list /*integral_types*/)) +NVBENCH_BENCH_TYPES(find_if, NVBENCH_TYPE_AXES(fundamental_types)) .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) - .add_float64_axis("CommonPrefixRatio", std::vector{1.0, 0.5, 0.0}); + .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.0}); diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh index 365e03cd59..0adf8aeedc 100644 --- a/cub/cub/device/device_find_if.cuh +++ b/cub/cub/device/device_find_if.cuh @@ -51,6 +51,8 @@ #include #include +#include + static constexpr int elements_per_thread = 16; static constexpr int _VECTOR_LOAD_LENGTH = 4; static constexpr int block_threads = 128; @@ -129,6 +131,7 @@ __global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, st } } } + if (syncthreads_or(found)) { if (threadIdx.x == 0) @@ -166,6 +169,8 @@ struct DeviceFind NumItemsT num_items, cudaStream_t stream = 0) { + static_assert(elements_per_thread % _VECTOR_LOAD_LENGTH == 0, "No full tile at the end"); + // int items_per_thread = 2; int tile_size = block_threads * elements_per_thread; int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); diff --git a/thrust/benchmarks/bench/count_if/basic.cu b/thrust/benchmarks/bench/count_if/basic.cu new file mode 100644 index 0000000000..b672d78986 --- /dev/null +++ b/thrust/benchmarks/bench/count_if/basic.cu @@ -0,0 +1,71 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include +#include + +#include "nvbench_helper.cuh" + +template +struct equals +{ + T val; + equals(T _val) + : val(_val) + {} + + __device__ __host__ bool operator()(T i) + { + return i == val; + } +}; + +template +void count_if(nvbench::state& state, nvbench::type_list) +{ + T val = 1; + // set up input + const auto elements = static_cast(state.get_int64("Elements")); + const auto common_prefix = state.get_float64("MismatchAt"); + const auto mismatch_point = elements * common_prefix; + + thrust::device_vector dinput(elements, 0); + thrust::fill(dinput.begin() + mismatch_point, dinput.end(), val); + /// + + caching_allocator_t alloc; + thrust::count_if(policy(alloc), dinput.begin(), dinput.end(), equals{val}); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::count_if(policy(alloc, launch), dinput.begin(), dinput.end(), equals{val}); + }); +} + +NVBENCH_BENCH_TYPES(count_if, NVBENCH_TYPE_AXES(fundamental_types)) + .set_name("thrust::count_if") + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) + .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.0}); diff --git a/thrust/benchmarks/bench/find_if/basic.cu b/thrust/benchmarks/bench/find_if/basic.cu new file mode 100644 index 0000000000..8a2618e26d --- /dev/null +++ b/thrust/benchmarks/bench/find_if/basic.cu @@ -0,0 +1,71 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include +#include + +#include "nvbench_helper.cuh" + +template +struct equals +{ + T val; + equals(T _val) + : val(_val) + {} + + __device__ __host__ bool operator()(T i) + { + return i == val; + } +}; + +template +void find_if(nvbench::state& state, nvbench::type_list) +{ + T val = 1; + // set up input + const auto elements = static_cast(state.get_int64("Elements")); + const auto common_prefix = state.get_float64("MismatchAt"); + const auto mismatch_point = elements * common_prefix; + + thrust::device_vector dinput(elements, 0); + thrust::fill(dinput.begin() + mismatch_point, dinput.end(), val); + /// + + caching_allocator_t alloc; + thrust::find_if(policy(alloc), dinput.begin(), dinput.end(), equals{val}); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::find_if(policy(alloc, launch), dinput.begin(), dinput.end(), equals{val}); + }); +} + +NVBENCH_BENCH_TYPES(find_if, NVBENCH_TYPE_AXES(fundamental_types)) + .set_name("thrust::find_if") + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) + .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.0});