From 5e52320a5c1b03c68b9ae775f265db290873883d Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 30 Jun 2023 00:21:04 +0000 Subject: [PATCH 01/15] Enable hash computation from variable length keys --- .../detail/hash_functions/murmurhash3.cuh | 25 +++++-- include/cuco/detail/hash_functions/xxhash.cuh | 70 ++++++++++++++----- 2 files changed, 72 insertions(+), 23 deletions(-) diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh index ce5ab9d56..690dfa04f 100644 --- a/include/cuco/detail/hash_functions/murmurhash3.cuh +++ b/include/cuco/detail/hash_functions/murmurhash3.cuh @@ -16,6 +16,8 @@ #pragma once +#include + #include namespace cuco::detail { @@ -139,16 +141,31 @@ struct MurmurHash3_32 { */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - constexpr int len = sizeof(argument_type); + return (*this)(key, cuco::experimental::extent{}); + } + + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @tparam Extent The extent type + * + * @param key The input argument to hash + * @param size The extent of the key in bytes + * @return A resulting hash value for `key` + */ + template + constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept + { + auto const len = static_cast(size); // TODO size_t? const uint8_t* const data = (const uint8_t*)&key; - constexpr int nblocks = len / 4; + auto const nblocks = len / 4; uint32_t h1 = seed_; constexpr uint32_t c1 = 0xcc9e2d51; constexpr uint32_t c2 = 0x1b873593; //---------- // body - const uint32_t* const blocks = (const uint32_t*)(data + nblocks * 4); + uint32_t const* const blocks = (uint32_t const*)(data + nblocks * 4); for (int i = -nblocks; i; i++) { uint32_t k1 = blocks[i]; // getblock32(blocks,i); k1 *= c1; @@ -160,7 +177,7 @@ struct MurmurHash3_32 { } //---------- // tail - const uint8_t* tail = (const uint8_t*)(data + nblocks * 4); + uint8_t const* tail = (uint8_t const*)(data + nblocks * 4); uint32_t k1 = 0; switch (len & 3) { case 3: k1 ^= tail[2] << 16; diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh index 10360be9b..18553562a 100644 --- a/include/cuco/detail/hash_functions/xxhash.cuh +++ b/include/cuco/detail/hash_functions/xxhash.cuh @@ -16,6 +16,8 @@ #pragma once +#include + #include namespace cuco::detail { @@ -84,9 +86,24 @@ struct XXHash_32 { * @return A resulting hash value for `key` */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept + { + return (*this)(key, cuco::experimental::extent{}); + } + + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @tparam Extent The extent type + * + * @param key The input argument to hash + * @param size The extent of the key in bytes + * @return A resulting hash value for `key` + */ + template + constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept { // TODO do we need to add checks/hints for alignment? - constexpr auto nbytes = sizeof(Key); + auto const nbytes = size; [[maybe_unused]] auto const bytes = reinterpret_cast(&key); ///< per-byte access [[maybe_unused]] auto const blocks = reinterpret_cast(&key); ///< 4-byte word access @@ -95,12 +112,12 @@ struct XXHash_32 { std::uint32_t h32; // data can be processed in 16-byte chunks - if constexpr (nbytes >= 16) { - constexpr auto limit = nbytes - 16; - std::uint32_t v1 = seed_ + prime1 + prime2; - std::uint32_t v2 = seed_ + prime2; - std::uint32_t v3 = seed_; - std::uint32_t v4 = seed_ - prime1; + if (nbytes >= 16) { + auto const limit = nbytes - 16; + std::uint32_t v1 = seed_ + prime1 + prime2; + std::uint32_t v2 = seed_ + prime2; + std::uint32_t v3 = seed_; + std::uint32_t v4 = seed_ - prime1; do { // pipeline 4*4byte computations @@ -128,7 +145,7 @@ struct XXHash_32 { h32 += nbytes; // remaining data can be processed in 4-byte chunks - if constexpr ((nbytes % 16) >= 4) { + if ((nbytes % 16) >= 4) { for (; offset <= nbytes - 4; offset += 4) { h32 += blocks[offset / 4] * prime3; h32 = rotl(h32, 17) * prime4; @@ -136,7 +153,7 @@ struct XXHash_32 { } // the following loop is only needed if the size of the key is not a multiple of the block size - if constexpr (nbytes % 4) { + if (nbytes % 4) { while (offset < nbytes) { h32 += (bytes[offset] & 255) * prime5; h32 = rotl(h32, 11) * prime1; @@ -235,9 +252,24 @@ struct XXHash_64 { * @return A resulting hash value for `key` */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept + { + return (*this)(key, cuco::experimental::extent{}); + } + + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @tparam Extent The extent type + * + * @param key The input argument to hash + * @param size The extent of the key in bytes + * @return A resulting hash value for `key` + */ + template + constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept { // TODO do we need to add checks/hints for alignment? - constexpr auto nbytes = sizeof(Key); + auto const nbytes = static_cast(size); [[maybe_unused]] auto const bytes = reinterpret_cast(&key); ///< per-byte access [[maybe_unused]] auto const blocks4 = reinterpret_cast(&key); ///< 4-byte word access @@ -248,12 +280,12 @@ struct XXHash_64 { std::uint64_t h64; // data can be processed in 32-byte chunks - if constexpr (nbytes >= 32) { - constexpr auto limit = nbytes - 32; - std::uint64_t v1 = seed_ + prime1 + prime2; - std::uint64_t v2 = seed_ + prime2; - std::uint64_t v3 = seed_; - std::uint64_t v4 = seed_ - prime1; + if (nbytes >= 32) { + auto const limit = nbytes - 32; + std::uint64_t v1 = seed_ + prime1 + prime2; + std::uint64_t v2 = seed_ + prime2; + std::uint64_t v3 = seed_; + std::uint64_t v4 = seed_ - prime1; do { // pipeline 4*8byte computations @@ -305,7 +337,7 @@ struct XXHash_64 { h64 += nbytes; // remaining data can be processed in 8-byte chunks - if constexpr ((nbytes % 32) >= 8) { + if ((nbytes % 32) >= 8) { for (; offset <= nbytes - 8; offset += 8) { std::uint64_t k1 = blocks8[offset / 8] * prime2; k1 = rotl(k1, 31) * prime1; @@ -315,7 +347,7 @@ struct XXHash_64 { } // remaining data can be processed in 4-byte chunks - if constexpr (((nbytes % 32) % 8) >= 4) { + if (((nbytes % 32) % 8) >= 4) { for (; offset <= nbytes - 4; offset += 4) { h64 ^= (blocks4[offset / 4] & 0xffffffffull) * prime1; h64 = rotl(h64, 23) * prime2 + prime3; @@ -324,7 +356,7 @@ struct XXHash_64 { // the following loop is only needed if the size of the key is not a multiple of a previous // block size - if constexpr (nbytes % 4) { + if (nbytes % 4) { while (offset < nbytes) { h64 += (bytes[offset] & 0xff) * prime5; h64 = rotl(h64, 11) * prime1; From acd719e3b0433a61285411c6e191c626d6cd180d Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 30 Jun 2023 14:01:56 +0000 Subject: [PATCH 02/15] Revert murmurhash3 changes --- .../detail/hash_functions/murmurhash3.cuh | 27 ++++--------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh index 690dfa04f..53bbf65b1 100644 --- a/include/cuco/detail/hash_functions/murmurhash3.cuh +++ b/include/cuco/detail/hash_functions/murmurhash3.cuh @@ -16,8 +16,6 @@ #pragma once -#include - #include namespace cuco::detail { @@ -141,31 +139,16 @@ struct MurmurHash3_32 { */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - return (*this)(key, cuco::experimental::extent{}); - } - - /** - * @brief Returns a hash value for its argument, as a value of type `result_type`. - * - * @tparam Extent The extent type - * - * @param key The input argument to hash - * @param size The extent of the key in bytes - * @return A resulting hash value for `key` - */ - template - constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept - { - auto const len = static_cast(size); // TODO size_t? + constexpr int len = sizeof(argument_type); const uint8_t* const data = (const uint8_t*)&key; - auto const nblocks = len / 4; + constexpr int nblocks = len / 4; uint32_t h1 = seed_; constexpr uint32_t c1 = 0xcc9e2d51; constexpr uint32_t c2 = 0x1b873593; //---------- // body - uint32_t const* const blocks = (uint32_t const*)(data + nblocks * 4); + const uint32_t* const blocks = (const uint32_t*)(data + nblocks * 4); for (int i = -nblocks; i; i++) { uint32_t k1 = blocks[i]; // getblock32(blocks,i); k1 *= c1; @@ -177,9 +160,9 @@ struct MurmurHash3_32 { } //---------- // tail - uint8_t const* tail = (uint8_t const*)(data + nblocks * 4); + const uint8_t* tail = (const uint8_t*)(data + nblocks * 4); uint32_t k1 = 0; - switch (len & 3) { + switch (len & 3) { // TODO fix implicit fallthrough warning case 3: k1 ^= tail[2] << 16; case 2: k1 ^= tail[1] << 8; case 1: From 270a94abd0961539e4efe55e823c497c3876f992 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 30 Jun 2023 14:02:34 +0000 Subject: [PATCH 03/15] Remove useless attributes --- include/cuco/detail/hash_functions/xxhash.cuh | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh index 18553562a..ea44141c5 100644 --- a/include/cuco/detail/hash_functions/xxhash.cuh +++ b/include/cuco/detail/hash_functions/xxhash.cuh @@ -103,10 +103,9 @@ struct XXHash_32 { constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept { // TODO do we need to add checks/hints for alignment? - auto const nbytes = size; - [[maybe_unused]] auto const bytes = reinterpret_cast(&key); ///< per-byte access - [[maybe_unused]] auto const blocks = - reinterpret_cast(&key); ///< 4-byte word access + auto const nbytes = size; + auto const bytes = reinterpret_cast(&key); ///< per-byte access + auto const blocks = reinterpret_cast(&key); ///< 4-byte word access std::size_t offset = 0; std::uint32_t h32; @@ -269,12 +268,10 @@ struct XXHash_64 { constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept { // TODO do we need to add checks/hints for alignment? - auto const nbytes = static_cast(size); - [[maybe_unused]] auto const bytes = reinterpret_cast(&key); ///< per-byte access - [[maybe_unused]] auto const blocks4 = - reinterpret_cast(&key); ///< 4-byte word access - [[maybe_unused]] auto const blocks8 = - reinterpret_cast(&key); ///< 8-byte word access + auto const nbytes = static_cast(size); + auto const bytes = reinterpret_cast(&key); ///< per-byte access + auto const blocks4 = reinterpret_cast(&key); ///< 4-byte word access + auto const blocks8 = reinterpret_cast(&key); ///< 8-byte word access std::size_t offset = 0; std::uint64_t h64; From a73e3ff97b4d071b6d10526155c9f5dff8c3cab2 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 30 Jun 2023 14:03:08 +0000 Subject: [PATCH 04/15] Add unit test for dynamic vs. static key sizes --- tests/utility/hash_test.cu | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/utility/hash_test.cu b/tests/utility/hash_test.cu index 6dca70aea..4ae47d3a1 100644 --- a/tests/utility/hash_test.cu +++ b/tests/utility/hash_test.cu @@ -21,6 +21,7 @@ #include +#include #include template @@ -149,4 +150,25 @@ TEST_CASE("Test cuco::xxhash_32", "") CHECK(cuco::test::all_of(result.begin(), result.end(), [] __device__(bool v) { return v; })); } +} + +TEMPLATE_TEST_CASE_SIG("Static vs. dynamic key hash test", + "", + ((typename Hash), Hash), + (cuco::xxhash_32), + (cuco::xxhash_32), + (cuco::xxhash_64), + (cuco::xxhash_64)) +{ + using key_type = typename Hash::argument_type; + + // this makes sure the compiler isn't abl to optimize away the key_size variable + volatile size_t key_size = sizeof(key_type); + + Hash hash; + + SECTION("Identical keys with static and dynamic key size should have the same hash value.") + { + CHECK(hash(42) == hash(42, key_size)); + } } \ No newline at end of file From 4a95fa864fc48d9c0be280bbcdd86fdf95fab726 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 30 Jun 2023 18:26:37 +0000 Subject: [PATCH 05/15] Enable dynamic key length for mumurhash3 --- .../detail/hash_functions/murmurhash3.cuh | 21 +++++++++++++++++-- tests/utility/hash_test.cu | 2 ++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh index 53bbf65b1..994c13de8 100644 --- a/include/cuco/detail/hash_functions/murmurhash3.cuh +++ b/include/cuco/detail/hash_functions/murmurhash3.cuh @@ -16,6 +16,8 @@ #pragma once +#include + #include namespace cuco::detail { @@ -139,9 +141,24 @@ struct MurmurHash3_32 { */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - constexpr int len = sizeof(argument_type); + return (*this)(key, cuco::experimental::extent{}); + } + + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @tparam Extent The extent type + * + * @param key The input argument to hash + * @param size The extent of the key in bytes + * @return A resulting hash value for `key` + */ + template + constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept + { + int const len = size; // TODO remove intermediate variable const uint8_t* const data = (const uint8_t*)&key; - constexpr int nblocks = len / 4; + int const nblocks = len / 4; uint32_t h1 = seed_; constexpr uint32_t c1 = 0xcc9e2d51; diff --git a/tests/utility/hash_test.cu b/tests/utility/hash_test.cu index 4ae47d3a1..cc4a9bc75 100644 --- a/tests/utility/hash_test.cu +++ b/tests/utility/hash_test.cu @@ -155,6 +155,8 @@ TEST_CASE("Test cuco::xxhash_32", "") TEMPLATE_TEST_CASE_SIG("Static vs. dynamic key hash test", "", ((typename Hash), Hash), + (cuco::murmurhash3_32), + (cuco::murmurhash3_32), (cuco::xxhash_32), (cuco::xxhash_32), (cuco::xxhash_64), From b15d9aa566e0124257369c86cfa0b4a83da7cd1d Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 3 Jul 2023 11:47:24 +0000 Subject: [PATCH 06/15] Fix implicit fallthrough warning --- include/cuco/detail/hash_functions/murmurhash3.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh index 994c13de8..29c4bb55e 100644 --- a/include/cuco/detail/hash_functions/murmurhash3.cuh +++ b/include/cuco/detail/hash_functions/murmurhash3.cuh @@ -179,9 +179,9 @@ struct MurmurHash3_32 { // tail const uint8_t* tail = (const uint8_t*)(data + nblocks * 4); uint32_t k1 = 0; - switch (len & 3) { // TODO fix implicit fallthrough warning - case 3: k1 ^= tail[2] << 16; - case 2: k1 ^= tail[1] << 8; + switch (len & 3) { + case 3: k1 ^= tail[2] << 16; [[fallthrough]]; + case 2: k1 ^= tail[1] << 8; [[fallthrough]]; case 1: k1 ^= tail[0]; k1 *= c1; From fd2ed9c4a98b80faab6ff994641542b2f2675b97 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 3 Jul 2023 11:55:13 +0000 Subject: [PATCH 07/15] Use reinterpret_cast instead of C-style cast --- include/cuco/detail/hash_functions/murmurhash3.cuh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh index 29c4bb55e..405bdd370 100644 --- a/include/cuco/detail/hash_functions/murmurhash3.cuh +++ b/include/cuco/detail/hash_functions/murmurhash3.cuh @@ -156,16 +156,16 @@ struct MurmurHash3_32 { template constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept { - int const len = size; // TODO remove intermediate variable - const uint8_t* const data = (const uint8_t*)&key; - int const nblocks = len / 4; + int const len = size; // TODO remove intermediate variable + auto const data = reinterpret_cast(&key); + int const nblocks = len / 4; uint32_t h1 = seed_; constexpr uint32_t c1 = 0xcc9e2d51; constexpr uint32_t c2 = 0x1b873593; //---------- // body - const uint32_t* const blocks = (const uint32_t*)(data + nblocks * 4); + auto const blocks = reinterpret_cast(data + nblocks * 4); for (int i = -nblocks; i; i++) { uint32_t k1 = blocks[i]; // getblock32(blocks,i); k1 *= c1; @@ -177,8 +177,8 @@ struct MurmurHash3_32 { } //---------- // tail - const uint8_t* tail = (const uint8_t*)(data + nblocks * 4); - uint32_t k1 = 0; + auto const tail = reinterpret_cast(data + nblocks * 4); + uint32_t k1 = 0; switch (len & 3) { case 3: k1 ^= tail[2] << 16; [[fallthrough]]; case 2: k1 ^= tail[1] << 8; [[fallthrough]]; From 3aedeb06515e01f42d7a5cbde4852e9ad09aef95 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 3 Jul 2023 14:37:33 +0000 Subject: [PATCH 08/15] Use memcpy instead of reinterpret_cast --- .../detail/hash_functions/murmurhash3.cuh | 26 +++++++-------- include/cuco/detail/hash_functions/utils.cuh | 29 +++++++++++++++++ include/cuco/detail/hash_functions/xxhash.cuh | 32 +++++++++---------- 3 files changed, 56 insertions(+), 31 deletions(-) create mode 100644 include/cuco/detail/hash_functions/utils.cuh diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh index 405bdd370..75ed7755c 100644 --- a/include/cuco/detail/hash_functions/murmurhash3.cuh +++ b/include/cuco/detail/hash_functions/murmurhash3.cuh @@ -16,9 +16,11 @@ #pragma once +#include #include #include +#include namespace cuco::detail { @@ -156,18 +158,16 @@ struct MurmurHash3_32 { template constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept { - int const len = size; // TODO remove intermediate variable - auto const data = reinterpret_cast(&key); - int const nblocks = len / 4; + auto const data = reinterpret_cast(&key); + auto const nblocks = size / 4; uint32_t h1 = seed_; constexpr uint32_t c1 = 0xcc9e2d51; constexpr uint32_t c2 = 0x1b873593; //---------- // body - auto const blocks = reinterpret_cast(data + nblocks * 4); - for (int i = -nblocks; i; i++) { - uint32_t k1 = blocks[i]; // getblock32(blocks,i); + for (std::remove_const_t i = 0; size >= 4 && i < nblocks; i++) { + uint32_t k1 = load_chunk(data, i); k1 *= c1; k1 = rotl32(k1, 15); k1 *= c2; @@ -177,13 +177,12 @@ struct MurmurHash3_32 { } //---------- // tail - auto const tail = reinterpret_cast(data + nblocks * 4); - uint32_t k1 = 0; - switch (len & 3) { - case 3: k1 ^= tail[2] << 16; [[fallthrough]]; - case 2: k1 ^= tail[1] << 8; [[fallthrough]]; + uint32_t k1 = 0; + switch (size & 3) { + case 3: k1 ^= data[nblocks * 4 + 2] << 16; [[fallthrough]]; + case 2: k1 ^= data[nblocks * 4 + 1] << 8; [[fallthrough]]; case 1: - k1 ^= tail[0]; + k1 ^= data[nblocks * 4 + 0]; k1 *= c1; k1 = rotl32(k1, 15); k1 *= c2; @@ -191,7 +190,7 @@ struct MurmurHash3_32 { }; //---------- // finalization - h1 ^= len; + h1 ^= size; h1 = fmix32_(h1); return h1; } @@ -205,5 +204,4 @@ struct MurmurHash3_32 { MurmurHash3_fmix32 fmix32_; uint32_t seed_; }; - } // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/hash_functions/utils.cuh b/include/cuco/detail/hash_functions/utils.cuh new file mode 100644 index 000000000..265b80710 --- /dev/null +++ b/include/cuco/detail/hash_functions/utils.cuh @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace cuco::detail { + +template +constexpr __host__ __device__ T load_chunk(U const* const data, Extent index) noexcept +{ + T chunk; + memcpy(&chunk, __builtin_assume_aligned(data + index * sizeof(T), sizeof(T)), sizeof(T)); + return chunk; +} + +}; // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh index ea44141c5..7b36bae4d 100644 --- a/include/cuco/detail/hash_functions/xxhash.cuh +++ b/include/cuco/detail/hash_functions/xxhash.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -104,8 +105,7 @@ struct XXHash_32 { { // TODO do we need to add checks/hints for alignment? auto const nbytes = size; - auto const bytes = reinterpret_cast(&key); ///< per-byte access - auto const blocks = reinterpret_cast(&key); ///< 4-byte word access + auto const bytes = reinterpret_cast(&key); ///< per-byte access std::size_t offset = 0; std::uint32_t h32; @@ -121,16 +121,16 @@ struct XXHash_32 { do { // pipeline 4*4byte computations auto const pipeline_offset = offset / 4; - v1 += blocks[pipeline_offset] * prime2; + v1 += load_chunk(bytes, pipeline_offset + 0) * prime2; v1 = rotl(v1, 13); v1 *= prime1; - v2 += blocks[pipeline_offset + 1] * prime2; + v2 += load_chunk(bytes, pipeline_offset + 1) * prime2; v2 = rotl(v2, 13); v2 *= prime1; - v3 += blocks[pipeline_offset + 2] * prime2; + v3 += load_chunk(bytes, pipeline_offset + 2) * prime2; v3 = rotl(v3, 13); v3 *= prime1; - v4 += blocks[pipeline_offset + 3] * prime2; + v4 += load_chunk(bytes, pipeline_offset + 3) * prime2; v4 = rotl(v4, 13); v4 *= prime1; offset += 16; @@ -146,7 +146,7 @@ struct XXHash_32 { // remaining data can be processed in 4-byte chunks if ((nbytes % 16) >= 4) { for (; offset <= nbytes - 4; offset += 4) { - h32 += blocks[offset / 4] * prime3; + h32 += load_chunk(bytes, offset / 4) * prime3; h32 = rotl(h32, 17) * prime4; } } @@ -268,10 +268,8 @@ struct XXHash_64 { constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept { // TODO do we need to add checks/hints for alignment? - auto const nbytes = static_cast(size); - auto const bytes = reinterpret_cast(&key); ///< per-byte access - auto const blocks4 = reinterpret_cast(&key); ///< 4-byte word access - auto const blocks8 = reinterpret_cast(&key); ///< 8-byte word access + auto const nbytes = static_cast(size); + auto const bytes = reinterpret_cast(&key); ///< per-byte access std::size_t offset = 0; std::uint64_t h64; @@ -287,16 +285,16 @@ struct XXHash_64 { do { // pipeline 4*8byte computations auto const pipeline_offset = offset / 8; - v1 += blocks8[pipeline_offset] * prime2; + v1 += load_chunk(bytes, pipeline_offset + 0) * prime2; v1 = rotl(v1, 31); v1 *= prime1; - v2 += blocks8[pipeline_offset + 1] * prime2; + v2 += load_chunk(bytes, pipeline_offset + 1) * prime2; v2 = rotl(v2, 31); v2 *= prime1; - v3 += blocks8[pipeline_offset + 2] * prime2; + v3 += load_chunk(bytes, pipeline_offset + 2) * prime2; v3 = rotl(v3, 31); v3 *= prime1; - v4 += blocks8[pipeline_offset + 3] * prime2; + v4 += load_chunk(bytes, pipeline_offset + 3) * prime2; v4 = rotl(v4, 31); v4 *= prime1; offset += 32; @@ -336,7 +334,7 @@ struct XXHash_64 { // remaining data can be processed in 8-byte chunks if ((nbytes % 32) >= 8) { for (; offset <= nbytes - 8; offset += 8) { - std::uint64_t k1 = blocks8[offset / 8] * prime2; + std::uint64_t k1 = load_chunk(bytes, offset / 8) * prime2; k1 = rotl(k1, 31) * prime1; h64 ^= k1; h64 = rotl(h64, 27) * prime1 + prime4; @@ -346,7 +344,7 @@ struct XXHash_64 { // remaining data can be processed in 4-byte chunks if (((nbytes % 32) % 8) >= 4) { for (; offset <= nbytes - 4; offset += 4) { - h64 ^= (blocks4[offset / 4] & 0xffffffffull) * prime1; + h64 ^= (load_chunk(bytes, offset / 4) & 0xffffffffull) * prime1; h64 = rotl(h64, 23) * prime2 + prime3; } } From 669467d82ccad49e34f3d4233c0274f05eacb134 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 3 Jul 2023 23:39:18 +0000 Subject: [PATCH 09/15] Remove sync tag from hash bench for better accuracy --- benchmarks/hash_bench.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/hash_bench.cu b/benchmarks/hash_bench.cu index 58c6ee770..973f6976d 100644 --- a/benchmarks/hash_bench.cu +++ b/benchmarks/hash_bench.cu @@ -77,7 +77,7 @@ void hash_eval(nvbench::state& state, nvbench::type_list) state.add_element_count(num_keys); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + state.exec([&](nvbench::launch& launch) { hash_bench_kernel<<>>( Hash{}, num_keys, hash_values.begin(), materialize_result); }); From 418b4197aac6b4998493c28a496ed75df73874dd Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 3 Jul 2023 23:45:27 +0000 Subject: [PATCH 10/15] XXHash cleanups --- include/cuco/detail/hash_functions/xxhash.cuh | 66 +++++++++---------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh index 7b36bae4d..2cd736af8 100644 --- a/include/cuco/detail/hash_functions/xxhash.cuh +++ b/include/cuco/detail/hash_functions/xxhash.cuh @@ -103,16 +103,14 @@ struct XXHash_32 { template constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept { - // TODO do we need to add checks/hints for alignment? - auto const nbytes = size; - auto const bytes = reinterpret_cast(&key); ///< per-byte access + auto const data = reinterpret_cast(&key); ///< per-byte access std::size_t offset = 0; std::uint32_t h32; // data can be processed in 16-byte chunks - if (nbytes >= 16) { - auto const limit = nbytes - 16; + if (size >= 16) { + auto const limit = size - 16; std::uint32_t v1 = seed_ + prime1 + prime2; std::uint32_t v2 = seed_ + prime2; std::uint32_t v3 = seed_; @@ -121,16 +119,16 @@ struct XXHash_32 { do { // pipeline 4*4byte computations auto const pipeline_offset = offset / 4; - v1 += load_chunk(bytes, pipeline_offset + 0) * prime2; + v1 += load_chunk(data, pipeline_offset + 0) * prime2; v1 = rotl(v1, 13); v1 *= prime1; - v2 += load_chunk(bytes, pipeline_offset + 1) * prime2; + v2 += load_chunk(data, pipeline_offset + 1) * prime2; v2 = rotl(v2, 13); v2 *= prime1; - v3 += load_chunk(bytes, pipeline_offset + 2) * prime2; + v3 += load_chunk(data, pipeline_offset + 2) * prime2; v3 = rotl(v3, 13); v3 *= prime1; - v4 += load_chunk(bytes, pipeline_offset + 3) * prime2; + v4 += load_chunk(data, pipeline_offset + 3) * prime2; v4 = rotl(v4, 13); v4 *= prime1; offset += 16; @@ -141,20 +139,20 @@ struct XXHash_32 { h32 = seed_ + prime5; } - h32 += nbytes; + h32 += size; // remaining data can be processed in 4-byte chunks - if ((nbytes % 16) >= 4) { - for (; offset <= nbytes - 4; offset += 4) { - h32 += load_chunk(bytes, offset / 4) * prime3; + if ((size % 16) >= 4) { + for (; offset <= size - 4; offset += 4) { + h32 += load_chunk(data, offset / 4) * prime3; h32 = rotl(h32, 17) * prime4; } } // the following loop is only needed if the size of the key is not a multiple of the block size - if (nbytes % 4) { - while (offset < nbytes) { - h32 += (bytes[offset] & 255) * prime5; + if (size % 4) { + while (offset < size) { + h32 += (data[offset] & 255) * prime5; h32 = rotl(h32, 11) * prime1; ++offset; } @@ -267,16 +265,14 @@ struct XXHash_64 { template constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept { - // TODO do we need to add checks/hints for alignment? - auto const nbytes = static_cast(size); - auto const bytes = reinterpret_cast(&key); ///< per-byte access + auto const data = reinterpret_cast(&key); ///< per-byte access std::size_t offset = 0; std::uint64_t h64; // data can be processed in 32-byte chunks - if (nbytes >= 32) { - auto const limit = nbytes - 32; + if (size >= 32) { + auto const limit = size - 32; std::uint64_t v1 = seed_ + prime1 + prime2; std::uint64_t v2 = seed_ + prime2; std::uint64_t v3 = seed_; @@ -285,16 +281,16 @@ struct XXHash_64 { do { // pipeline 4*8byte computations auto const pipeline_offset = offset / 8; - v1 += load_chunk(bytes, pipeline_offset + 0) * prime2; + v1 += load_chunk(data, pipeline_offset + 0) * prime2; v1 = rotl(v1, 31); v1 *= prime1; - v2 += load_chunk(bytes, pipeline_offset + 1) * prime2; + v2 += load_chunk(data, pipeline_offset + 1) * prime2; v2 = rotl(v2, 31); v2 *= prime1; - v3 += load_chunk(bytes, pipeline_offset + 2) * prime2; + v3 += load_chunk(data, pipeline_offset + 2) * prime2; v3 = rotl(v3, 31); v3 *= prime1; - v4 += load_chunk(bytes, pipeline_offset + 3) * prime2; + v4 += load_chunk(data, pipeline_offset + 3) * prime2; v4 = rotl(v4, 31); v4 *= prime1; offset += 32; @@ -329,12 +325,12 @@ struct XXHash_64 { h64 = seed_ + prime5; } - h64 += nbytes; + h64 += size; // remaining data can be processed in 8-byte chunks - if ((nbytes % 32) >= 8) { - for (; offset <= nbytes - 8; offset += 8) { - std::uint64_t k1 = load_chunk(bytes, offset / 8) * prime2; + if ((size % 32) >= 8) { + for (; offset <= size - 8; offset += 8) { + std::uint64_t k1 = load_chunk(data, offset / 8) * prime2; k1 = rotl(k1, 31) * prime1; h64 ^= k1; h64 = rotl(h64, 27) * prime1 + prime4; @@ -342,18 +338,18 @@ struct XXHash_64 { } // remaining data can be processed in 4-byte chunks - if (((nbytes % 32) % 8) >= 4) { - for (; offset <= nbytes - 4; offset += 4) { - h64 ^= (load_chunk(bytes, offset / 4) & 0xffffffffull) * prime1; + if (((size % 32) % 8) >= 4) { + for (; offset <= size - 4; offset += 4) { + h64 ^= (load_chunk(data, offset / 4) & 0xffffffffull) * prime1; h64 = rotl(h64, 23) * prime2 + prime3; } } // the following loop is only needed if the size of the key is not a multiple of a previous // block size - if (nbytes % 4) { - while (offset < nbytes) { - h64 += (bytes[offset] & 0xff) * prime5; + if (size % 4) { + while (offset < size) { + h64 += (data[offset] & 0xff) * prime5; h64 = rotl(h64, 11) * prime1; ++offset; } From afddf1f8d4a7bd9fd01aebff590f602d1cb110bd Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 3 Jul 2023 23:49:48 +0000 Subject: [PATCH 11/15] MurmurHash3 cleanups --- .../detail/hash_functions/murmurhash3.cuh | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh index 75ed7755c..fdf438968 100644 --- a/include/cuco/detail/hash_functions/murmurhash3.cuh +++ b/include/cuco/detail/hash_functions/murmurhash3.cuh @@ -35,15 +35,15 @@ template struct MurmurHash3_fmix32 { static_assert(sizeof(Key) == 4, "Key type must be 4 bytes in size."); - using argument_type = Key; ///< The type of the values taken as argument - using result_type = uint32_t; ///< The type of the hash values produced + using argument_type = Key; ///< The type of the values taken as argument + using result_type = std::uint32_t; ///< The type of the hash values produced /** * @brief Constructs a MurmurHash3_fmix32 hash function with the given `seed`. * * @param seed A custom number to randomize the resulting hash value */ - __host__ __device__ constexpr MurmurHash3_fmix32(uint32_t seed = 0) : seed_{seed} {} + __host__ __device__ constexpr MurmurHash3_fmix32(std::uint32_t seed = 0) : seed_{seed} {} /** * @brief Returns a hash value for its argument, as a value of type `result_type`. @@ -53,7 +53,7 @@ struct MurmurHash3_fmix32 { */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - uint32_t h = static_cast(key) ^ seed_; + std::uint32_t h = static_cast(key) ^ seed_; h ^= h >> 16; h *= 0x85ebca6b; h ^= h >> 13; @@ -63,7 +63,7 @@ struct MurmurHash3_fmix32 { } private: - uint32_t seed_; + std::uint32_t seed_; }; /** @@ -77,15 +77,15 @@ template struct MurmurHash3_fmix64 { static_assert(sizeof(Key) == 8, "Key type must be 8 bytes in size."); - using argument_type = Key; ///< The type of the values taken as argument - using result_type = uint64_t; ///< The type of the hash values produced + using argument_type = Key; ///< The type of the values taken as argument + using result_type = std::uint64_t; ///< The type of the hash values produced /** * @brief Constructs a MurmurHash3_fmix64 hash function with the given `seed`. * * @param seed A custom number to randomize the resulting hash value */ - __host__ __device__ constexpr MurmurHash3_fmix64(uint64_t seed = 0) : seed_{seed} {} + __host__ __device__ constexpr MurmurHash3_fmix64(std::uint64_t seed = 0) : seed_{seed} {} /** * @brief Returns a hash value for its argument, as a value of type `result_type`. @@ -95,7 +95,7 @@ struct MurmurHash3_fmix64 { */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - uint64_t h = static_cast(key) ^ seed_; + std::uint64_t h = static_cast(key) ^ seed_; h ^= h >> 33; h *= 0xff51afd7ed558ccd; h ^= h >> 33; @@ -105,7 +105,7 @@ struct MurmurHash3_fmix64 { } private: - uint64_t seed_; + std::uint64_t seed_; }; /** @@ -125,15 +125,15 @@ struct MurmurHash3_fmix64 { */ template struct MurmurHash3_32 { - using argument_type = Key; ///< The type of the values taken as argument - using result_type = uint32_t; ///< The type of the hash values produced + using argument_type = Key; ///< The type of the values taken as argument + using result_type = std::uint32_t; ///< The type of the hash values produced /** * @brief Constructs a MurmurHash3_32 hash function with the given `seed`. * * @param seed A custom number to randomize the resulting hash value */ - __host__ __device__ constexpr MurmurHash3_32(uint32_t seed = 0) : fmix32_{0}, seed_{seed} {} + __host__ __device__ constexpr MurmurHash3_32(std::uint32_t seed = 0) : fmix32_{0}, seed_{seed} {} /** * @brief Returns a hash value for its argument, as a value of type `result_type`. @@ -158,16 +158,16 @@ struct MurmurHash3_32 { template constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept { - auto const data = reinterpret_cast(&key); + auto const data = reinterpret_cast(&key); auto const nblocks = size / 4; - uint32_t h1 = seed_; - constexpr uint32_t c1 = 0xcc9e2d51; - constexpr uint32_t c2 = 0x1b873593; + std::uint32_t h1 = seed_; + constexpr std::uint32_t c1 = 0xcc9e2d51; + constexpr std::uint32_t c2 = 0x1b873593; //---------- // body for (std::remove_const_t i = 0; size >= 4 && i < nblocks; i++) { - uint32_t k1 = load_chunk(data, i); + std::uint32_t k1 = load_chunk(data, i); k1 *= c1; k1 = rotl32(k1, 15); k1 *= c2; @@ -177,7 +177,7 @@ struct MurmurHash3_32 { } //---------- // tail - uint32_t k1 = 0; + std::uint32_t k1 = 0; switch (size & 3) { case 3: k1 ^= data[nblocks * 4 + 2] << 16; [[fallthrough]]; case 2: k1 ^= data[nblocks * 4 + 1] << 8; [[fallthrough]]; @@ -196,12 +196,12 @@ struct MurmurHash3_32 { } private: - constexpr __host__ __device__ uint32_t rotl32(uint32_t x, int8_t r) const noexcept + constexpr __host__ __device__ std::uint32_t rotl32(std::uint32_t x, std::int8_t r) const noexcept { return (x << r) | (x >> (32 - r)); } - MurmurHash3_fmix32 fmix32_; - uint32_t seed_; + MurmurHash3_fmix32 fmix32_; + std::uint32_t seed_; }; } // namespace cuco::detail \ No newline at end of file From fc95dcba2d04de7016a439bd86b2806f15d4b26a Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 3 Jul 2023 23:50:39 +0000 Subject: [PATCH 12/15] Fix typo in comment --- tests/utility/hash_test.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utility/hash_test.cu b/tests/utility/hash_test.cu index cc4a9bc75..cb78d2374 100644 --- a/tests/utility/hash_test.cu +++ b/tests/utility/hash_test.cu @@ -164,7 +164,7 @@ TEMPLATE_TEST_CASE_SIG("Static vs. dynamic key hash test", { using key_type = typename Hash::argument_type; - // this makes sure the compiler isn't abl to optimize away the key_size variable + // this makes sure the compiler isn't able to optimize away the key_size variable volatile size_t key_size = sizeof(key_type); Hash hash; From f3b435c240f5539438996da47f202fc9414dcfc5 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 4 Jul 2023 00:11:32 +0000 Subject: [PATCH 13/15] Fix load_chunk --- include/cuco/detail/hash_functions/utils.cuh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/cuco/detail/hash_functions/utils.cuh b/include/cuco/detail/hash_functions/utils.cuh index 265b80710..df77d8dd6 100644 --- a/include/cuco/detail/hash_functions/utils.cuh +++ b/include/cuco/detail/hash_functions/utils.cuh @@ -16,13 +16,16 @@ #pragma once +#include + namespace cuco::detail { template constexpr __host__ __device__ T load_chunk(U const* const data, Extent index) noexcept { + auto const bytes = reinterpret_cast(data); T chunk; - memcpy(&chunk, __builtin_assume_aligned(data + index * sizeof(T), sizeof(T)), sizeof(T)); + memcpy(&chunk, __builtin_assume_aligned(bytes + index * sizeof(T), sizeof(T)), sizeof(T)); return chunk; } From 74a3739df0719c66e57d4162703d630f3088946e Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 10 Jul 2023 14:42:24 +0000 Subject: [PATCH 14/15] Use compute_hash function instead of operator --- .../detail/hash_functions/murmurhash3.cuh | 24 ++++---- include/cuco/detail/hash_functions/xxhash.cuh | 59 ++++++++++--------- tests/utility/hash_test.cu | 9 +-- 3 files changed, 48 insertions(+), 44 deletions(-) diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh index fdf438968..a12143523 100644 --- a/include/cuco/detail/hash_functions/murmurhash3.cuh +++ b/include/cuco/detail/hash_functions/murmurhash3.cuh @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -139,11 +140,12 @@ struct MurmurHash3_32 { * @brief Returns a hash value for its argument, as a value of type `result_type`. * * @param key The input argument to hash - * @return A resulting hash value for `key` + * @return The resulting hash value for `key` */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - return (*this)(key, cuco::experimental::extent{}); + return compute_hash(reinterpret_cast(&key), + cuco::experimental::extent{}); } /** @@ -151,14 +153,14 @@ struct MurmurHash3_32 { * * @tparam Extent The extent type * - * @param key The input argument to hash - * @param size The extent of the key in bytes - * @return A resulting hash value for `key` + * @param bytes The input argument to hash + * @param size The extent of the data in bytes + * @return The resulting hash value */ template - constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept + constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes, + Extent size) const noexcept { - auto const data = reinterpret_cast(&key); auto const nblocks = size / 4; std::uint32_t h1 = seed_; @@ -167,7 +169,7 @@ struct MurmurHash3_32 { //---------- // body for (std::remove_const_t i = 0; size >= 4 && i < nblocks; i++) { - std::uint32_t k1 = load_chunk(data, i); + std::uint32_t k1 = load_chunk(bytes, i); k1 *= c1; k1 = rotl32(k1, 15); k1 *= c2; @@ -179,10 +181,10 @@ struct MurmurHash3_32 { // tail std::uint32_t k1 = 0; switch (size & 3) { - case 3: k1 ^= data[nblocks * 4 + 2] << 16; [[fallthrough]]; - case 2: k1 ^= data[nblocks * 4 + 1] << 8; [[fallthrough]]; + case 3: k1 ^= std::to_integer(bytes[nblocks * 4 + 2]) << 16; [[fallthrough]]; + case 2: k1 ^= std::to_integer(bytes[nblocks * 4 + 1]) << 8; [[fallthrough]]; case 1: - k1 ^= data[nblocks * 4 + 0]; + k1 ^= std::to_integer(bytes[nblocks * 4 + 0]); k1 *= c1; k1 = rotl32(k1, 15); k1 *= c2; diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh index 6b7a80220..c686f3b82 100644 --- a/include/cuco/detail/hash_functions/xxhash.cuh +++ b/include/cuco/detail/hash_functions/xxhash.cuh @@ -19,6 +19,7 @@ #include #include +#include #include namespace cuco::detail { @@ -84,11 +85,12 @@ struct XXHash_32 { * @brief Returns a hash value for its argument, as a value of type `result_type`. * * @param key The input argument to hash - * @return A resulting hash value for `key` + * @return The resulting hash value for `key` */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - return (*this)(key, cuco::experimental::extent{}); + return compute_hash(reinterpret_cast(&key), + cuco::experimental::extent{}); } /** @@ -96,15 +98,14 @@ struct XXHash_32 { * * @tparam Extent The extent type * - * @param key The input argument to hash - * @param size The extent of the key in bytes - * @return A resulting hash value for `key` + * @param bytes The input argument to hash + * @param size The extent of the data in bytes + * @return The resulting hash value */ template - constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept + constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes, + Extent size) const noexcept { - auto const data = reinterpret_cast(&key); ///< per-byte access - std::size_t offset = 0; std::uint32_t h32; @@ -119,16 +120,16 @@ struct XXHash_32 { do { // pipeline 4*4byte computations auto const pipeline_offset = offset / 4; - v1 += load_chunk(data, pipeline_offset + 0) * prime2; + v1 += load_chunk(bytes, pipeline_offset + 0) * prime2; v1 = rotl(v1, 13); v1 *= prime1; - v2 += load_chunk(data, pipeline_offset + 1) * prime2; + v2 += load_chunk(bytes, pipeline_offset + 1) * prime2; v2 = rotl(v2, 13); v2 *= prime1; - v3 += load_chunk(data, pipeline_offset + 2) * prime2; + v3 += load_chunk(bytes, pipeline_offset + 2) * prime2; v3 = rotl(v3, 13); v3 *= prime1; - v4 += load_chunk(data, pipeline_offset + 3) * prime2; + v4 += load_chunk(bytes, pipeline_offset + 3) * prime2; v4 = rotl(v4, 13); v4 *= prime1; offset += 16; @@ -144,7 +145,7 @@ struct XXHash_32 { // remaining data can be processed in 4-byte chunks if ((size % 16) >= 4) { for (; offset <= size - 4; offset += 4) { - h32 += load_chunk(data, offset / 4) * prime3; + h32 += load_chunk(bytes, offset / 4) * prime3; h32 = rotl(h32, 17) * prime4; } } @@ -152,7 +153,7 @@ struct XXHash_32 { // the following loop is only needed if the size of the key is not a multiple of the block size if (size % 4) { while (offset < size) { - h32 += (data[offset] & 255) * prime5; + h32 += (std::to_integer(bytes[offset]) & 255) * prime5; h32 = rotl(h32, 11) * prime1; ++offset; } @@ -246,11 +247,12 @@ struct XXHash_64 { * @brief Returns a hash value for its argument, as a value of type `result_type`. * * @param key The input argument to hash - * @return A resulting hash value for `key` + * @return The resulting hash value for `key` */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - return (*this)(key, cuco::experimental::extent{}); + return compute_hash(reinterpret_cast(&key), + cuco::experimental::extent{}); } /** @@ -258,15 +260,14 @@ struct XXHash_64 { * * @tparam Extent The extent type * - * @param key The input argument to hash - * @param size The extent of the key in bytes - * @return A resulting hash value for `key` + * @param bytes The input argument to hash + * @param size The extent of the data in bytes + * @return The resulting hash value */ template - constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept + constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes, + Extent size) const noexcept { - auto const data = reinterpret_cast(&key); ///< per-byte access - std::size_t offset = 0; std::uint64_t h64; @@ -281,16 +282,16 @@ struct XXHash_64 { do { // pipeline 4*8byte computations auto const pipeline_offset = offset / 8; - v1 += load_chunk(data, pipeline_offset + 0) * prime2; + v1 += load_chunk(bytes, pipeline_offset + 0) * prime2; v1 = rotl(v1, 31); v1 *= prime1; - v2 += load_chunk(data, pipeline_offset + 1) * prime2; + v2 += load_chunk(bytes, pipeline_offset + 1) * prime2; v2 = rotl(v2, 31); v2 *= prime1; - v3 += load_chunk(data, pipeline_offset + 2) * prime2; + v3 += load_chunk(bytes, pipeline_offset + 2) * prime2; v3 = rotl(v3, 31); v3 *= prime1; - v4 += load_chunk(data, pipeline_offset + 3) * prime2; + v4 += load_chunk(bytes, pipeline_offset + 3) * prime2; v4 = rotl(v4, 31); v4 *= prime1; offset += 32; @@ -330,7 +331,7 @@ struct XXHash_64 { // remaining data can be processed in 8-byte chunks if ((size % 32) >= 8) { for (; offset <= size - 8; offset += 8) { - std::uint64_t k1 = load_chunk(data, offset / 8) * prime2; + std::uint64_t k1 = load_chunk(bytes, offset / 8) * prime2; k1 = rotl(k1, 31) * prime1; h64 ^= k1; h64 = rotl(h64, 27) * prime1 + prime4; @@ -340,7 +341,7 @@ struct XXHash_64 { // remaining data can be processed in 4-byte chunks if (((size % 32) % 8) >= 4) { for (; offset <= size - 4; offset += 4) { - h64 ^= (load_chunk(data, offset / 4) & 0xffffffffull) * prime1; + h64 ^= (load_chunk(bytes, offset / 4) & 0xffffffffull) * prime1; h64 = rotl(h64, 23) * prime2 + prime3; } } @@ -349,7 +350,7 @@ struct XXHash_64 { // block size if (size % 4) { while (offset < size) { - h64 ^= (data[offset] & 0xff) * prime5; + h64 ^= (std::to_integer(bytes[offset]) & 0xff) * prime5; h64 = rotl(h64, 11) * prime1; ++offset; } diff --git a/tests/utility/hash_test.cu b/tests/utility/hash_test.cu index 44a752c93..3e8880860 100644 --- a/tests/utility/hash_test.cu +++ b/tests/utility/hash_test.cu @@ -24,6 +24,8 @@ #include #include +#include + template struct large_key { constexpr __host__ __device__ large_key(int32_t value) noexcept @@ -185,13 +187,12 @@ TEMPLATE_TEST_CASE_SIG("Static vs. dynamic key hash test", { using key_type = typename Hash::argument_type; - // this makes sure the compiler isn't able to optimize away the key_size variable - volatile size_t key_size = sizeof(key_type); - Hash hash; + key_type key = 42; SECTION("Identical keys with static and dynamic key size should have the same hash value.") { - CHECK(hash(42) == hash(42, key_size)); + CHECK(hash(key) == + hash.compute_hash(reinterpret_cast(&key), sizeof(key_type))); } } \ No newline at end of file From b82158904ae672ae7e4fd3bad5e7358ea0da4bfa Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 11 Jul 2023 00:39:54 +0000 Subject: [PATCH 15/15] Use reinterpret_cast instead of memcpy /sadface --- include/cuco/detail/hash_functions/utils.cuh | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/include/cuco/detail/hash_functions/utils.cuh b/include/cuco/detail/hash_functions/utils.cuh index df77d8dd6..a50779f23 100644 --- a/include/cuco/detail/hash_functions/utils.cuh +++ b/include/cuco/detail/hash_functions/utils.cuh @@ -16,17 +16,13 @@ #pragma once -#include - namespace cuco::detail { template constexpr __host__ __device__ T load_chunk(U const* const data, Extent index) noexcept { - auto const bytes = reinterpret_cast(data); - T chunk; - memcpy(&chunk, __builtin_assume_aligned(bytes + index * sizeof(T), sizeof(T)), sizeof(T)); - return chunk; + auto const chunks = reinterpret_cast(data); + return chunks[index]; } }; // namespace cuco::detail \ No newline at end of file