From 5e52320a5c1b03c68b9ae775f265db290873883d Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 30 Jun 2023 00:21:04 +0000
Subject: [PATCH 01/15] Enable hash computation from variable length keys

---
 .../detail/hash_functions/murmurhash3.cuh     | 25 +++++--
 include/cuco/detail/hash_functions/xxhash.cuh | 70 ++++++++++++++-----
 2 files changed, 72 insertions(+), 23 deletions(-)
diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh
index ce5ab9d56..690dfa04f 100644
--- a/include/cuco/detail/hash_functions/murmurhash3.cuh
+++ b/include/cuco/detail/hash_functions/murmurhash3.cuh
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include <cuco/extent.cuh>
+
 #include <cstdint>
 
 namespace cuco::detail {
@@ -139,16 +141,31 @@ struct MurmurHash3_32 {
    */
   constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
   {
-    constexpr int len         = sizeof(argument_type);
+    return (*this)(key, cuco::experimental::extent<std::size_t, sizeof(Key)>{});
+  }
+
+  /**
+   * @brief Returns a hash value for its argument, as a value of type `result_type`.
+   *
+   * @tparam Extent The extent type
+   *
+   * @param key The input argument to hash
+   * @param size The extent of the key in bytes
+   * @return A resulting hash value for `key`
+   */
+  template <typename Extent>
+  constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept
+  {
+    auto const len            = static_cast<int>(size);  // TODO size_t?
     const uint8_t* const data = (const uint8_t*)&key;
-    constexpr int nblocks     = len / 4;
+    auto const nblocks        = len / 4;
 
     uint32_t h1           = seed_;
     constexpr uint32_t c1 = 0xcc9e2d51;
     constexpr uint32_t c2 = 0x1b873593;
     //----------
     // body
-    const uint32_t* const blocks = (const uint32_t*)(data + nblocks * 4);
+    uint32_t const* const blocks = (uint32_t const*)(data + nblocks * 4);
     for (int i = -nblocks; i; i++) {
       uint32_t k1 = blocks[i];  // getblock32(blocks,i);
       k1 *= c1;
@@ -160,7 +177,7 @@ struct MurmurHash3_32 {
     }
     //----------
     // tail
-    const uint8_t* tail = (const uint8_t*)(data + nblocks * 4);
+    uint8_t const* tail = (uint8_t const*)(data + nblocks * 4);
     uint32_t k1         = 0;
     switch (len & 3) {
       case 3: k1 ^= tail[2] << 16;
diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh
index 10360be9b..18553562a 100644
--- a/include/cuco/detail/hash_functions/xxhash.cuh
+++ b/include/cuco/detail/hash_functions/xxhash.cuh
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include <cuco/extent.cuh>
+
 #include <cstdint>
 
 namespace cuco::detail {
@@ -84,9 +86,24 @@ struct XXHash_32 {
    * @return A resulting hash value for `key`
    */
   constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
+  {
+    return (*this)(key, cuco::experimental::extent<std::size_t, sizeof(Key)>{});
+  }
+
+  /**
+   * @brief Returns a hash value for its argument, as a value of type `result_type`.
+   *
+   * @tparam Extent The extent type
+   *
+   * @param key The input argument to hash
+   * @param size The extent of the key in bytes
+   * @return A resulting hash value for `key`
+   */
+  template <typename Extent>
+  constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept
   {
     // TODO do we need to add checks/hints for alignment?
-    constexpr auto nbytes             = sizeof(Key);
+    auto const nbytes                 = size;
     [[maybe_unused]] auto const bytes = reinterpret_cast<char const*>(&key);  ///< per-byte access
     [[maybe_unused]] auto const blocks =
       reinterpret_cast<std::uint32_t const*>(&key);  ///< 4-byte word access
@@ -95,12 +112,12 @@ struct XXHash_32 {
     std::uint32_t h32;
 
     // data can be processed in 16-byte chunks
-    if constexpr (nbytes >= 16) {
-      constexpr auto limit = nbytes - 16;
-      std::uint32_t v1     = seed_ + prime1 + prime2;
-      std::uint32_t v2     = seed_ + prime2;
-      std::uint32_t v3     = seed_;
-      std::uint32_t v4     = seed_ - prime1;
+    if (nbytes >= 16) {
+      auto const limit = nbytes - 16;
+      std::uint32_t v1 = seed_ + prime1 + prime2;
+      std::uint32_t v2 = seed_ + prime2;
+      std::uint32_t v3 = seed_;
+      std::uint32_t v4 = seed_ - prime1;
 
       do {
         // pipeline 4*4byte computations
@@ -128,7 +145,7 @@ struct XXHash_32 {
     h32 += nbytes;
 
     // remaining data can be processed in 4-byte chunks
-    if constexpr ((nbytes % 16) >= 4) {
+    if ((nbytes % 16) >= 4) {
       for (; offset <= nbytes - 4; offset += 4) {
         h32 += blocks[offset / 4] * prime3;
         h32 = rotl(h32, 17) * prime4;
@@ -136,7 +153,7 @@ struct XXHash_32 {
     }
 
     // the following loop is only needed if the size of the key is not a multiple of the block size
-    if constexpr (nbytes % 4) {
+    if (nbytes % 4) {
       while (offset < nbytes) {
         h32 += (bytes[offset] & 255) * prime5;
         h32 = rotl(h32, 11) * prime1;
@@ -235,9 +252,24 @@ struct XXHash_64 {
    * @return A resulting hash value for `key`
    */
   constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
+  {
+    return (*this)(key, cuco::experimental::extent<std::size_t, sizeof(Key)>{});
+  }
+
+  /**
+   * @brief Returns a hash value for its argument, as a value of type `result_type`.
+   *
+   * @tparam Extent The extent type
+   *
+   * @param key The input argument to hash
+   * @param size The extent of the key in bytes
+   * @return A resulting hash value for `key`
+   */
+  template <typename Extent>
+  constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept
   {
     // TODO do we need to add checks/hints for alignment?
-    constexpr auto nbytes             = sizeof(Key);
+    auto const nbytes                 = static_cast<std::size_t>(size);
     [[maybe_unused]] auto const bytes = reinterpret_cast<char const*>(&key);  ///< per-byte access
     [[maybe_unused]] auto const blocks4 =
       reinterpret_cast<std::uint32_t const*>(&key);  ///< 4-byte word access
@@ -248,12 +280,12 @@ struct XXHash_64 {
     std::uint64_t h64;
 
     // data can be processed in 32-byte chunks
-    if constexpr (nbytes >= 32) {
-      constexpr auto limit = nbytes - 32;
-      std::uint64_t v1     = seed_ + prime1 + prime2;
-      std::uint64_t v2     = seed_ + prime2;
-      std::uint64_t v3     = seed_;
-      std::uint64_t v4     = seed_ - prime1;
+    if (nbytes >= 32) {
+      auto const limit = nbytes - 32;
+      std::uint64_t v1 = seed_ + prime1 + prime2;
+      std::uint64_t v2 = seed_ + prime2;
+      std::uint64_t v3 = seed_;
+      std::uint64_t v4 = seed_ - prime1;
 
       do {
         // pipeline 4*8byte computations
@@ -305,7 +337,7 @@ struct XXHash_64 {
     h64 += nbytes;
 
     // remaining data can be processed in 8-byte chunks
-    if constexpr ((nbytes % 32) >= 8) {
+    if ((nbytes % 32) >= 8) {
       for (; offset <= nbytes - 8; offset += 8) {
         std::uint64_t k1 = blocks8[offset / 8] * prime2;
         k1               = rotl(k1, 31) * prime1;
@@ -315,7 +347,7 @@ struct XXHash_64 {
     }
 
     // remaining data can be processed in 4-byte chunks
-    if constexpr (((nbytes % 32) % 8) >= 4) {
+    if (((nbytes % 32) % 8) >= 4) {
       for (; offset <= nbytes - 4; offset += 4) {
         h64 ^= (blocks4[offset / 4] & 0xffffffffull) * prime1;
         h64 = rotl(h64, 23) * prime2 + prime3;
@@ -324,7 +356,7 @@ struct XXHash_64 {
 
     // the following loop is only needed if the size of the key is not a multiple of a previous
     // block size
-    if constexpr (nbytes % 4) {
+    if (nbytes % 4) {
       while (offset < nbytes) {
         h64 += (bytes[offset] & 0xff) * prime5;
         h64 = rotl(h64, 11) * prime1;

From acd719e3b0433a61285411c6e191c626d6cd180d Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 30 Jun 2023 14:01:56 +0000
Subject: [PATCH 02/15] Revert murmurhash3 changes

---
 .../detail/hash_functions/murmurhash3.cuh     | 27 ++++---------------
 1 file changed, 5 insertions(+), 22 deletions(-)

diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh
index 690dfa04f..53bbf65b1 100644
--- a/include/cuco/detail/hash_functions/murmurhash3.cuh
+++ b/include/cuco/detail/hash_functions/murmurhash3.cuh
@@ -16,8 +16,6 @@
 
 #pragma once
 
-#include <cuco/extent.cuh>
-
 #include <cstdint>
 
 namespace cuco::detail {
@@ -141,31 +139,16 @@ struct MurmurHash3_32 {
    */
   constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
   {
-    return (*this)(key, cuco::experimental::extent<std::size_t, sizeof(Key)>{});
-  }
-
-  /**
-   * @brief Returns a hash value for its argument, as a value of type `result_type`.
-   *
-   * @tparam Extent The extent type
-   *
-   * @param key The input argument to hash
-   * @param size The extent of the key in bytes
-   * @return A resulting hash value for `key`
-   */
-  template <typename Extent>
-  constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept
-  {
-    auto const len            = static_cast<int>(size);  // TODO size_t?
+    constexpr int len         = sizeof(argument_type);
     const uint8_t* const data = (const uint8_t*)&key;
-    auto const nblocks        = len / 4;
+    constexpr int nblocks     = len / 4;
 
     uint32_t h1           = seed_;
     constexpr uint32_t c1 = 0xcc9e2d51;
     constexpr uint32_t c2 = 0x1b873593;
     //----------
     // body
-    uint32_t const* const blocks = (uint32_t const*)(data + nblocks * 4);
+    const uint32_t* const blocks = (const uint32_t*)(data + nblocks * 4);
     for (int i = -nblocks; i; i++) {
       uint32_t k1 = blocks[i];  // getblock32(blocks,i);
       k1 *= c1;
@@ -177,9 +160,9 @@ struct MurmurHash3_32 {
     }
     //----------
     // tail
-    uint8_t const* tail = (uint8_t const*)(data + nblocks * 4);
+    const uint8_t* tail = (const uint8_t*)(data + nblocks * 4);
     uint32_t k1         = 0;
-    switch (len & 3) {
+    switch (len & 3) {  // TODO fix implicit fallthrough warning
       case 3: k1 ^= tail[2] << 16;
       case 2: k1 ^= tail[1] << 8;
       case 1:

From 270a94abd0961539e4efe55e823c497c3876f992 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 30 Jun 2023 14:02:34 +0000
Subject: [PATCH 03/15] Remove useless attributes

---
 include/cuco/detail/hash_functions/xxhash.cuh | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh
index 18553562a..ea44141c5 100644
--- a/include/cuco/detail/hash_functions/xxhash.cuh
+++ b/include/cuco/detail/hash_functions/xxhash.cuh
@@ -103,10 +103,9 @@ struct XXHash_32 {
   constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept
   {
     // TODO do we need to add checks/hints for alignment?
-    auto const nbytes                 = size;
-    [[maybe_unused]] auto const bytes = reinterpret_cast<char const*>(&key);  ///< per-byte access
-    [[maybe_unused]] auto const blocks =
-      reinterpret_cast<std::uint32_t const*>(&key);  ///< 4-byte word access
+    auto const nbytes = size;
+    auto const bytes  = reinterpret_cast<char const*>(&key);           ///< per-byte access
+    auto const blocks = reinterpret_cast<std::uint32_t const*>(&key);  ///< 4-byte word access
 
     std::size_t offset = 0;
     std::uint32_t h32;
@@ -269,12 +268,10 @@ struct XXHash_64 {
   constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept
   {
     // TODO do we need to add checks/hints for alignment?
-    auto const nbytes                 = static_cast<std::size_t>(size);
-    [[maybe_unused]] auto const bytes = reinterpret_cast<char const*>(&key);  ///< per-byte access
-    [[maybe_unused]] auto const blocks4 =
-      reinterpret_cast<std::uint32_t const*>(&key);  ///< 4-byte word access
-    [[maybe_unused]] auto const blocks8 =
-      reinterpret_cast<std::uint64_t const*>(&key);  ///< 8-byte word access
+    auto const nbytes  = static_cast<std::size_t>(size);
+    auto const bytes   = reinterpret_cast<char const*>(&key);           ///< per-byte access
+    auto const blocks4 = reinterpret_cast<std::uint32_t const*>(&key);  ///< 4-byte word access
+    auto const blocks8 = reinterpret_cast<std::uint64_t const*>(&key);  ///< 8-byte word access
 
     std::size_t offset = 0;
     std::uint64_t h64;

From a73e3ff97b4d071b6d10526155c9f5dff8c3cab2 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 30 Jun 2023 14:03:08 +0000
Subject: [PATCH 04/15] Add unit test for dynamic vs. static key sizes

---
 tests/utility/hash_test.cu | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/tests/utility/hash_test.cu b/tests/utility/hash_test.cu
index 6dca70aea..4ae47d3a1 100644
--- a/tests/utility/hash_test.cu
+++ b/tests/utility/hash_test.cu
@@ -21,6 +21,7 @@
 
 #include <thrust/device_vector.h>
 
+#include <catch2/catch_template_test_macros.hpp>
 #include <catch2/catch_test_macros.hpp>
 
 template <int32_t Words>
@@ -149,4 +150,25 @@ TEST_CASE("Test cuco::xxhash_32", "")
 
     CHECK(cuco::test::all_of(result.begin(), result.end(), [] __device__(bool v) { return v; }));
   }
+}
+
+TEMPLATE_TEST_CASE_SIG("Static vs. dynamic key hash test",
+                       "",
+                       ((typename Hash), Hash),
+                       (cuco::xxhash_32<char>),
+                       (cuco::xxhash_32<int32_t>),
+                       (cuco::xxhash_64<char>),
+                       (cuco::xxhash_64<int32_t>))
+{
+  using key_type = typename Hash::argument_type;
+
+  // this makes sure the compiler isn't abl to optimize away the key_size variable
+  volatile size_t key_size = sizeof(key_type);
+
+  Hash hash;
+
+  SECTION("Identical keys with static and dynamic key size should have the same hash value.")
+  {
+    CHECK(hash(42) == hash(42, key_size));
+  }
 }
\ No newline at end of file

From 4a95fa864fc48d9c0be280bbcdd86fdf95fab726 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 30 Jun 2023 18:26:37 +0000
Subject: [PATCH 05/15] Enable dynamic key length for mumurhash3

---
 .../detail/hash_functions/murmurhash3.cuh     | 21 +++++++++++++++++--
 tests/utility/hash_test.cu                    |  2 ++
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh
index 53bbf65b1..994c13de8 100644
--- a/include/cuco/detail/hash_functions/murmurhash3.cuh
+++ b/include/cuco/detail/hash_functions/murmurhash3.cuh
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include <cuco/extent.cuh>
+
 #include <cstdint>
 
 namespace cuco::detail {
@@ -139,9 +141,24 @@ struct MurmurHash3_32 {
    */
   constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
   {
-    constexpr int len         = sizeof(argument_type);
+    return (*this)(key, cuco::experimental::extent<std::size_t, sizeof(Key)>{});
+  }
+
+  /**
+   * @brief Returns a hash value for its argument, as a value of type `result_type`.
+   *
+   * @tparam Extent The extent type
+   *
+   * @param key The input argument to hash
+   * @param size The extent of the key in bytes
+   * @return A resulting hash value for `key`
+   */
+  template <typename Extent>
+  constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept
+  {
+    int const len             = size;  // TODO remove intermediate variable
     const uint8_t* const data = (const uint8_t*)&key;
-    constexpr int nblocks     = len / 4;
+    int const nblocks         = len / 4;
 
     uint32_t h1           = seed_;
     constexpr uint32_t c1 = 0xcc9e2d51;
diff --git a/tests/utility/hash_test.cu b/tests/utility/hash_test.cu
index 4ae47d3a1..cc4a9bc75 100644
--- a/tests/utility/hash_test.cu
+++ b/tests/utility/hash_test.cu
@@ -155,6 +155,8 @@ TEST_CASE("Test cuco::xxhash_32", "")
 TEMPLATE_TEST_CASE_SIG("Static vs. dynamic key hash test",
                        "",
                        ((typename Hash), Hash),
+                       (cuco::murmurhash3_32<char>),
+                       (cuco::murmurhash3_32<int32_t>),
                        (cuco::xxhash_32<char>),
                        (cuco::xxhash_32<int32_t>),
                        (cuco::xxhash_64<char>),

From b15d9aa566e0124257369c86cfa0b4a83da7cd1d Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 3 Jul 2023 11:47:24 +0000
Subject: [PATCH 06/15] Fix implicit fallthrough warning

---
 include/cuco/detail/hash_functions/murmurhash3.cuh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh
index 994c13de8..29c4bb55e 100644
--- a/include/cuco/detail/hash_functions/murmurhash3.cuh
+++ b/include/cuco/detail/hash_functions/murmurhash3.cuh
@@ -179,9 +179,9 @@ struct MurmurHash3_32 {
     // tail
     const uint8_t* tail = (const uint8_t*)(data + nblocks * 4);
     uint32_t k1         = 0;
-    switch (len & 3) {  // TODO fix implicit fallthrough warning
-      case 3: k1 ^= tail[2] << 16;
-      case 2: k1 ^= tail[1] << 8;
+    switch (len & 3) {
+      case 3: k1 ^= tail[2] << 16; [[fallthrough]];
+      case 2: k1 ^= tail[1] << 8; [[fallthrough]];
       case 1:
         k1 ^= tail[0];
         k1 *= c1;

From fd2ed9c4a98b80faab6ff994641542b2f2675b97 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 3 Jul 2023 11:55:13 +0000
Subject: [PATCH 07/15] Use reinterpret_cast instead of C-style cast

---
 include/cuco/detail/hash_functions/murmurhash3.cuh | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh
index 29c4bb55e..405bdd370 100644
--- a/include/cuco/detail/hash_functions/murmurhash3.cuh
+++ b/include/cuco/detail/hash_functions/murmurhash3.cuh
@@ -156,16 +156,16 @@ struct MurmurHash3_32 {
   template <typename Extent>
   constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept
   {
-    int const len             = size;  // TODO remove intermediate variable
-    const uint8_t* const data = (const uint8_t*)&key;
-    int const nblocks         = len / 4;
+    int const len     = size;  // TODO remove intermediate variable
+    auto const data   = reinterpret_cast<uint8_t const*>(&key);
+    int const nblocks = len / 4;
 
     uint32_t h1           = seed_;
     constexpr uint32_t c1 = 0xcc9e2d51;
     constexpr uint32_t c2 = 0x1b873593;
     //----------
     // body
-    const uint32_t* const blocks = (const uint32_t*)(data + nblocks * 4);
+    auto const blocks = reinterpret_cast<uint32_t const*>(data + nblocks * 4);
     for (int i = -nblocks; i; i++) {
       uint32_t k1 = blocks[i];  // getblock32(blocks,i);
       k1 *= c1;
@@ -177,8 +177,8 @@ struct MurmurHash3_32 {
     }
     //----------
     // tail
-    const uint8_t* tail = (const uint8_t*)(data + nblocks * 4);
-    uint32_t k1         = 0;
+    auto const tail = reinterpret_cast<uint8_t const*>(data + nblocks * 4);
+    uint32_t k1     = 0;
     switch (len & 3) {
       case 3: k1 ^= tail[2] << 16; [[fallthrough]];
       case 2: k1 ^= tail[1] << 8; [[fallthrough]];

From 3aedeb06515e01f42d7a5cbde4852e9ad09aef95 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 3 Jul 2023 14:37:33 +0000
Subject: [PATCH 08/15] Use memcpy instead of reinterpret_cast

---
 .../detail/hash_functions/murmurhash3.cuh     | 26 +++++++--------
 include/cuco/detail/hash_functions/utils.cuh  | 29 +++++++++++++++++
 include/cuco/detail/hash_functions/xxhash.cuh | 32 +++++++++----------
 3 files changed, 56 insertions(+), 31 deletions(-)
 create mode 100644 include/cuco/detail/hash_functions/utils.cuh

diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh
index 405bdd370..75ed7755c 100644
--- a/include/cuco/detail/hash_functions/murmurhash3.cuh
+++ b/include/cuco/detail/hash_functions/murmurhash3.cuh
@@ -16,9 +16,11 @@
 
 #pragma once
 
+#include <cuco/detail/hash_functions/utils.cuh>
 #include <cuco/extent.cuh>
 
 #include <cstdint>
+#include <type_traits>
 
 namespace cuco::detail {
 
@@ -156,18 +158,16 @@ struct MurmurHash3_32 {
   template <typename Extent>
   constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept
   {
-    int const len     = size;  // TODO remove intermediate variable
-    auto const data   = reinterpret_cast<uint8_t const*>(&key);
-    int const nblocks = len / 4;
+    auto const data    = reinterpret_cast<uint8_t const*>(&key);
+    auto const nblocks = size / 4;
 
     uint32_t h1           = seed_;
     constexpr uint32_t c1 = 0xcc9e2d51;
     constexpr uint32_t c2 = 0x1b873593;
     //----------
     // body
-    auto const blocks = reinterpret_cast<uint32_t const*>(data + nblocks * 4);
-    for (int i = -nblocks; i; i++) {
-      uint32_t k1 = blocks[i];  // getblock32(blocks,i);
+    for (std::remove_const_t<decltype(nblocks)> i = 0; size >= 4 && i < nblocks; i++) {
+      uint32_t k1 = load_chunk<uint32_t>(data, i);
       k1 *= c1;
       k1 = rotl32(k1, 15);
       k1 *= c2;
@@ -177,13 +177,12 @@ struct MurmurHash3_32 {
     }
     //----------
     // tail
-    auto const tail = reinterpret_cast<uint8_t const*>(data + nblocks * 4);
-    uint32_t k1     = 0;
-    switch (len & 3) {
-      case 3: k1 ^= tail[2] << 16; [[fallthrough]];
-      case 2: k1 ^= tail[1] << 8; [[fallthrough]];
+    uint32_t k1 = 0;
+    switch (size & 3) {
+      case 3: k1 ^= data[nblocks * 4 + 2] << 16; [[fallthrough]];
+      case 2: k1 ^= data[nblocks * 4 + 1] << 8; [[fallthrough]];
       case 1:
-        k1 ^= tail[0];
+        k1 ^= data[nblocks * 4 + 0];
         k1 *= c1;
         k1 = rotl32(k1, 15);
         k1 *= c2;
@@ -191,7 +190,7 @@ struct MurmurHash3_32 {
     };
     //----------
     // finalization
-    h1 ^= len;
+    h1 ^= size;
     h1 = fmix32_(h1);
     return h1;
   }
@@ -205,5 +204,4 @@ struct MurmurHash3_32 {
   MurmurHash3_fmix32<uint32_t> fmix32_;
   uint32_t seed_;
 };
-
 }  //  namespace cuco::detail
\ No newline at end of file
diff --git a/include/cuco/detail/hash_functions/utils.cuh b/include/cuco/detail/hash_functions/utils.cuh
new file mode 100644
index 000000000..265b80710
--- /dev/null
+++ b/include/cuco/detail/hash_functions/utils.cuh
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace cuco::detail {
+
+template <typename T, typename U, typename Extent>
+constexpr __host__ __device__ T load_chunk(U const* const data, Extent index) noexcept
+{
+  T chunk;
+  memcpy(&chunk, __builtin_assume_aligned(data + index * sizeof(T), sizeof(T)), sizeof(T));
+  return chunk;
+}
+
+};  // namespace cuco::detail
\ No newline at end of file
diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh
index ea44141c5..7b36bae4d 100644
--- a/include/cuco/detail/hash_functions/xxhash.cuh
+++ b/include/cuco/detail/hash_functions/xxhash.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cuco/detail/hash_functions/utils.cuh>
 #include <cuco/extent.cuh>
 
 #include <cstdint>
@@ -104,8 +105,7 @@ struct XXHash_32 {
   {
     // TODO do we need to add checks/hints for alignment?
     auto const nbytes = size;
-    auto const bytes  = reinterpret_cast<char const*>(&key);           ///< per-byte access
-    auto const blocks = reinterpret_cast<std::uint32_t const*>(&key);  ///< 4-byte word access
+    auto const bytes  = reinterpret_cast<char const*>(&key);  ///< per-byte access
 
     std::size_t offset = 0;
     std::uint32_t h32;
@@ -121,16 +121,16 @@ struct XXHash_32 {
       do {
         // pipeline 4*4byte computations
         auto const pipeline_offset = offset / 4;
-        v1 += blocks[pipeline_offset] * prime2;
+        v1 += load_chunk<std::uint32_t>(bytes, pipeline_offset + 0) * prime2;
         v1 = rotl(v1, 13);
         v1 *= prime1;
-        v2 += blocks[pipeline_offset + 1] * prime2;
+        v2 += load_chunk<std::uint32_t>(bytes, pipeline_offset + 1) * prime2;
         v2 = rotl(v2, 13);
         v2 *= prime1;
-        v3 += blocks[pipeline_offset + 2] * prime2;
+        v3 += load_chunk<std::uint32_t>(bytes, pipeline_offset + 2) * prime2;
         v3 = rotl(v3, 13);
         v3 *= prime1;
-        v4 += blocks[pipeline_offset + 3] * prime2;
+        v4 += load_chunk<std::uint32_t>(bytes, pipeline_offset + 3) * prime2;
         v4 = rotl(v4, 13);
         v4 *= prime1;
         offset += 16;
@@ -146,7 +146,7 @@ struct XXHash_32 {
     // remaining data can be processed in 4-byte chunks
     if ((nbytes % 16) >= 4) {
       for (; offset <= nbytes - 4; offset += 4) {
-        h32 += blocks[offset / 4] * prime3;
+        h32 += load_chunk<std::uint32_t>(bytes, offset / 4) * prime3;
         h32 = rotl(h32, 17) * prime4;
       }
     }
@@ -268,10 +268,8 @@ struct XXHash_64 {
   constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept
   {
     // TODO do we need to add checks/hints for alignment?
-    auto const nbytes  = static_cast<std::size_t>(size);
-    auto const bytes   = reinterpret_cast<char const*>(&key);           ///< per-byte access
-    auto const blocks4 = reinterpret_cast<std::uint32_t const*>(&key);  ///< 4-byte word access
-    auto const blocks8 = reinterpret_cast<std::uint64_t const*>(&key);  ///< 8-byte word access
+    auto const nbytes = static_cast<std::size_t>(size);
+    auto const bytes  = reinterpret_cast<char const*>(&key);  ///< per-byte access
 
     std::size_t offset = 0;
     std::uint64_t h64;
@@ -287,16 +285,16 @@ struct XXHash_64 {
       do {
         // pipeline 4*8byte computations
         auto const pipeline_offset = offset / 8;
-        v1 += blocks8[pipeline_offset] * prime2;
+        v1 += load_chunk<std::uint64_t>(bytes, pipeline_offset + 0) * prime2;
         v1 = rotl(v1, 31);
         v1 *= prime1;
-        v2 += blocks8[pipeline_offset + 1] * prime2;
+        v2 += load_chunk<std::uint64_t>(bytes, pipeline_offset + 1) * prime2;
         v2 = rotl(v2, 31);
         v2 *= prime1;
-        v3 += blocks8[pipeline_offset + 2] * prime2;
+        v3 += load_chunk<std::uint64_t>(bytes, pipeline_offset + 2) * prime2;
         v3 = rotl(v3, 31);
         v3 *= prime1;
-        v4 += blocks8[pipeline_offset + 3] * prime2;
+        v4 += load_chunk<std::uint64_t>(bytes, pipeline_offset + 3) * prime2;
         v4 = rotl(v4, 31);
         v4 *= prime1;
         offset += 32;
@@ -336,7 +334,7 @@ struct XXHash_64 {
     // remaining data can be processed in 8-byte chunks
     if ((nbytes % 32) >= 8) {
       for (; offset <= nbytes - 8; offset += 8) {
-        std::uint64_t k1 = blocks8[offset / 8] * prime2;
+        std::uint64_t k1 = load_chunk<std::uint64_t>(bytes, offset / 8) * prime2;
         k1               = rotl(k1, 31) * prime1;
         h64 ^= k1;
         h64 = rotl(h64, 27) * prime1 + prime4;
@@ -346,7 +344,7 @@ struct XXHash_64 {
     // remaining data can be processed in 4-byte chunks
     if (((nbytes % 32) % 8) >= 4) {
       for (; offset <= nbytes - 4; offset += 4) {
-        h64 ^= (blocks4[offset / 4] & 0xffffffffull) * prime1;
+        h64 ^= (load_chunk<std::uint32_t>(bytes, offset / 4) & 0xffffffffull) * prime1;
         h64 = rotl(h64, 23) * prime2 + prime3;
       }
     }

From 669467d82ccad49e34f3d4233c0274f05eacb134 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 3 Jul 2023 23:39:18 +0000
Subject: [PATCH 09/15] Remove sync tag from hash bench for better accuracy

---
 benchmarks/hash_bench.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/hash_bench.cu b/benchmarks/hash_bench.cu
index 58c6ee770..973f6976d 100644
--- a/benchmarks/hash_bench.cu
+++ b/benchmarks/hash_bench.cu
@@ -77,7 +77,7 @@ void hash_eval(nvbench::state& state, nvbench::type_list<Hash>)
 
   state.add_element_count(num_keys);
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+  state.exec([&](nvbench::launch& launch) {
     hash_bench_kernel<block_size><<<grid_size, block_size, 0, launch.get_stream()>>>(
       Hash{}, num_keys, hash_values.begin(), materialize_result);
   });

From 418b4197aac6b4998493c28a496ed75df73874dd Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 3 Jul 2023 23:45:27 +0000
Subject: [PATCH 10/15] XXHash cleanups

---
 include/cuco/detail/hash_functions/xxhash.cuh | 66 +++++++++----------
 1 file changed, 31 insertions(+), 35 deletions(-)

diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh
index 7b36bae4d..2cd736af8 100644
--- a/include/cuco/detail/hash_functions/xxhash.cuh
+++ b/include/cuco/detail/hash_functions/xxhash.cuh
@@ -103,16 +103,14 @@ struct XXHash_32 {
   template <typename Extent>
   constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept
   {
-    // TODO do we need to add checks/hints for alignment?
-    auto const nbytes = size;
-    auto const bytes  = reinterpret_cast<char const*>(&key);  ///< per-byte access
+    auto const data = reinterpret_cast<std::uint8_t const*>(&key);  ///< per-byte access
 
     std::size_t offset = 0;
     std::uint32_t h32;
 
     // data can be processed in 16-byte chunks
-    if (nbytes >= 16) {
-      auto const limit = nbytes - 16;
+    if (size >= 16) {
+      auto const limit = size - 16;
       std::uint32_t v1 = seed_ + prime1 + prime2;
       std::uint32_t v2 = seed_ + prime2;
       std::uint32_t v3 = seed_;
@@ -121,16 +119,16 @@ struct XXHash_32 {
       do {
         // pipeline 4*4byte computations
         auto const pipeline_offset = offset / 4;
-        v1 += load_chunk<std::uint32_t>(bytes, pipeline_offset + 0) * prime2;
+        v1 += load_chunk<std::uint32_t>(data, pipeline_offset + 0) * prime2;
         v1 = rotl(v1, 13);
         v1 *= prime1;
-        v2 += load_chunk<std::uint32_t>(bytes, pipeline_offset + 1) * prime2;
+        v2 += load_chunk<std::uint32_t>(data, pipeline_offset + 1) * prime2;
         v2 = rotl(v2, 13);
         v2 *= prime1;
-        v3 += load_chunk<std::uint32_t>(bytes, pipeline_offset + 2) * prime2;
+        v3 += load_chunk<std::uint32_t>(data, pipeline_offset + 2) * prime2;
         v3 = rotl(v3, 13);
         v3 *= prime1;
-        v4 += load_chunk<std::uint32_t>(bytes, pipeline_offset + 3) * prime2;
+        v4 += load_chunk<std::uint32_t>(data, pipeline_offset + 3) * prime2;
         v4 = rotl(v4, 13);
         v4 *= prime1;
         offset += 16;
@@ -141,20 +139,20 @@ struct XXHash_32 {
       h32 = seed_ + prime5;
     }
 
-    h32 += nbytes;
+    h32 += size;
 
     // remaining data can be processed in 4-byte chunks
-    if ((nbytes % 16) >= 4) {
-      for (; offset <= nbytes - 4; offset += 4) {
-        h32 += load_chunk<std::uint32_t>(bytes, offset / 4) * prime3;
+    if ((size % 16) >= 4) {
+      for (; offset <= size - 4; offset += 4) {
+        h32 += load_chunk<std::uint32_t>(data, offset / 4) * prime3;
         h32 = rotl(h32, 17) * prime4;
       }
     }
 
     // the following loop is only needed if the size of the key is not a multiple of the block size
-    if (nbytes % 4) {
-      while (offset < nbytes) {
-        h32 += (bytes[offset] & 255) * prime5;
+    if (size % 4) {
+      while (offset < size) {
+        h32 += (data[offset] & 255) * prime5;
         h32 = rotl(h32, 11) * prime1;
         ++offset;
       }
@@ -267,16 +265,14 @@ struct XXHash_64 {
   template <typename Extent>
   constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept
   {
-    // TODO do we need to add checks/hints for alignment?
-    auto const nbytes = static_cast<std::size_t>(size);
-    auto const bytes  = reinterpret_cast<char const*>(&key);  ///< per-byte access
+    auto const data = reinterpret_cast<std::uint8_t const*>(&key);  ///< per-byte access
 
     std::size_t offset = 0;
     std::uint64_t h64;
 
     // data can be processed in 32-byte chunks
-    if (nbytes >= 32) {
-      auto const limit = nbytes - 32;
+    if (size >= 32) {
+      auto const limit = size - 32;
       std::uint64_t v1 = seed_ + prime1 + prime2;
       std::uint64_t v2 = seed_ + prime2;
       std::uint64_t v3 = seed_;
@@ -285,16 +281,16 @@ struct XXHash_64 {
       do {
         // pipeline 4*8byte computations
         auto const pipeline_offset = offset / 8;
-        v1 += load_chunk<std::uint64_t>(bytes, pipeline_offset + 0) * prime2;
+        v1 += load_chunk<std::uint64_t>(data, pipeline_offset + 0) * prime2;
         v1 = rotl(v1, 31);
         v1 *= prime1;
-        v2 += load_chunk<std::uint64_t>(bytes, pipeline_offset + 1) * prime2;
+        v2 += load_chunk<std::uint64_t>(data, pipeline_offset + 1) * prime2;
         v2 = rotl(v2, 31);
         v2 *= prime1;
-        v3 += load_chunk<std::uint64_t>(bytes, pipeline_offset + 2) * prime2;
+        v3 += load_chunk<std::uint64_t>(data, pipeline_offset + 2) * prime2;
         v3 = rotl(v3, 31);
         v3 *= prime1;
-        v4 += load_chunk<std::uint64_t>(bytes, pipeline_offset + 3) * prime2;
+        v4 += load_chunk<std::uint64_t>(data, pipeline_offset + 3) * prime2;
         v4 = rotl(v4, 31);
         v4 *= prime1;
         offset += 32;
@@ -329,12 +325,12 @@ struct XXHash_64 {
       h64 = seed_ + prime5;
     }
 
-    h64 += nbytes;
+    h64 += size;
 
     // remaining data can be processed in 8-byte chunks
-    if ((nbytes % 32) >= 8) {
-      for (; offset <= nbytes - 8; offset += 8) {
-        std::uint64_t k1 = load_chunk<std::uint64_t>(bytes, offset / 8) * prime2;
+    if ((size % 32) >= 8) {
+      for (; offset <= size - 8; offset += 8) {
+        std::uint64_t k1 = load_chunk<std::uint64_t>(data, offset / 8) * prime2;
         k1               = rotl(k1, 31) * prime1;
         h64 ^= k1;
         h64 = rotl(h64, 27) * prime1 + prime4;
@@ -342,18 +338,18 @@ struct XXHash_64 {
     }
 
     // remaining data can be processed in 4-byte chunks
-    if (((nbytes % 32) % 8) >= 4) {
-      for (; offset <= nbytes - 4; offset += 4) {
-        h64 ^= (load_chunk<std::uint32_t>(bytes, offset / 4) & 0xffffffffull) * prime1;
+    if (((size % 32) % 8) >= 4) {
+      for (; offset <= size - 4; offset += 4) {
+        h64 ^= (load_chunk<std::uint32_t>(data, offset / 4) & 0xffffffffull) * prime1;
         h64 = rotl(h64, 23) * prime2 + prime3;
       }
     }
 
     // the following loop is only needed if the size of the key is not a multiple of a previous
     // block size
-    if (nbytes % 4) {
-      while (offset < nbytes) {
-        h64 += (bytes[offset] & 0xff) * prime5;
+    if (size % 4) {
+      while (offset < size) {
+        h64 += (data[offset] & 0xff) * prime5;
         h64 = rotl(h64, 11) * prime1;
         ++offset;
       }

From afddf1f8d4a7bd9fd01aebff590f602d1cb110bd Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 3 Jul 2023 23:49:48 +0000
Subject: [PATCH 11/15] MurmurHash3 cleanups

---
 .../detail/hash_functions/murmurhash3.cuh     | 44 +++++++++----------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh
index 75ed7755c..fdf438968 100644
--- a/include/cuco/detail/hash_functions/murmurhash3.cuh
+++ b/include/cuco/detail/hash_functions/murmurhash3.cuh
@@ -35,15 +35,15 @@ template <typename Key>
 struct MurmurHash3_fmix32 {
   static_assert(sizeof(Key) == 4, "Key type must be 4 bytes in size.");
 
-  using argument_type = Key;       ///< The type of the values taken as argument
-  using result_type   = uint32_t;  ///< The type of the hash values produced
+  using argument_type = Key;            ///< The type of the values taken as argument
+  using result_type   = std::uint32_t;  ///< The type of the hash values produced
 
   /**
    * @brief Constructs a MurmurHash3_fmix32 hash function with the given `seed`.
    *
    * @param seed A custom number to randomize the resulting hash value
    */
-  __host__ __device__ constexpr MurmurHash3_fmix32(uint32_t seed = 0) : seed_{seed} {}
+  __host__ __device__ constexpr MurmurHash3_fmix32(std::uint32_t seed = 0) : seed_{seed} {}
 
   /**
    * @brief Returns a hash value for its argument, as a value of type `result_type`.
@@ -53,7 +53,7 @@ struct MurmurHash3_fmix32 {
    */
   constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
   {
-    uint32_t h = static_cast<uint32_t>(key) ^ seed_;
+    std::uint32_t h = static_cast<std::uint32_t>(key) ^ seed_;
     h ^= h >> 16;
     h *= 0x85ebca6b;
     h ^= h >> 13;
@@ -63,7 +63,7 @@ struct MurmurHash3_fmix32 {
   }
 
  private:
-  uint32_t seed_;
+  std::uint32_t seed_;
 };
 
 /**
@@ -77,15 +77,15 @@ template <typename Key>
 struct MurmurHash3_fmix64 {
   static_assert(sizeof(Key) == 8, "Key type must be 8 bytes in size.");
 
-  using argument_type = Key;       ///< The type of the values taken as argument
-  using result_type   = uint64_t;  ///< The type of the hash values produced
+  using argument_type = Key;            ///< The type of the values taken as argument
+  using result_type   = std::uint64_t;  ///< The type of the hash values produced
 
   /**
    * @brief Constructs a MurmurHash3_fmix64 hash function with the given `seed`.
    *
    * @param seed A custom number to randomize the resulting hash value
    */
-  __host__ __device__ constexpr MurmurHash3_fmix64(uint64_t seed = 0) : seed_{seed} {}
+  __host__ __device__ constexpr MurmurHash3_fmix64(std::uint64_t seed = 0) : seed_{seed} {}
 
   /**
    * @brief Returns a hash value for its argument, as a value of type `result_type`.
@@ -95,7 +95,7 @@ struct MurmurHash3_fmix64 {
    */
   constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
   {
-    uint64_t h = static_cast<uint64_t>(key) ^ seed_;
+    std::uint64_t h = static_cast<std::uint64_t>(key) ^ seed_;
     h ^= h >> 33;
     h *= 0xff51afd7ed558ccd;
     h ^= h >> 33;
@@ -105,7 +105,7 @@ struct MurmurHash3_fmix64 {
   }
 
  private:
-  uint64_t seed_;
+  std::uint64_t seed_;
 };
 
 /**
@@ -125,15 +125,15 @@ struct MurmurHash3_fmix64 {
  */
 template <typename Key>
 struct MurmurHash3_32 {
-  using argument_type = Key;       ///< The type of the values taken as argument
-  using result_type   = uint32_t;  ///< The type of the hash values produced
+  using argument_type = Key;            ///< The type of the values taken as argument
+  using result_type   = std::uint32_t;  ///< The type of the hash values produced
 
   /**
    * @brief Constructs a MurmurHash3_32 hash function with the given `seed`.
    *
    * @param seed A custom number to randomize the resulting hash value
    */
-  __host__ __device__ constexpr MurmurHash3_32(uint32_t seed = 0) : fmix32_{0}, seed_{seed} {}
+  __host__ __device__ constexpr MurmurHash3_32(std::uint32_t seed = 0) : fmix32_{0}, seed_{seed} {}
 
   /**
    * @brief Returns a hash value for its argument, as a value of type `result_type`.
@@ -158,16 +158,16 @@ struct MurmurHash3_32 {
   template <typename Extent>
   constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept
   {
-    auto const data    = reinterpret_cast<uint8_t const*>(&key);
+    auto const data    = reinterpret_cast<std::uint8_t const*>(&key);
     auto const nblocks = size / 4;
 
-    uint32_t h1           = seed_;
-    constexpr uint32_t c1 = 0xcc9e2d51;
-    constexpr uint32_t c2 = 0x1b873593;
+    std::uint32_t h1           = seed_;
+    constexpr std::uint32_t c1 = 0xcc9e2d51;
+    constexpr std::uint32_t c2 = 0x1b873593;
     //----------
     // body
     for (std::remove_const_t<decltype(nblocks)> i = 0; size >= 4 && i < nblocks; i++) {
-      uint32_t k1 = load_chunk<uint32_t>(data, i);
+      std::uint32_t k1 = load_chunk<std::uint32_t>(data, i);
       k1 *= c1;
       k1 = rotl32(k1, 15);
       k1 *= c2;
@@ -177,7 +177,7 @@ struct MurmurHash3_32 {
     }
     //----------
     // tail
-    uint32_t k1 = 0;
+    std::uint32_t k1 = 0;
     switch (size & 3) {
       case 3: k1 ^= data[nblocks * 4 + 2] << 16; [[fallthrough]];
       case 2: k1 ^= data[nblocks * 4 + 1] << 8; [[fallthrough]];
@@ -196,12 +196,12 @@ struct MurmurHash3_32 {
   }
 
  private:
-  constexpr __host__ __device__ uint32_t rotl32(uint32_t x, int8_t r) const noexcept
+  constexpr __host__ __device__ std::uint32_t rotl32(std::uint32_t x, std::int8_t r) const noexcept
   {
     return (x << r) | (x >> (32 - r));
   }
 
-  MurmurHash3_fmix32<uint32_t> fmix32_;
-  uint32_t seed_;
+  MurmurHash3_fmix32<std::uint32_t> fmix32_;
+  std::uint32_t seed_;
 };
 }  //  namespace cuco::detail
\ No newline at end of file

From fc95dcba2d04de7016a439bd86b2806f15d4b26a Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 3 Jul 2023 23:50:39 +0000
Subject: [PATCH 12/15] Fix typo in comment

---
 tests/utility/hash_test.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/utility/hash_test.cu b/tests/utility/hash_test.cu
index cc4a9bc75..cb78d2374 100644
--- a/tests/utility/hash_test.cu
+++ b/tests/utility/hash_test.cu
@@ -164,7 +164,7 @@ TEMPLATE_TEST_CASE_SIG("Static vs. dynamic key hash test",
 {
   using key_type = typename Hash::argument_type;
 
-  // this makes sure the compiler isn't abl to optimize away the key_size variable
+  // this makes sure the compiler isn't able to optimize away the key_size variable
   volatile size_t key_size = sizeof(key_type);
 
   Hash hash;

From f3b435c240f5539438996da47f202fc9414dcfc5 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Tue, 4 Jul 2023 00:11:32 +0000
Subject: [PATCH 13/15] Fix load_chunk

---
 include/cuco/detail/hash_functions/utils.cuh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/cuco/detail/hash_functions/utils.cuh b/include/cuco/detail/hash_functions/utils.cuh
index 265b80710..df77d8dd6 100644
--- a/include/cuco/detail/hash_functions/utils.cuh
+++ b/include/cuco/detail/hash_functions/utils.cuh
@@ -16,13 +16,16 @@
 
 #pragma once
 
+#include <cstddef>
+
 namespace cuco::detail {
 
 template <typename T, typename U, typename Extent>
 constexpr __host__ __device__ T load_chunk(U const* const data, Extent index) noexcept
 {
+  auto const bytes = reinterpret_cast<std::byte const*>(data);
   T chunk;
-  memcpy(&chunk, __builtin_assume_aligned(data + index * sizeof(T), sizeof(T)), sizeof(T));
+  memcpy(&chunk, __builtin_assume_aligned(bytes + index * sizeof(T), sizeof(T)), sizeof(T));
   return chunk;
 }
 

From 74a3739df0719c66e57d4162703d630f3088946e Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 10 Jul 2023 14:42:24 +0000
Subject: [PATCH 14/15] Use compute_hash function instead of operator

---
 .../detail/hash_functions/murmurhash3.cuh     | 24 ++++----
 include/cuco/detail/hash_functions/xxhash.cuh | 59 ++++++++++---------
 tests/utility/hash_test.cu                    |  9 +--
 3 files changed, 48 insertions(+), 44 deletions(-)

diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh
index fdf438968..a12143523 100644
--- a/include/cuco/detail/hash_functions/murmurhash3.cuh
+++ b/include/cuco/detail/hash_functions/murmurhash3.cuh
@@ -19,6 +19,7 @@
 #include <cuco/detail/hash_functions/utils.cuh>
 #include <cuco/extent.cuh>
 
+#include <cstddef>
 #include <cstdint>
 #include <type_traits>
 
@@ -139,11 +140,12 @@ struct MurmurHash3_32 {
    * @brief Returns a hash value for its argument, as a value of type `result_type`.
    *
    * @param key The input argument to hash
-   * @return A resulting hash value for `key`
+   * @return The resulting hash value for `key`
    */
   constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
   {
-    return (*this)(key, cuco::experimental::extent<std::size_t, sizeof(Key)>{});
+    return compute_hash(reinterpret_cast<std::byte const*>(&key),
+                        cuco::experimental::extent<std::size_t, sizeof(Key)>{});
   }
 
   /**
@@ -151,14 +153,14 @@ struct MurmurHash3_32 {
    *
    * @tparam Extent The extent type
    *
-   * @param key The input argument to hash
-   * @param size The extent of the key in bytes
-   * @return A resulting hash value for `key`
+   * @param bytes The input argument to hash
+   * @param size The extent of the data in bytes
+   * @return The resulting hash value
    */
   template <typename Extent>
-  constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept
+  constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
+                                                         Extent size) const noexcept
   {
-    auto const data    = reinterpret_cast<std::uint8_t const*>(&key);
     auto const nblocks = size / 4;
 
     std::uint32_t h1           = seed_;
@@ -167,7 +169,7 @@ struct MurmurHash3_32 {
     //----------
     // body
     for (std::remove_const_t<decltype(nblocks)> i = 0; size >= 4 && i < nblocks; i++) {
-      std::uint32_t k1 = load_chunk<std::uint32_t>(data, i);
+      std::uint32_t k1 = load_chunk<std::uint32_t>(bytes, i);
       k1 *= c1;
       k1 = rotl32(k1, 15);
       k1 *= c2;
@@ -179,10 +181,10 @@ struct MurmurHash3_32 {
     // tail
     std::uint32_t k1 = 0;
     switch (size & 3) {
-      case 3: k1 ^= data[nblocks * 4 + 2] << 16; [[fallthrough]];
-      case 2: k1 ^= data[nblocks * 4 + 1] << 8; [[fallthrough]];
+      case 3: k1 ^= std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 2]) << 16; [[fallthrough]];
+      case 2: k1 ^= std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 1]) << 8; [[fallthrough]];
       case 1:
-        k1 ^= data[nblocks * 4 + 0];
+        k1 ^= std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 0]);
         k1 *= c1;
         k1 = rotl32(k1, 15);
         k1 *= c2;
diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh
index 6b7a80220..c686f3b82 100644
--- a/include/cuco/detail/hash_functions/xxhash.cuh
+++ b/include/cuco/detail/hash_functions/xxhash.cuh
@@ -19,6 +19,7 @@
 #include <cuco/detail/hash_functions/utils.cuh>
 #include <cuco/extent.cuh>
 
+#include <cstddef>
 #include <cstdint>
 
 namespace cuco::detail {
@@ -84,11 +85,12 @@ struct XXHash_32 {
    * @brief Returns a hash value for its argument, as a value of type `result_type`.
    *
    * @param key The input argument to hash
-   * @return A resulting hash value for `key`
+   * @return The resulting hash value for `key`
    */
   constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
   {
-    return (*this)(key, cuco::experimental::extent<std::size_t, sizeof(Key)>{});
+    return compute_hash(reinterpret_cast<std::byte const*>(&key),
+                        cuco::experimental::extent<std::size_t, sizeof(Key)>{});
   }
 
   /**
@@ -96,15 +98,14 @@ struct XXHash_32 {
    *
    * @tparam Extent The extent type
    *
-   * @param key The input argument to hash
-   * @param size The extent of the key in bytes
-   * @return A resulting hash value for `key`
+   * @param bytes The input argument to hash
+   * @param size The extent of the data in bytes
+   * @return The resulting hash value
    */
   template <typename Extent>
-  constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept
+  constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
+                                                         Extent size) const noexcept
   {
-    auto const data = reinterpret_cast<std::uint8_t const*>(&key);  ///< per-byte access
-
     std::size_t offset = 0;
     std::uint32_t h32;
 
@@ -119,16 +120,16 @@ struct XXHash_32 {
       do {
         // pipeline 4*4byte computations
         auto const pipeline_offset = offset / 4;
-        v1 += load_chunk<std::uint32_t>(data, pipeline_offset + 0) * prime2;
+        v1 += load_chunk<std::uint32_t>(bytes, pipeline_offset + 0) * prime2;
         v1 = rotl(v1, 13);
         v1 *= prime1;
-        v2 += load_chunk<std::uint32_t>(data, pipeline_offset + 1) * prime2;
+        v2 += load_chunk<std::uint32_t>(bytes, pipeline_offset + 1) * prime2;
         v2 = rotl(v2, 13);
         v2 *= prime1;
-        v3 += load_chunk<std::uint32_t>(data, pipeline_offset + 2) * prime2;
+        v3 += load_chunk<std::uint32_t>(bytes, pipeline_offset + 2) * prime2;
         v3 = rotl(v3, 13);
         v3 *= prime1;
-        v4 += load_chunk<std::uint32_t>(data, pipeline_offset + 3) * prime2;
+        v4 += load_chunk<std::uint32_t>(bytes, pipeline_offset + 3) * prime2;
         v4 = rotl(v4, 13);
         v4 *= prime1;
         offset += 16;
@@ -144,7 +145,7 @@ struct XXHash_32 {
     // remaining data can be processed in 4-byte chunks
     if ((size % 16) >= 4) {
       for (; offset <= size - 4; offset += 4) {
-        h32 += load_chunk<std::uint32_t>(data, offset / 4) * prime3;
+        h32 += load_chunk<std::uint32_t>(bytes, offset / 4) * prime3;
         h32 = rotl(h32, 17) * prime4;
       }
     }
@@ -152,7 +153,7 @@ struct XXHash_32 {
     // the following loop is only needed if the size of the key is not a multiple of the block size
     if (size % 4) {
       while (offset < size) {
-        h32 += (data[offset] & 255) * prime5;
+        h32 += (std::to_integer<std::uint32_t>(bytes[offset]) & 255) * prime5;
         h32 = rotl(h32, 11) * prime1;
         ++offset;
       }
@@ -246,11 +247,12 @@ struct XXHash_64 {
    * @brief Returns a hash value for its argument, as a value of type `result_type`.
    *
    * @param key The input argument to hash
-   * @return A resulting hash value for `key`
+   * @return The resulting hash value for `key`
    */
   constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
   {
-    return (*this)(key, cuco::experimental::extent<std::size_t, sizeof(Key)>{});
+    return compute_hash(reinterpret_cast<std::byte const*>(&key),
+                        cuco::experimental::extent<std::size_t, sizeof(Key)>{});
   }
 
   /**
@@ -258,15 +260,14 @@ struct XXHash_64 {
    *
    * @tparam Extent The extent type
    *
-   * @param key The input argument to hash
-   * @param size The extent of the key in bytes
-   * @return A resulting hash value for `key`
+   * @param bytes The input argument to hash
+   * @param size The extent of the data in bytes
+   * @return The resulting hash value
    */
   template <typename Extent>
-  constexpr result_type __host__ __device__ operator()(Key const& key, Extent size) const noexcept
+  constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
+                                                         Extent size) const noexcept
   {
-    auto const data = reinterpret_cast<std::uint8_t const*>(&key);  ///< per-byte access
-
     std::size_t offset = 0;
     std::uint64_t h64;
 
@@ -281,16 +282,16 @@ struct XXHash_64 {
       do {
         // pipeline 4*8byte computations
         auto const pipeline_offset = offset / 8;
-        v1 += load_chunk<std::uint64_t>(data, pipeline_offset + 0) * prime2;
+        v1 += load_chunk<std::uint64_t>(bytes, pipeline_offset + 0) * prime2;
         v1 = rotl(v1, 31);
         v1 *= prime1;
-        v2 += load_chunk<std::uint64_t>(data, pipeline_offset + 1) * prime2;
+        v2 += load_chunk<std::uint64_t>(bytes, pipeline_offset + 1) * prime2;
         v2 = rotl(v2, 31);
         v2 *= prime1;
-        v3 += load_chunk<std::uint64_t>(data, pipeline_offset + 2) * prime2;
+        v3 += load_chunk<std::uint64_t>(bytes, pipeline_offset + 2) * prime2;
         v3 = rotl(v3, 31);
         v3 *= prime1;
-        v4 += load_chunk<std::uint64_t>(data, pipeline_offset + 3) * prime2;
+        v4 += load_chunk<std::uint64_t>(bytes, pipeline_offset + 3) * prime2;
         v4 = rotl(v4, 31);
         v4 *= prime1;
         offset += 32;
@@ -330,7 +331,7 @@ struct XXHash_64 {
     // remaining data can be processed in 8-byte chunks
     if ((size % 32) >= 8) {
       for (; offset <= size - 8; offset += 8) {
-        std::uint64_t k1 = load_chunk<std::uint64_t>(data, offset / 8) * prime2;
+        std::uint64_t k1 = load_chunk<std::uint64_t>(bytes, offset / 8) * prime2;
         k1               = rotl(k1, 31) * prime1;
         h64 ^= k1;
         h64 = rotl(h64, 27) * prime1 + prime4;
@@ -340,7 +341,7 @@ struct XXHash_64 {
     // remaining data can be processed in 4-byte chunks
     if (((size % 32) % 8) >= 4) {
       for (; offset <= size - 4; offset += 4) {
-        h64 ^= (load_chunk<std::uint32_t>(data, offset / 4) & 0xffffffffull) * prime1;
+        h64 ^= (load_chunk<std::uint32_t>(bytes, offset / 4) & 0xffffffffull) * prime1;
         h64 = rotl(h64, 23) * prime2 + prime3;
       }
     }
@@ -349,7 +350,7 @@ struct XXHash_64 {
     // block size
     if (size % 4) {
       while (offset < size) {
-        h64 ^= (data[offset] & 0xff) * prime5;
+        h64 ^= (std::to_integer<std::uint32_t>(bytes[offset]) & 0xff) * prime5;
         h64 = rotl(h64, 11) * prime1;
         ++offset;
       }
diff --git a/tests/utility/hash_test.cu b/tests/utility/hash_test.cu
index 44a752c93..3e8880860 100644
--- a/tests/utility/hash_test.cu
+++ b/tests/utility/hash_test.cu
@@ -24,6 +24,8 @@
 #include <catch2/catch_template_test_macros.hpp>
 #include <catch2/catch_test_macros.hpp>
 
+#include <cstddef>
+
 template <int32_t Words>
 struct large_key {
   constexpr __host__ __device__ large_key(int32_t value) noexcept
@@ -185,13 +187,12 @@ TEMPLATE_TEST_CASE_SIG("Static vs. dynamic key hash test",
 {
   using key_type = typename Hash::argument_type;
 
-  // this makes sure the compiler isn't able to optimize away the key_size variable
-  volatile size_t key_size = sizeof(key_type);
-
   Hash hash;
+  key_type key = 42;
 
   SECTION("Identical keys with static and dynamic key size should have the same hash value.")
   {
-    CHECK(hash(42) == hash(42, key_size));
+    CHECK(hash(key) ==
+          hash.compute_hash(reinterpret_cast<std::byte const*>(&key), sizeof(key_type)));
   }
 }
\ No newline at end of file

From b82158904ae672ae7e4fd3bad5e7358ea0da4bfa Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Tue, 11 Jul 2023 00:39:54 +0000
Subject: [PATCH 15/15] Use reinterpret_cast instead of memcpy /sadface

---
 include/cuco/detail/hash_functions/utils.cuh | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/include/cuco/detail/hash_functions/utils.cuh b/include/cuco/detail/hash_functions/utils.cuh
index df77d8dd6..a50779f23 100644
--- a/include/cuco/detail/hash_functions/utils.cuh
+++ b/include/cuco/detail/hash_functions/utils.cuh
@@ -16,17 +16,13 @@
 
 #pragma once
 
-#include <cstddef>
-
 namespace cuco::detail {
 
 template <typename T, typename U, typename Extent>
 constexpr __host__ __device__ T load_chunk(U const* const data, Extent index) noexcept
 {
-  auto const bytes = reinterpret_cast<std::byte const*>(data);
-  T chunk;
-  memcpy(&chunk, __builtin_assume_aligned(bytes + index * sizeof(T), sizeof(T)), sizeof(T));
-  return chunk;
+  auto const chunks = reinterpret_cast<T const*>(data);
+  return chunks[index];
 }
 
 };  // namespace cuco::detail
\ No newline at end of file