Skip to content

Commit caf97ef

Browse files
authored
Add XXHash_32 hasher (#17533)
Contributes to #17531. This PR introduces the xxhash_32 hasher to libcudf as a preparatory step for evaluating the impact of replacing murmurhash3_x86_32 with xxhash_32 as the default hash. Authors: - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) - Lawrence Mitchell (https://github.com/wence-) URL: #17533
1 parent f308122 commit caf97ef

File tree

17 files changed

+473
-35
lines changed

17 files changed

+473
-35
lines changed

cpp/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,7 @@ add_library(
461461
src/hash/sha256_hash.cu
462462
src/hash/sha384_hash.cu
463463
src/hash/sha512_hash.cu
464+
src/hash/xxhash_32.cu
464465
src/hash/xxhash_64.cu
465466
src/interop/dlpack.cpp
466467
src/interop/arrow_utilities.cpp

cpp/include/cudf/hashing.hpp

+21-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -166,6 +166,26 @@ std::unique_ptr<column> sha512(
166166
rmm::cuda_stream_view stream = cudf::get_default_stream(),
167167
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
168168

169+
/**
170+
* @brief Computes the XXHash_32 hash value of each row in the given table
171+
*
172+
* This function computes the hash of each column using the `seed` for the first column
173+
* and the resulting hash as a seed for the next column and so on.
174+
* The result is a uint32 value for each row.
175+
*
176+
* @param input The table of columns to hash
177+
* @param seed Optional seed value to use for the hash function
178+
* @param stream CUDA stream used for device memory operations and kernel launches
179+
* @param mr Device memory resource used to allocate the returned column's device memory
180+
*
181+
* @returns A column where each row is the hash of a row from the input
182+
*/
183+
std::unique_ptr<column> xxhash_32(
184+
table_view const& input,
185+
uint32_t seed = DEFAULT_HASH_SEED,
186+
rmm::cuda_stream_view stream = cudf::get_default_stream(),
187+
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
188+
169189
/**
170190
* @brief Computes the XXHash_64 hash value of each row in the given table
171191
*

cpp/include/cudf/hashing/detail/hashing.hpp

+6-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -61,6 +61,11 @@ std::unique_ptr<column> sha512(table_view const& input,
6161
rmm::cuda_stream_view stream,
6262
rmm::device_async_resource_ref mr);
6363

64+
/**
 * @brief Computes the XXHash_32 hash value of each row in the given table
 *
 * Detail-level declaration of the XXHash_32 row hasher. The seed is 32 bits wide
 * so that this declaration matches the definition and the 32-bit hash value
 * type; a 64-bit seed here would declare a different overload than the one that
 * is actually defined.
 *
 * @param input The table of columns to hash
 * @param seed Seed value to use for the hash function
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column's device memory
 *
 * @returns A column where each row is the hash of a row from the input
 */
std::unique_ptr<column> xxhash_32(table_view const& input,
                                  uint32_t seed,
                                  rmm::cuda_stream_view stream,
                                  rmm::device_async_resource_ref mr);
68+
6469
std::unique_ptr<column> xxhash_64(table_view const& input,
6570
uint64_t seed,
6671
rmm::cuda_stream_view,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include <cudf/fixed_point/fixed_point.hpp>
20+
#include <cudf/hashing.hpp>
21+
#include <cudf/hashing/detail/hash_functions.cuh>
22+
#include <cudf/lists/list_view.hpp>
23+
#include <cudf/strings/string_view.cuh>
24+
#include <cudf/structs/struct_view.hpp>
25+
#include <cudf/types.hpp>
26+
27+
#include <cuco/hash_functions.cuh>
28+
#include <cuda/std/cstddef>
29+
30+
namespace cudf::hashing::detail {
31+
32+
/**
 * @brief Seeded XXHash_32 functor for keys of type `Key`
 *
 * Thin wrapper around `cuco::xxhash_32<Key>`. The primary template forwards the
 * key directly to the wrapped hasher; type-specific behavior is supplied by
 * explicit specializations of `operator()`.
 *
 * @tparam Key The type of the values to hash
 */
template <typename Key>
struct XXHash_32 {
  using result_type = std::uint32_t;

  /**
   * @brief Constructs the hasher with the given seed
   *
   * @param seed Seed forwarded to the wrapped cuco hasher
   */
  CUDF_HOST_DEVICE constexpr XXHash_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {}

  /**
   * @brief Hashes `key` by delegating to the wrapped cuco hasher
   *
   * @param key The value to hash
   * @return The 32-bit hash of `key`
   */
  __device__ constexpr result_type operator()(Key const& key) const { return _impl(key); }

  /**
   * @brief Hashes a raw byte buffer
   *
   * @param bytes Pointer to the first byte to hash
   * @param size Number of bytes to hash
   * @return The 32-bit hash of the buffer
   */
  __device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes,
                                                 std::uint64_t size) const
  {
    return _impl.compute_hash(bytes, size);
  }

 private:
  /**
   * @brief Hashes the object representation (`sizeof(T)` bytes) of `key`
   */
  template <typename T>
  __device__ constexpr result_type compute(T const& key) const
  {
    auto const* raw = reinterpret_cast<cuda::std::byte const*>(&key);
    return compute_bytes(raw, sizeof(T));
  }

  cuco::xxhash_32<Key> _impl;  ///< Underlying cuco hash implementation holding the seed
};
55+
56+
/**
 * @brief Hashes a boolean after converting it to a `uint8_t`
 */
template <>
XXHash_32<bool>::result_type __device__ inline XXHash_32<bool>::operator()(bool const& key) const
{
  auto const as_byte = static_cast<uint8_t>(key);
  return compute(as_byte);
}
61+
62+
/**
 * @brief Hashes a `float` after passing it through `normalize_nans_and_zeros`
 *
 * NOTE(review): presumably the normalization makes all NaNs, and +0/-0, hash
 * identically -- confirm against the helper's definition.
 */
template <>
XXHash_32<float>::result_type __device__ inline XXHash_32<float>::operator()(float const& key) const
{
  auto const normalized = normalize_nans_and_zeros(key);
  return compute(normalized);
}
67+
68+
/**
 * @brief Hashes a `double` after passing it through `normalize_nans_and_zeros`
 *
 * NOTE(review): presumably the normalization makes all NaNs, and +0/-0, hash
 * identically -- confirm against the helper's definition.
 */
template <>
XXHash_32<double>::result_type __device__ inline XXHash_32<double>::operator()(
  double const& key) const
{
  auto const normalized = normalize_nans_and_zeros(key);
  return compute(normalized);
}
74+
75+
/**
 * @brief Hashes the `size_bytes()` bytes starting at the string's `data()` pointer
 */
template <>
XXHash_32<cudf::string_view>::result_type
  __device__ inline XXHash_32<cudf::string_view>::operator()(cudf::string_view const& key) const
{
  auto const* chars = reinterpret_cast<cuda::std::byte const*>(key.data());
  return compute_bytes(chars, key.size_bytes());
}
82+
83+
/**
 * @brief Hashes a `decimal32` using only the result of `key.value()`
 */
template <>
XXHash_32<numeric::decimal32>::result_type
  __device__ inline XXHash_32<numeric::decimal32>::operator()(numeric::decimal32 const& key) const
{
  auto const underlying = key.value();
  return compute(underlying);
}
89+
90+
/**
 * @brief Hashes a `decimal64` using only the result of `key.value()`
 */
template <>
XXHash_32<numeric::decimal64>::result_type
  __device__ inline XXHash_32<numeric::decimal64>::operator()(numeric::decimal64 const& key) const
{
  auto const underlying = key.value();
  return compute(underlying);
}
96+
97+
/**
 * @brief Hashes a `decimal128` using only the result of `key.value()`
 */
template <>
XXHash_32<numeric::decimal128>::result_type
  __device__ inline XXHash_32<numeric::decimal128>::operator()(numeric::decimal128 const& key) const
{
  auto const underlying = key.value();
  return compute(underlying);
}
103+
104+
/**
 * @brief Placeholder specialization: list values cannot be hashed by this functor
 *
 * Reaching this overload is treated as unreachable code.
 */
template <>
XXHash_32<cudf::list_view>::result_type __device__ inline XXHash_32<cudf::list_view>::operator()(
  cudf::list_view const&) const
{
  CUDF_UNREACHABLE("List column hashing is not supported");
}
110+
111+
/**
 * @brief Placeholder specialization: struct values cannot be hashed directly by this functor
 *
 * Reaching this overload is treated as unreachable code.
 */
template <>
XXHash_32<cudf::struct_view>::result_type
  __device__ inline XXHash_32<cudf::struct_view>::operator()(cudf::struct_view const&) const
{
  CUDF_UNREACHABLE("Direct hashing of struct_view is not supported");
}
117+
118+
} // namespace cudf::hashing::detail

cpp/src/hash/xxhash_32.cu

+136
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#include <cudf/column/column_factories.hpp>
17+
#include <cudf/detail/nvtx/ranges.hpp>
18+
#include <cudf/detail/utilities/algorithm.cuh>
19+
#include <cudf/hashing/detail/hashing.hpp>
20+
#include <cudf/hashing/detail/xxhash_32.cuh>
21+
#include <cudf/table/table_device_view.cuh>
22+
#include <cudf/utilities/memory_resource.hpp>
23+
#include <cudf/utilities/span.hpp>
24+
25+
#include <rmm/cuda_stream_view.hpp>
26+
#include <rmm/exec_policy.hpp>
27+
28+
#include <cuda/std/limits>
29+
#include <thrust/tabulate.h>
30+
31+
namespace cudf {
32+
namespace hashing {
33+
namespace detail {
34+
35+
namespace {
36+
37+
/**
 * @brief Functor that produces the XXHash_32 hash of one table row.
 *
 * The row's elements are folded column by column: the running hash starts as the
 * seed and is handed to each column's element hasher as that element's seed.
 *
 * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
 */
template <typename Nullate>
class device_row_hasher {
 public:
  device_row_hasher(Nullate nulls, table_device_view const& t, hash_value_type seed)
    : _check_nulls(nulls), _table(t), _seed(seed)
  {
  }

  /**
   * @brief Hashes the row at `row_index` across all columns of the table.
   *
   * @param row_index Index of the row to hash
   * @return The accumulated hash of the row
   */
  __device__ auto operator()(size_type row_index) const noexcept
  {
    // Folding step: hash this column's element, seeded with the hash so far.
    auto const hash_next_column = [row_index, nulls = _check_nulls] __device__(auto hash,
                                                                               auto column) {
      return cudf::type_dispatcher(
        column.type(), element_hasher_adapter{}, column, row_index, nulls, hash);
    };

    return cudf::detail::accumulate(_table.begin(), _table.end(), _seed, hash_next_column);
  }

  /**
   * @brief Type-dispatched hasher for a single element of a column.
   */
  class element_hasher_adapter {
   public:
    /**
     * @brief Hashes `col[row_index]` with `XXHash_32<T>` seeded by `seed`.
     *
     * When null checking is enabled and the element is null, the maximum
     * `hash_value_type` value is returned instead and `seed` is not used.
     */
    template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
    __device__ hash_value_type operator()(column_device_view const& col,
                                          size_type const row_index,
                                          Nullate const check_nulls,
                                          hash_value_type const seed) const noexcept
    {
      bool const element_is_null = check_nulls && col.is_null(row_index);
      if (element_is_null) { return cuda::std::numeric_limits<hash_value_type>::max(); }

      return XXHash_32<T>{seed}(col.element<T>(row_index));
    }

    /**
     * @brief Overload for types without an element accessor; treated as unreachable.
     */
    template <typename T, CUDF_ENABLE_IF(not column_device_view::has_element_accessor<T>())>
    __device__ hash_value_type operator()(column_device_view const&,
                                          size_type const,
                                          Nullate const,
                                          hash_value_type const) const noexcept
    {
      CUDF_UNREACHABLE("Unsupported type for XXHash_32");
    }
  };

  Nullate const _check_nulls;       ///< Whether elements must be checked for null
  table_device_view const _table;   ///< Device view of the table being hashed
  hash_value_type const _seed;      ///< Seed used for the first column of every row
};
94+
95+
} // namespace
96+
97+
/**
 * @brief Computes the XXHash_32 hash of every row of `input` on `stream`.
 *
 * The output column is allocated with one `hash_value_type` element per input
 * row and no null mask. If the table has no columns or no rows, that column is
 * returned without computing any hashes.
 *
 * @param input The table of columns to hash
 * @param seed Seed value used for the first column of every row
 * @param stream CUDA stream used for allocations and the hashing launch
 * @param mr Device memory resource used to allocate the returned column
 * @return Column holding one hash value per input row
 */
std::unique_ptr<column> xxhash_32(table_view const& input,
                                  uint32_t seed,
                                  rmm::cuda_stream_view stream,
                                  rmm::device_async_resource_ref mr)
{
  auto const hash_type = data_type(type_to_id<hash_value_type>());
  auto result =
    make_numeric_column(hash_type, input.num_rows(), mask_state::UNALLOCATED, stream, mr);

  // Nothing to hash: hand back the column as allocated
  bool const nothing_to_hash = input.num_columns() == 0 || input.num_rows() == 0;
  if (nothing_to_hash) { return result; }

  bool const check_nulls = has_nulls(input);
  auto const d_input     = table_device_view::create(input, stream);
  auto hashes            = result->mutable_view();

  // Fill output element i with the hash of input row i
  auto const row_hasher = device_row_hasher(check_nulls, *d_input, seed);
  thrust::tabulate(rmm::exec_policy(stream),
                   hashes.begin<hash_value_type>(),
                   hashes.end<hash_value_type>(),
                   row_hasher);

  return result;
}
123+
124+
} // namespace detail
125+
126+
/**
 * @brief Public entry point for XXHash_32 row hashing.
 *
 * Opens a function range via `CUDF_FUNC_RANGE` and forwards all arguments
 * unchanged to `detail::xxhash_32`.
 *
 * @param input The table of columns to hash
 * @param seed Seed value to use for the hash function
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column's device memory
 * @return A column where each row is the hash of a row from the input
 */
std::unique_ptr<column> xxhash_32(table_view const& input,
                                  uint32_t seed,
                                  rmm::cuda_stream_view stream,
                                  rmm::device_async_resource_ref mr)
{
  CUDF_FUNC_RANGE();
  auto hashes = detail::xxhash_32(input, seed, stream, mr);
  return hashes;
}
134+
135+
} // namespace hashing
136+
} // namespace cudf

cpp/src/io/orc/dict_enc.cu

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
1818

1919
#include <cudf/detail/offsets_iterator.cuh>
2020
#include <cudf/detail/utilities/integer_utils.hpp>
21+
#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
2122
#include <cudf/io/orc_types.hpp>
2223
#include <cudf/table/experimental/row_operators.cuh>
2324

cpp/src/io/parquet/chunk_dict.cu

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2021-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
1818

1919
#include <cudf/detail/iterator.cuh>
2020
#include <cudf/detail/utilities/cuda.cuh>
21+
#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
2122
#include <cudf/table/experimental/row_operators.cuh>
2223

2324
#include <rmm/exec_policy.hpp>

cpp/src/join/join_common_utils.cuh

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2021-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
2020
#include <cudf/detail/iterator.cuh>
2121
#include <cudf/detail/null_mask.hpp>
2222
#include <cudf/detail/utilities/cuda.cuh>
23+
#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
2324
#include <cudf/table/experimental/row_operators.cuh>
2425
#include <cudf/utilities/memory_resource.hpp>
2526

cpp/tests/CMakeLists.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# =============================================================================
2-
# Copyright (c) 2018-2024, NVIDIA CORPORATION.
2+
# Copyright (c) 2018-2025, NVIDIA CORPORATION.
33
#
44
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
55
# in compliance with the License. You may obtain a copy of the License at
@@ -192,6 +192,7 @@ ConfigureTest(
192192
hashing/sha256_test.cpp
193193
hashing/sha384_test.cpp
194194
hashing/sha512_test.cpp
195+
hashing/xxhash_32_test.cpp
195196
hashing/xxhash_64_test.cpp
196197
)
197198

0 commit comments

Comments
 (0)