1//===--------------------------------------------------------------------===//
2// hash.cpp
3// Description: This file contains the vectorized hash implementations
4//===--------------------------------------------------------------------===//
5
6#include "duckdb/common/vector_operations/vector_operations.hpp"
7
8#include "duckdb/common/types/hash.hpp"
9#include "duckdb/common/types/null_value.hpp"
10#include "duckdb/common/value_operations/value_operations.hpp"
11
12namespace duckdb {
13
14struct HashOp {
15 static const hash_t NULL_HASH = 0xbf58476d1ce4e5b9;
16
17 template <class T>
18 static inline hash_t Operation(T input, bool is_null) {
19 return is_null ? NULL_HASH : duckdb::Hash<T>(input);
20 }
21};
22
23static inline hash_t CombineHashScalar(hash_t a, hash_t b) {
24 return (a * UINT64_C(0xbf58476d1ce4e5b9)) ^ b;
25}
26
27template <bool HAS_RSEL, class T>
28static inline void TightLoopHash(const T *__restrict ldata, hash_t *__restrict result_data, const SelectionVector *rsel,
29 idx_t count, const SelectionVector *__restrict sel_vector, ValidityMask &mask) {
30 if (!mask.AllValid()) {
31 for (idx_t i = 0; i < count; i++) {
32 auto ridx = HAS_RSEL ? rsel->get_index(idx: i) : i;
33 auto idx = sel_vector->get_index(idx: ridx);
34 result_data[ridx] = HashOp::Operation(ldata[idx], !mask.RowIsValid(row_idx: idx));
35 }
36 } else {
37 for (idx_t i = 0; i < count; i++) {
38 auto ridx = HAS_RSEL ? rsel->get_index(idx: i) : i;
39 auto idx = sel_vector->get_index(idx: ridx);
40 result_data[ridx] = duckdb::Hash<T>(ldata[idx]);
41 }
42 }
43}
44
45template <bool HAS_RSEL, class T>
46static inline void TemplatedLoopHash(Vector &input, Vector &result, const SelectionVector *rsel, idx_t count) {
47 if (input.GetVectorType() == VectorType::CONSTANT_VECTOR) {
48 result.SetVectorType(VectorType::CONSTANT_VECTOR);
49
50 auto ldata = ConstantVector::GetData<T>(input);
51 auto result_data = ConstantVector::GetData<hash_t>(vector&: result);
52 *result_data = HashOp::Operation(*ldata, ConstantVector::IsNull(vector: input));
53 } else {
54 result.SetVectorType(VectorType::FLAT_VECTOR);
55
56 UnifiedVectorFormat idata;
57 input.ToUnifiedFormat(count, data&: idata);
58
59 TightLoopHash<HAS_RSEL, T>(UnifiedVectorFormat::GetData<T>(idata), FlatVector::GetData<hash_t>(vector&: result), rsel,
60 count, idata.sel, idata.validity);
61 }
62}
63
64template <bool HAS_RSEL, bool FIRST_HASH>
65static inline void StructLoopHash(Vector &input, Vector &hashes, const SelectionVector *rsel, idx_t count) {
66 auto &children = StructVector::GetEntries(vector&: input);
67
68 D_ASSERT(!children.empty());
69 idx_t col_no = 0;
70 if (HAS_RSEL) {
71 if (FIRST_HASH) {
72 VectorOperations::Hash(input&: *children[col_no++], hashes, rsel: *rsel, count);
73 } else {
74 VectorOperations::CombineHash(hashes, input&: *children[col_no++], rsel: *rsel, count);
75 }
76 while (col_no < children.size()) {
77 VectorOperations::CombineHash(hashes, input&: *children[col_no++], rsel: *rsel, count);
78 }
79 } else {
80 if (FIRST_HASH) {
81 VectorOperations::Hash(input&: *children[col_no++], hashes, count);
82 } else {
83 VectorOperations::CombineHash(hashes, input&: *children[col_no++], count);
84 }
85 while (col_no < children.size()) {
86 VectorOperations::CombineHash(hashes, input&: *children[col_no++], count);
87 }
88 }
89}
90
91template <bool HAS_RSEL, bool FIRST_HASH>
92static inline void ListLoopHash(Vector &input, Vector &hashes, const SelectionVector *rsel, idx_t count) {
93 auto hdata = FlatVector::GetData<hash_t>(vector&: hashes);
94
95 UnifiedVectorFormat idata;
96 input.ToUnifiedFormat(count, data&: idata);
97 const auto ldata = UnifiedVectorFormat::GetData<list_entry_t>(format: idata);
98
99 // Hash the children into a temporary
100 auto &child = ListVector::GetEntry(vector&: input);
101 const auto child_count = ListVector::GetListSize(vector: input);
102
103 Vector child_hashes(LogicalType::HASH, child_count);
104 if (child_count > 0) {
105 VectorOperations::Hash(input&: child, hashes&: child_hashes, count: child_count);
106 }
107 auto chdata = FlatVector::GetData<hash_t>(vector&: child_hashes);
108
109 // Reduce the number of entries to check to the non-empty ones
110 SelectionVector unprocessed(count);
111 SelectionVector cursor(HAS_RSEL ? STANDARD_VECTOR_SIZE : count);
112 idx_t remaining = 0;
113 for (idx_t i = 0; i < count; ++i) {
114 const idx_t ridx = HAS_RSEL ? rsel->get_index(idx: i) : i;
115 const auto lidx = idata.sel->get_index(idx: ridx);
116 const auto &entry = ldata[lidx];
117 if (idata.validity.RowIsValid(row_idx: lidx) && entry.length > 0) {
118 unprocessed.set_index(idx: remaining++, loc: ridx);
119 cursor.set_index(idx: ridx, loc: entry.offset);
120 } else if (FIRST_HASH) {
121 hdata[ridx] = HashOp::NULL_HASH;
122 }
123 // Empty or NULL non-first elements have no effect.
124 }
125
126 count = remaining;
127 if (count == 0) {
128 return;
129 }
130
131 // Merge the first position hash into the main hash
132 idx_t position = 1;
133 if (FIRST_HASH) {
134 remaining = 0;
135 for (idx_t i = 0; i < count; ++i) {
136 const auto ridx = unprocessed.get_index(idx: i);
137 const auto cidx = cursor.get_index(idx: ridx);
138 hdata[ridx] = chdata[cidx];
139
140 const auto lidx = idata.sel->get_index(idx: ridx);
141 const auto &entry = ldata[lidx];
142 if (entry.length > position) {
143 // Entry still has values to hash
144 unprocessed.set_index(idx: remaining++, loc: ridx);
145 cursor.set_index(idx: ridx, loc: cidx + 1);
146 }
147 }
148 count = remaining;
149 if (count == 0) {
150 return;
151 }
152 ++position;
153 }
154
155 // Combine the hashes for the remaining positions until there are none left
156 for (;; ++position) {
157 remaining = 0;
158 for (idx_t i = 0; i < count; ++i) {
159 const auto ridx = unprocessed.get_index(idx: i);
160 const auto cidx = cursor.get_index(idx: ridx);
161 hdata[ridx] = CombineHashScalar(a: hdata[ridx], b: chdata[cidx]);
162
163 const auto lidx = idata.sel->get_index(idx: ridx);
164 const auto &entry = ldata[lidx];
165 if (entry.length > position) {
166 // Entry still has values to hash
167 unprocessed.set_index(idx: remaining++, loc: ridx);
168 cursor.set_index(idx: ridx, loc: cidx + 1);
169 }
170 }
171
172 count = remaining;
173 if (count == 0) {
174 break;
175 }
176 }
177}
178
179template <bool HAS_RSEL>
180static inline void HashTypeSwitch(Vector &input, Vector &result, const SelectionVector *rsel, idx_t count) {
181 D_ASSERT(result.GetType().id() == LogicalType::HASH);
182 switch (input.GetType().InternalType()) {
183 case PhysicalType::BOOL:
184 case PhysicalType::INT8:
185 TemplatedLoopHash<HAS_RSEL, int8_t>(input, result, rsel, count);
186 break;
187 case PhysicalType::INT16:
188 TemplatedLoopHash<HAS_RSEL, int16_t>(input, result, rsel, count);
189 break;
190 case PhysicalType::INT32:
191 TemplatedLoopHash<HAS_RSEL, int32_t>(input, result, rsel, count);
192 break;
193 case PhysicalType::INT64:
194 TemplatedLoopHash<HAS_RSEL, int64_t>(input, result, rsel, count);
195 break;
196 case PhysicalType::UINT8:
197 TemplatedLoopHash<HAS_RSEL, uint8_t>(input, result, rsel, count);
198 break;
199 case PhysicalType::UINT16:
200 TemplatedLoopHash<HAS_RSEL, uint16_t>(input, result, rsel, count);
201 break;
202 case PhysicalType::UINT32:
203 TemplatedLoopHash<HAS_RSEL, uint32_t>(input, result, rsel, count);
204 break;
205 case PhysicalType::UINT64:
206 TemplatedLoopHash<HAS_RSEL, uint64_t>(input, result, rsel, count);
207 break;
208 case PhysicalType::INT128:
209 TemplatedLoopHash<HAS_RSEL, hugeint_t>(input, result, rsel, count);
210 break;
211 case PhysicalType::FLOAT:
212 TemplatedLoopHash<HAS_RSEL, float>(input, result, rsel, count);
213 break;
214 case PhysicalType::DOUBLE:
215 TemplatedLoopHash<HAS_RSEL, double>(input, result, rsel, count);
216 break;
217 case PhysicalType::INTERVAL:
218 TemplatedLoopHash<HAS_RSEL, interval_t>(input, result, rsel, count);
219 break;
220 case PhysicalType::VARCHAR:
221 TemplatedLoopHash<HAS_RSEL, string_t>(input, result, rsel, count);
222 break;
223 case PhysicalType::STRUCT:
224 StructLoopHash<HAS_RSEL, true>(input, result, rsel, count);
225 break;
226 case PhysicalType::LIST:
227 ListLoopHash<HAS_RSEL, true>(input, result, rsel, count);
228 break;
229 default:
230 throw InvalidTypeException(input.GetType(), "Invalid type for hash");
231 }
232}
233
234void VectorOperations::Hash(Vector &input, Vector &result, idx_t count) {
235 HashTypeSwitch<false>(input, result, rsel: nullptr, count);
236}
237
238void VectorOperations::Hash(Vector &input, Vector &result, const SelectionVector &sel, idx_t count) {
239 HashTypeSwitch<true>(input, result, rsel: &sel, count);
240}
241
242template <bool HAS_RSEL, class T>
243static inline void TightLoopCombineHashConstant(const T *__restrict ldata, hash_t constant_hash,
244 hash_t *__restrict hash_data, const SelectionVector *rsel, idx_t count,
245 const SelectionVector *__restrict sel_vector, ValidityMask &mask) {
246 if (!mask.AllValid()) {
247 for (idx_t i = 0; i < count; i++) {
248 auto ridx = HAS_RSEL ? rsel->get_index(idx: i) : i;
249 auto idx = sel_vector->get_index(idx: ridx);
250 auto other_hash = HashOp::Operation(ldata[idx], !mask.RowIsValid(row_idx: idx));
251 hash_data[ridx] = CombineHashScalar(constant_hash, other_hash);
252 }
253 } else {
254 for (idx_t i = 0; i < count; i++) {
255 auto ridx = HAS_RSEL ? rsel->get_index(idx: i) : i;
256 auto idx = sel_vector->get_index(idx: ridx);
257 auto other_hash = duckdb::Hash<T>(ldata[idx]);
258 hash_data[ridx] = CombineHashScalar(constant_hash, other_hash);
259 }
260 }
261}
262
263template <bool HAS_RSEL, class T>
264static inline void TightLoopCombineHash(const T *__restrict ldata, hash_t *__restrict hash_data,
265 const SelectionVector *rsel, idx_t count,
266 const SelectionVector *__restrict sel_vector, ValidityMask &mask) {
267 if (!mask.AllValid()) {
268 for (idx_t i = 0; i < count; i++) {
269 auto ridx = HAS_RSEL ? rsel->get_index(idx: i) : i;
270 auto idx = sel_vector->get_index(idx: ridx);
271 auto other_hash = HashOp::Operation(ldata[idx], !mask.RowIsValid(row_idx: idx));
272 hash_data[ridx] = CombineHashScalar(hash_data[ridx], other_hash);
273 }
274 } else {
275 for (idx_t i = 0; i < count; i++) {
276 auto ridx = HAS_RSEL ? rsel->get_index(idx: i) : i;
277 auto idx = sel_vector->get_index(idx: ridx);
278 auto other_hash = duckdb::Hash<T>(ldata[idx]);
279 hash_data[ridx] = CombineHashScalar(hash_data[ridx], other_hash);
280 }
281 }
282}
283
284template <bool HAS_RSEL, class T>
285void TemplatedLoopCombineHash(Vector &input, Vector &hashes, const SelectionVector *rsel, idx_t count) {
286 if (input.GetVectorType() == VectorType::CONSTANT_VECTOR && hashes.GetVectorType() == VectorType::CONSTANT_VECTOR) {
287 auto ldata = ConstantVector::GetData<T>(input);
288 auto hash_data = ConstantVector::GetData<hash_t>(vector&: hashes);
289
290 auto other_hash = HashOp::Operation(*ldata, ConstantVector::IsNull(vector: input));
291 *hash_data = CombineHashScalar(*hash_data, other_hash);
292 } else {
293 UnifiedVectorFormat idata;
294 input.ToUnifiedFormat(count, data&: idata);
295 if (hashes.GetVectorType() == VectorType::CONSTANT_VECTOR) {
296 // mix constant with non-constant, first get the constant value
297 auto constant_hash = *ConstantVector::GetData<hash_t>(vector&: hashes);
298 // now re-initialize the hashes vector to an empty flat vector
299 hashes.SetVectorType(VectorType::FLAT_VECTOR);
300 TightLoopCombineHashConstant<HAS_RSEL, T>(UnifiedVectorFormat::GetData<T>(idata), constant_hash,
301 FlatVector::GetData<hash_t>(vector&: hashes), rsel, count, idata.sel,
302 idata.validity);
303 } else {
304 D_ASSERT(hashes.GetVectorType() == VectorType::FLAT_VECTOR);
305 TightLoopCombineHash<HAS_RSEL, T>(UnifiedVectorFormat::GetData<T>(idata),
306 FlatVector::GetData<hash_t>(vector&: hashes), rsel, count, idata.sel,
307 idata.validity);
308 }
309 }
310}
311
312template <bool HAS_RSEL>
313static inline void CombineHashTypeSwitch(Vector &hashes, Vector &input, const SelectionVector *rsel, idx_t count) {
314 D_ASSERT(hashes.GetType().id() == LogicalType::HASH);
315 switch (input.GetType().InternalType()) {
316 case PhysicalType::BOOL:
317 case PhysicalType::INT8:
318 TemplatedLoopCombineHash<HAS_RSEL, int8_t>(input, hashes, rsel, count);
319 break;
320 case PhysicalType::INT16:
321 TemplatedLoopCombineHash<HAS_RSEL, int16_t>(input, hashes, rsel, count);
322 break;
323 case PhysicalType::INT32:
324 TemplatedLoopCombineHash<HAS_RSEL, int32_t>(input, hashes, rsel, count);
325 break;
326 case PhysicalType::INT64:
327 TemplatedLoopCombineHash<HAS_RSEL, int64_t>(input, hashes, rsel, count);
328 break;
329 case PhysicalType::UINT8:
330 TemplatedLoopCombineHash<HAS_RSEL, uint8_t>(input, hashes, rsel, count);
331 break;
332 case PhysicalType::UINT16:
333 TemplatedLoopCombineHash<HAS_RSEL, uint16_t>(input, hashes, rsel, count);
334 break;
335 case PhysicalType::UINT32:
336 TemplatedLoopCombineHash<HAS_RSEL, uint32_t>(input, hashes, rsel, count);
337 break;
338 case PhysicalType::UINT64:
339 TemplatedLoopCombineHash<HAS_RSEL, uint64_t>(input, hashes, rsel, count);
340 break;
341 case PhysicalType::INT128:
342 TemplatedLoopCombineHash<HAS_RSEL, hugeint_t>(input, hashes, rsel, count);
343 break;
344 case PhysicalType::FLOAT:
345 TemplatedLoopCombineHash<HAS_RSEL, float>(input, hashes, rsel, count);
346 break;
347 case PhysicalType::DOUBLE:
348 TemplatedLoopCombineHash<HAS_RSEL, double>(input, hashes, rsel, count);
349 break;
350 case PhysicalType::INTERVAL:
351 TemplatedLoopCombineHash<HAS_RSEL, interval_t>(input, hashes, rsel, count);
352 break;
353 case PhysicalType::VARCHAR:
354 TemplatedLoopCombineHash<HAS_RSEL, string_t>(input, hashes, rsel, count);
355 break;
356 case PhysicalType::STRUCT:
357 StructLoopHash<HAS_RSEL, false>(input, hashes, rsel, count);
358 break;
359 case PhysicalType::LIST:
360 ListLoopHash<HAS_RSEL, false>(input, hashes, rsel, count);
361 break;
362 default:
363 throw InvalidTypeException(input.GetType(), "Invalid type for hash");
364 }
365}
366
367void VectorOperations::CombineHash(Vector &hashes, Vector &input, idx_t count) {
368 CombineHashTypeSwitch<false>(hashes, input, rsel: nullptr, count);
369}
370
371void VectorOperations::CombineHash(Vector &hashes, Vector &input, const SelectionVector &rsel, idx_t count) {
372 CombineHashTypeSwitch<true>(hashes, input, rsel: &rsel, count);
373}
374
375} // namespace duckdb
376