1 | //===--------------------------------------------------------------------===// |
2 | // hash.cpp |
3 | // Description: This file contains the vectorized hash implementations |
4 | //===--------------------------------------------------------------------===// |
5 | |
6 | #include "duckdb/common/vector_operations/vector_operations.hpp" |
7 | |
8 | #include "duckdb/common/types/hash.hpp" |
9 | #include "duckdb/common/types/null_value.hpp" |
10 | #include "duckdb/common/value_operations/value_operations.hpp" |
11 | |
12 | namespace duckdb { |
13 | |
14 | struct HashOp { |
15 | static const hash_t NULL_HASH = 0xbf58476d1ce4e5b9; |
16 | |
17 | template <class T> |
18 | static inline hash_t Operation(T input, bool is_null) { |
19 | return is_null ? NULL_HASH : duckdb::Hash<T>(input); |
20 | } |
21 | }; |
22 | |
23 | static inline hash_t CombineHashScalar(hash_t a, hash_t b) { |
24 | return (a * UINT64_C(0xbf58476d1ce4e5b9)) ^ b; |
25 | } |
26 | |
27 | template <bool HAS_RSEL, class T> |
28 | static inline void TightLoopHash(const T *__restrict ldata, hash_t *__restrict result_data, const SelectionVector *rsel, |
29 | idx_t count, const SelectionVector *__restrict sel_vector, ValidityMask &mask) { |
30 | if (!mask.AllValid()) { |
31 | for (idx_t i = 0; i < count; i++) { |
32 | auto ridx = HAS_RSEL ? rsel->get_index(idx: i) : i; |
33 | auto idx = sel_vector->get_index(idx: ridx); |
34 | result_data[ridx] = HashOp::Operation(ldata[idx], !mask.RowIsValid(row_idx: idx)); |
35 | } |
36 | } else { |
37 | for (idx_t i = 0; i < count; i++) { |
38 | auto ridx = HAS_RSEL ? rsel->get_index(idx: i) : i; |
39 | auto idx = sel_vector->get_index(idx: ridx); |
40 | result_data[ridx] = duckdb::Hash<T>(ldata[idx]); |
41 | } |
42 | } |
43 | } |
44 | |
45 | template <bool HAS_RSEL, class T> |
46 | static inline void TemplatedLoopHash(Vector &input, Vector &result, const SelectionVector *rsel, idx_t count) { |
47 | if (input.GetVectorType() == VectorType::CONSTANT_VECTOR) { |
48 | result.SetVectorType(VectorType::CONSTANT_VECTOR); |
49 | |
50 | auto ldata = ConstantVector::GetData<T>(input); |
51 | auto result_data = ConstantVector::GetData<hash_t>(vector&: result); |
52 | *result_data = HashOp::Operation(*ldata, ConstantVector::IsNull(vector: input)); |
53 | } else { |
54 | result.SetVectorType(VectorType::FLAT_VECTOR); |
55 | |
56 | UnifiedVectorFormat idata; |
57 | input.ToUnifiedFormat(count, data&: idata); |
58 | |
59 | TightLoopHash<HAS_RSEL, T>(UnifiedVectorFormat::GetData<T>(idata), FlatVector::GetData<hash_t>(vector&: result), rsel, |
60 | count, idata.sel, idata.validity); |
61 | } |
62 | } |
63 | |
64 | template <bool HAS_RSEL, bool FIRST_HASH> |
65 | static inline void StructLoopHash(Vector &input, Vector &hashes, const SelectionVector *rsel, idx_t count) { |
66 | auto &children = StructVector::GetEntries(vector&: input); |
67 | |
68 | D_ASSERT(!children.empty()); |
69 | idx_t col_no = 0; |
70 | if (HAS_RSEL) { |
71 | if (FIRST_HASH) { |
72 | VectorOperations::Hash(input&: *children[col_no++], hashes, rsel: *rsel, count); |
73 | } else { |
74 | VectorOperations::CombineHash(hashes, input&: *children[col_no++], rsel: *rsel, count); |
75 | } |
76 | while (col_no < children.size()) { |
77 | VectorOperations::CombineHash(hashes, input&: *children[col_no++], rsel: *rsel, count); |
78 | } |
79 | } else { |
80 | if (FIRST_HASH) { |
81 | VectorOperations::Hash(input&: *children[col_no++], hashes, count); |
82 | } else { |
83 | VectorOperations::CombineHash(hashes, input&: *children[col_no++], count); |
84 | } |
85 | while (col_no < children.size()) { |
86 | VectorOperations::CombineHash(hashes, input&: *children[col_no++], count); |
87 | } |
88 | } |
89 | } |
90 | |
91 | template <bool HAS_RSEL, bool FIRST_HASH> |
92 | static inline void ListLoopHash(Vector &input, Vector &hashes, const SelectionVector *rsel, idx_t count) { |
93 | auto hdata = FlatVector::GetData<hash_t>(vector&: hashes); |
94 | |
95 | UnifiedVectorFormat idata; |
96 | input.ToUnifiedFormat(count, data&: idata); |
97 | const auto ldata = UnifiedVectorFormat::GetData<list_entry_t>(format: idata); |
98 | |
99 | // Hash the children into a temporary |
100 | auto &child = ListVector::GetEntry(vector&: input); |
101 | const auto child_count = ListVector::GetListSize(vector: input); |
102 | |
103 | Vector child_hashes(LogicalType::HASH, child_count); |
104 | if (child_count > 0) { |
105 | VectorOperations::Hash(input&: child, hashes&: child_hashes, count: child_count); |
106 | } |
107 | auto chdata = FlatVector::GetData<hash_t>(vector&: child_hashes); |
108 | |
109 | // Reduce the number of entries to check to the non-empty ones |
110 | SelectionVector unprocessed(count); |
111 | SelectionVector cursor(HAS_RSEL ? STANDARD_VECTOR_SIZE : count); |
112 | idx_t remaining = 0; |
113 | for (idx_t i = 0; i < count; ++i) { |
114 | const idx_t ridx = HAS_RSEL ? rsel->get_index(idx: i) : i; |
115 | const auto lidx = idata.sel->get_index(idx: ridx); |
116 | const auto &entry = ldata[lidx]; |
117 | if (idata.validity.RowIsValid(row_idx: lidx) && entry.length > 0) { |
118 | unprocessed.set_index(idx: remaining++, loc: ridx); |
119 | cursor.set_index(idx: ridx, loc: entry.offset); |
120 | } else if (FIRST_HASH) { |
121 | hdata[ridx] = HashOp::NULL_HASH; |
122 | } |
123 | // Empty or NULL non-first elements have no effect. |
124 | } |
125 | |
126 | count = remaining; |
127 | if (count == 0) { |
128 | return; |
129 | } |
130 | |
131 | // Merge the first position hash into the main hash |
132 | idx_t position = 1; |
133 | if (FIRST_HASH) { |
134 | remaining = 0; |
135 | for (idx_t i = 0; i < count; ++i) { |
136 | const auto ridx = unprocessed.get_index(idx: i); |
137 | const auto cidx = cursor.get_index(idx: ridx); |
138 | hdata[ridx] = chdata[cidx]; |
139 | |
140 | const auto lidx = idata.sel->get_index(idx: ridx); |
141 | const auto &entry = ldata[lidx]; |
142 | if (entry.length > position) { |
143 | // Entry still has values to hash |
144 | unprocessed.set_index(idx: remaining++, loc: ridx); |
145 | cursor.set_index(idx: ridx, loc: cidx + 1); |
146 | } |
147 | } |
148 | count = remaining; |
149 | if (count == 0) { |
150 | return; |
151 | } |
152 | ++position; |
153 | } |
154 | |
155 | // Combine the hashes for the remaining positions until there are none left |
156 | for (;; ++position) { |
157 | remaining = 0; |
158 | for (idx_t i = 0; i < count; ++i) { |
159 | const auto ridx = unprocessed.get_index(idx: i); |
160 | const auto cidx = cursor.get_index(idx: ridx); |
161 | hdata[ridx] = CombineHashScalar(a: hdata[ridx], b: chdata[cidx]); |
162 | |
163 | const auto lidx = idata.sel->get_index(idx: ridx); |
164 | const auto &entry = ldata[lidx]; |
165 | if (entry.length > position) { |
166 | // Entry still has values to hash |
167 | unprocessed.set_index(idx: remaining++, loc: ridx); |
168 | cursor.set_index(idx: ridx, loc: cidx + 1); |
169 | } |
170 | } |
171 | |
172 | count = remaining; |
173 | if (count == 0) { |
174 | break; |
175 | } |
176 | } |
177 | } |
178 | |
179 | template <bool HAS_RSEL> |
180 | static inline void HashTypeSwitch(Vector &input, Vector &result, const SelectionVector *rsel, idx_t count) { |
181 | D_ASSERT(result.GetType().id() == LogicalType::HASH); |
182 | switch (input.GetType().InternalType()) { |
183 | case PhysicalType::BOOL: |
184 | case PhysicalType::INT8: |
185 | TemplatedLoopHash<HAS_RSEL, int8_t>(input, result, rsel, count); |
186 | break; |
187 | case PhysicalType::INT16: |
188 | TemplatedLoopHash<HAS_RSEL, int16_t>(input, result, rsel, count); |
189 | break; |
190 | case PhysicalType::INT32: |
191 | TemplatedLoopHash<HAS_RSEL, int32_t>(input, result, rsel, count); |
192 | break; |
193 | case PhysicalType::INT64: |
194 | TemplatedLoopHash<HAS_RSEL, int64_t>(input, result, rsel, count); |
195 | break; |
196 | case PhysicalType::UINT8: |
197 | TemplatedLoopHash<HAS_RSEL, uint8_t>(input, result, rsel, count); |
198 | break; |
199 | case PhysicalType::UINT16: |
200 | TemplatedLoopHash<HAS_RSEL, uint16_t>(input, result, rsel, count); |
201 | break; |
202 | case PhysicalType::UINT32: |
203 | TemplatedLoopHash<HAS_RSEL, uint32_t>(input, result, rsel, count); |
204 | break; |
205 | case PhysicalType::UINT64: |
206 | TemplatedLoopHash<HAS_RSEL, uint64_t>(input, result, rsel, count); |
207 | break; |
208 | case PhysicalType::INT128: |
209 | TemplatedLoopHash<HAS_RSEL, hugeint_t>(input, result, rsel, count); |
210 | break; |
211 | case PhysicalType::FLOAT: |
212 | TemplatedLoopHash<HAS_RSEL, float>(input, result, rsel, count); |
213 | break; |
214 | case PhysicalType::DOUBLE: |
215 | TemplatedLoopHash<HAS_RSEL, double>(input, result, rsel, count); |
216 | break; |
217 | case PhysicalType::INTERVAL: |
218 | TemplatedLoopHash<HAS_RSEL, interval_t>(input, result, rsel, count); |
219 | break; |
220 | case PhysicalType::VARCHAR: |
221 | TemplatedLoopHash<HAS_RSEL, string_t>(input, result, rsel, count); |
222 | break; |
223 | case PhysicalType::STRUCT: |
224 | StructLoopHash<HAS_RSEL, true>(input, result, rsel, count); |
225 | break; |
226 | case PhysicalType::LIST: |
227 | ListLoopHash<HAS_RSEL, true>(input, result, rsel, count); |
228 | break; |
229 | default: |
230 | throw InvalidTypeException(input.GetType(), "Invalid type for hash" ); |
231 | } |
232 | } |
233 | |
234 | void VectorOperations::Hash(Vector &input, Vector &result, idx_t count) { |
235 | HashTypeSwitch<false>(input, result, rsel: nullptr, count); |
236 | } |
237 | |
238 | void VectorOperations::Hash(Vector &input, Vector &result, const SelectionVector &sel, idx_t count) { |
239 | HashTypeSwitch<true>(input, result, rsel: &sel, count); |
240 | } |
241 | |
242 | template <bool HAS_RSEL, class T> |
243 | static inline void TightLoopCombineHashConstant(const T *__restrict ldata, hash_t constant_hash, |
244 | hash_t *__restrict hash_data, const SelectionVector *rsel, idx_t count, |
245 | const SelectionVector *__restrict sel_vector, ValidityMask &mask) { |
246 | if (!mask.AllValid()) { |
247 | for (idx_t i = 0; i < count; i++) { |
248 | auto ridx = HAS_RSEL ? rsel->get_index(idx: i) : i; |
249 | auto idx = sel_vector->get_index(idx: ridx); |
250 | auto other_hash = HashOp::Operation(ldata[idx], !mask.RowIsValid(row_idx: idx)); |
251 | hash_data[ridx] = CombineHashScalar(constant_hash, other_hash); |
252 | } |
253 | } else { |
254 | for (idx_t i = 0; i < count; i++) { |
255 | auto ridx = HAS_RSEL ? rsel->get_index(idx: i) : i; |
256 | auto idx = sel_vector->get_index(idx: ridx); |
257 | auto other_hash = duckdb::Hash<T>(ldata[idx]); |
258 | hash_data[ridx] = CombineHashScalar(constant_hash, other_hash); |
259 | } |
260 | } |
261 | } |
262 | |
263 | template <bool HAS_RSEL, class T> |
264 | static inline void TightLoopCombineHash(const T *__restrict ldata, hash_t *__restrict hash_data, |
265 | const SelectionVector *rsel, idx_t count, |
266 | const SelectionVector *__restrict sel_vector, ValidityMask &mask) { |
267 | if (!mask.AllValid()) { |
268 | for (idx_t i = 0; i < count; i++) { |
269 | auto ridx = HAS_RSEL ? rsel->get_index(idx: i) : i; |
270 | auto idx = sel_vector->get_index(idx: ridx); |
271 | auto other_hash = HashOp::Operation(ldata[idx], !mask.RowIsValid(row_idx: idx)); |
272 | hash_data[ridx] = CombineHashScalar(hash_data[ridx], other_hash); |
273 | } |
274 | } else { |
275 | for (idx_t i = 0; i < count; i++) { |
276 | auto ridx = HAS_RSEL ? rsel->get_index(idx: i) : i; |
277 | auto idx = sel_vector->get_index(idx: ridx); |
278 | auto other_hash = duckdb::Hash<T>(ldata[idx]); |
279 | hash_data[ridx] = CombineHashScalar(hash_data[ridx], other_hash); |
280 | } |
281 | } |
282 | } |
283 | |
284 | template <bool HAS_RSEL, class T> |
285 | void TemplatedLoopCombineHash(Vector &input, Vector &hashes, const SelectionVector *rsel, idx_t count) { |
286 | if (input.GetVectorType() == VectorType::CONSTANT_VECTOR && hashes.GetVectorType() == VectorType::CONSTANT_VECTOR) { |
287 | auto ldata = ConstantVector::GetData<T>(input); |
288 | auto hash_data = ConstantVector::GetData<hash_t>(vector&: hashes); |
289 | |
290 | auto other_hash = HashOp::Operation(*ldata, ConstantVector::IsNull(vector: input)); |
291 | *hash_data = CombineHashScalar(*hash_data, other_hash); |
292 | } else { |
293 | UnifiedVectorFormat idata; |
294 | input.ToUnifiedFormat(count, data&: idata); |
295 | if (hashes.GetVectorType() == VectorType::CONSTANT_VECTOR) { |
296 | // mix constant with non-constant, first get the constant value |
297 | auto constant_hash = *ConstantVector::GetData<hash_t>(vector&: hashes); |
298 | // now re-initialize the hashes vector to an empty flat vector |
299 | hashes.SetVectorType(VectorType::FLAT_VECTOR); |
300 | TightLoopCombineHashConstant<HAS_RSEL, T>(UnifiedVectorFormat::GetData<T>(idata), constant_hash, |
301 | FlatVector::GetData<hash_t>(vector&: hashes), rsel, count, idata.sel, |
302 | idata.validity); |
303 | } else { |
304 | D_ASSERT(hashes.GetVectorType() == VectorType::FLAT_VECTOR); |
305 | TightLoopCombineHash<HAS_RSEL, T>(UnifiedVectorFormat::GetData<T>(idata), |
306 | FlatVector::GetData<hash_t>(vector&: hashes), rsel, count, idata.sel, |
307 | idata.validity); |
308 | } |
309 | } |
310 | } |
311 | |
312 | template <bool HAS_RSEL> |
313 | static inline void CombineHashTypeSwitch(Vector &hashes, Vector &input, const SelectionVector *rsel, idx_t count) { |
314 | D_ASSERT(hashes.GetType().id() == LogicalType::HASH); |
315 | switch (input.GetType().InternalType()) { |
316 | case PhysicalType::BOOL: |
317 | case PhysicalType::INT8: |
318 | TemplatedLoopCombineHash<HAS_RSEL, int8_t>(input, hashes, rsel, count); |
319 | break; |
320 | case PhysicalType::INT16: |
321 | TemplatedLoopCombineHash<HAS_RSEL, int16_t>(input, hashes, rsel, count); |
322 | break; |
323 | case PhysicalType::INT32: |
324 | TemplatedLoopCombineHash<HAS_RSEL, int32_t>(input, hashes, rsel, count); |
325 | break; |
326 | case PhysicalType::INT64: |
327 | TemplatedLoopCombineHash<HAS_RSEL, int64_t>(input, hashes, rsel, count); |
328 | break; |
329 | case PhysicalType::UINT8: |
330 | TemplatedLoopCombineHash<HAS_RSEL, uint8_t>(input, hashes, rsel, count); |
331 | break; |
332 | case PhysicalType::UINT16: |
333 | TemplatedLoopCombineHash<HAS_RSEL, uint16_t>(input, hashes, rsel, count); |
334 | break; |
335 | case PhysicalType::UINT32: |
336 | TemplatedLoopCombineHash<HAS_RSEL, uint32_t>(input, hashes, rsel, count); |
337 | break; |
338 | case PhysicalType::UINT64: |
339 | TemplatedLoopCombineHash<HAS_RSEL, uint64_t>(input, hashes, rsel, count); |
340 | break; |
341 | case PhysicalType::INT128: |
342 | TemplatedLoopCombineHash<HAS_RSEL, hugeint_t>(input, hashes, rsel, count); |
343 | break; |
344 | case PhysicalType::FLOAT: |
345 | TemplatedLoopCombineHash<HAS_RSEL, float>(input, hashes, rsel, count); |
346 | break; |
347 | case PhysicalType::DOUBLE: |
348 | TemplatedLoopCombineHash<HAS_RSEL, double>(input, hashes, rsel, count); |
349 | break; |
350 | case PhysicalType::INTERVAL: |
351 | TemplatedLoopCombineHash<HAS_RSEL, interval_t>(input, hashes, rsel, count); |
352 | break; |
353 | case PhysicalType::VARCHAR: |
354 | TemplatedLoopCombineHash<HAS_RSEL, string_t>(input, hashes, rsel, count); |
355 | break; |
356 | case PhysicalType::STRUCT: |
357 | StructLoopHash<HAS_RSEL, false>(input, hashes, rsel, count); |
358 | break; |
359 | case PhysicalType::LIST: |
360 | ListLoopHash<HAS_RSEL, false>(input, hashes, rsel, count); |
361 | break; |
362 | default: |
363 | throw InvalidTypeException(input.GetType(), "Invalid type for hash" ); |
364 | } |
365 | } |
366 | |
367 | void VectorOperations::CombineHash(Vector &hashes, Vector &input, idx_t count) { |
368 | CombineHashTypeSwitch<false>(hashes, input, rsel: nullptr, count); |
369 | } |
370 | |
371 | void VectorOperations::CombineHash(Vector &hashes, Vector &input, const SelectionVector &rsel, idx_t count) { |
372 | CombineHashTypeSwitch<true>(hashes, input, rsel: &rsel, count); |
373 | } |
374 | |
375 | } // namespace duckdb |
376 | |