1#pragma once
2
3
4#include <Common/HashTable/HashTable.h>
5#include <Common/HashTable/HashTableKeyHolder.h>
6#include <Common/ColumnsHashingImpl.h>
7#include <Common/Arena.h>
8#include <Common/LRUCache.h>
9#include <Common/assert_cast.h>
10#include <common/unaligned.h>
11
12#include <Columns/ColumnString.h>
13#include <Columns/ColumnFixedString.h>
14#include <Columns/ColumnLowCardinality.h>
15
16#include <Core/Defines.h>
17#include <memory>
18
19namespace DB
20{
21
22namespace ColumnsHashing
23{
24
25/// For the case when there is one numeric key.
26/// UInt8/16/32/64 for any type with corresponding bit width.
27template <typename Value, typename Mapped, typename FieldType, bool use_cache = true>
28struct HashMethodOneNumber
29 : public columns_hashing_impl::HashMethodBase<HashMethodOneNumber<Value, Mapped, FieldType, use_cache>, Value, Mapped, use_cache>
30{
31 using Self = HashMethodOneNumber<Value, Mapped, FieldType, use_cache>;
32 using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
33
34 const char * vec;
35
36 /// If the keys of a fixed length then key_sizes contains their lengths, empty otherwise.
37 HashMethodOneNumber(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &)
38 {
39 vec = key_columns[0]->getRawData().data;
40 }
41
42 HashMethodOneNumber(const IColumn * column)
43 {
44 vec = column->getRawData().data;
45 }
46
47 /// Creates context. Method is called once and result context is used in all threads.
48 using Base::createContext; /// (const HashMethodContext::Settings &) -> HashMethodContextPtr
49
50 /// Emplace key into HashTable or HashMap. If Data is HashMap, returns ptr to value, otherwise nullptr.
51 /// Data is a HashTable where to insert key from column's row.
52 /// For Serialized method, key may be placed in pool.
53 using Base::emplaceKey; /// (Data & data, size_t row, Arena & pool) -> EmplaceResult
54
55 /// Find key into HashTable or HashMap. If Data is HashMap and key was found, returns ptr to value, otherwise nullptr.
56 using Base::findKey; /// (Data & data, size_t row, Arena & pool) -> FindResult
57
58 /// Get hash value of row.
59 using Base::getHash; /// (const Data & data, size_t row, Arena & pool) -> size_t
60
61 /// Is used for default implementation in HashMethodBase.
62 FieldType getKeyHolder(size_t row, Arena &) const { return unalignedLoad<FieldType>(vec + row * sizeof(FieldType)); }
63};
64
65
66/// For the case when there is one string key.
67template <typename Value, typename Mapped, bool place_string_to_arena = true, bool use_cache = true>
68struct HashMethodString
69 : public columns_hashing_impl::HashMethodBase<HashMethodString<Value, Mapped, place_string_to_arena, use_cache>, Value, Mapped, use_cache>
70{
71 using Self = HashMethodString<Value, Mapped, place_string_to_arena, use_cache>;
72 using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
73
74 const IColumn::Offset * offsets;
75 const UInt8 * chars;
76
77 HashMethodString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &)
78 {
79 const IColumn & column = *key_columns[0];
80 const ColumnString & column_string = assert_cast<const ColumnString &>(column);
81 offsets = column_string.getOffsets().data();
82 chars = column_string.getChars().data();
83 }
84
85 auto getKeyHolder(ssize_t row, [[maybe_unused]] Arena & pool) const
86 {
87 StringRef key(chars + offsets[row - 1], offsets[row] - offsets[row - 1] - 1);
88
89 if constexpr (place_string_to_arena)
90 {
91 return ArenaKeyHolder{key, pool};
92 }
93 else
94 {
95 return key;
96 }
97 }
98
99protected:
100 friend class columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
101};
102
103
104/// For the case when there is one fixed-length string key.
105template <typename Value, typename Mapped, bool place_string_to_arena = true, bool use_cache = true>
106struct HashMethodFixedString
107 : public columns_hashing_impl::HashMethodBase<HashMethodFixedString<Value, Mapped, place_string_to_arena, use_cache>, Value, Mapped, use_cache>
108{
109 using Self = HashMethodFixedString<Value, Mapped, place_string_to_arena, use_cache>;
110 using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
111
112 size_t n;
113 const ColumnFixedString::Chars * chars;
114
115 HashMethodFixedString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &)
116 {
117 const IColumn & column = *key_columns[0];
118 const ColumnFixedString & column_string = assert_cast<const ColumnFixedString &>(column);
119 n = column_string.getN();
120 chars = &column_string.getChars();
121 }
122
123 auto getKeyHolder(size_t row, [[maybe_unused]] Arena & pool) const
124 {
125 StringRef key(&(*chars)[row * n], n);
126
127 if constexpr (place_string_to_arena)
128 {
129 return ArenaKeyHolder{key, pool};
130 }
131 else
132 {
133 return key;
134 }
135 }
136
137protected:
138 friend class columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
139};
140
141
142/// Cache stores dictionaries and saved_hash per dictionary key.
143class LowCardinalityDictionaryCache : public HashMethodContext
144{
145public:
146 /// Will assume that dictionaries with same hash has the same keys.
147 /// Just in case, check that they have also the same size.
148 struct DictionaryKey
149 {
150 UInt128 hash;
151 UInt64 size;
152
153 bool operator== (const DictionaryKey & other) const { return hash == other.hash && size == other.size; }
154 };
155
156 struct DictionaryKeyHash
157 {
158 size_t operator()(const DictionaryKey & key) const
159 {
160 SipHash hash;
161 hash.update(key.hash.low);
162 hash.update(key.hash.high);
163 hash.update(key.size);
164 return hash.get64();
165 }
166 };
167
168 struct CachedValues
169 {
170 /// Store ptr to dictionary to be sure it won't be deleted.
171 ColumnPtr dictionary_holder;
172 /// Hashes for dictionary keys.
173 const UInt64 * saved_hash = nullptr;
174 };
175
176 using CachedValuesPtr = std::shared_ptr<CachedValues>;
177
178 explicit LowCardinalityDictionaryCache(const HashMethodContext::Settings & settings) : cache(settings.max_threads) {}
179
180 CachedValuesPtr get(const DictionaryKey & key) { return cache.get(key); }
181 void set(const DictionaryKey & key, const CachedValuesPtr & mapped) { cache.set(key, mapped); }
182
183private:
184 using Cache = LRUCache<DictionaryKey, CachedValues, DictionaryKeyHash>;
185 Cache cache;
186};
187
188
189/// Single low cardinality column.
190template <typename SingleColumnMethod, typename Mapped, bool use_cache>
191struct HashMethodSingleLowCardinalityColumn : public SingleColumnMethod
192{
193 using Base = SingleColumnMethod;
194
195 enum class VisitValue
196 {
197 Empty = 0,
198 Found = 1,
199 NotFound = 2,
200 };
201
202 static constexpr bool has_mapped = !std::is_same<Mapped, void>::value;
203 using EmplaceResult = columns_hashing_impl::EmplaceResultImpl<Mapped>;
204 using FindResult = columns_hashing_impl::FindResultImpl<Mapped>;
205
206 static HashMethodContextPtr createContext(const HashMethodContext::Settings & settings)
207 {
208 return std::make_shared<LowCardinalityDictionaryCache>(settings);
209 }
210
211 ColumnRawPtrs key_columns;
212 const IColumn * positions = nullptr;
213 size_t size_of_index_type = 0;
214
215 /// saved hash is from current column or from cache.
216 const UInt64 * saved_hash = nullptr;
217 /// Hold dictionary in case saved_hash is from cache to be sure it won't be deleted.
218 ColumnPtr dictionary_holder;
219
220 /// Cache AggregateDataPtr for current column in order to decrease the number of hash table usages.
221 columns_hashing_impl::MappedCache<Mapped> mapped_cache;
222 PaddedPODArray<VisitValue> visit_cache;
223
224 /// If initialized column is nullable.
225 bool is_nullable = false;
226
227 static const ColumnLowCardinality & getLowCardinalityColumn(const IColumn * low_cardinality_column)
228 {
229 auto column = typeid_cast<const ColumnLowCardinality *>(low_cardinality_column);
230 if (!column)
231 throw Exception("Invalid aggregation key type for HashMethodSingleLowCardinalityColumn method. "
232 "Excepted LowCardinality, got " + column->getName(), ErrorCodes::LOGICAL_ERROR);
233 return *column;
234 }
235
236 HashMethodSingleLowCardinalityColumn(
237 const ColumnRawPtrs & key_columns_low_cardinality, const Sizes & key_sizes, const HashMethodContextPtr & context)
238 : Base({getLowCardinalityColumn(key_columns_low_cardinality[0]).getDictionary().getNestedNotNullableColumn().get()}, key_sizes, context)
239 {
240 auto column = &getLowCardinalityColumn(key_columns_low_cardinality[0]);
241
242 if (!context)
243 throw Exception("Cache wasn't created for HashMethodSingleLowCardinalityColumn",
244 ErrorCodes::LOGICAL_ERROR);
245
246 LowCardinalityDictionaryCache * lcd_cache;
247 if constexpr (use_cache)
248 {
249 lcd_cache = typeid_cast<LowCardinalityDictionaryCache *>(context.get());
250 if (!lcd_cache)
251 {
252 const auto & cached_val = *context;
253 throw Exception("Invalid type for HashMethodSingleLowCardinalityColumn cache: "
254 + demangle(typeid(cached_val).name()), ErrorCodes::LOGICAL_ERROR);
255 }
256 }
257
258 auto * dict = column->getDictionary().getNestedNotNullableColumn().get();
259 is_nullable = column->getDictionary().nestedColumnIsNullable();
260 key_columns = {dict};
261 bool is_shared_dict = column->isSharedDictionary();
262
263 typename LowCardinalityDictionaryCache::DictionaryKey dictionary_key;
264 typename LowCardinalityDictionaryCache::CachedValuesPtr cached_values;
265
266 if (is_shared_dict)
267 {
268 dictionary_key = {column->getDictionary().getHash(), dict->size()};
269 if constexpr (use_cache)
270 cached_values = lcd_cache->get(dictionary_key);
271 }
272
273 if (cached_values)
274 {
275 saved_hash = cached_values->saved_hash;
276 dictionary_holder = cached_values->dictionary_holder;
277 }
278 else
279 {
280 saved_hash = column->getDictionary().tryGetSavedHash();
281 dictionary_holder = column->getDictionaryPtr();
282
283 if constexpr (use_cache)
284 {
285 if (is_shared_dict)
286 {
287 cached_values = std::make_shared<typename LowCardinalityDictionaryCache::CachedValues>();
288 cached_values->saved_hash = saved_hash;
289 cached_values->dictionary_holder = dictionary_holder;
290
291 lcd_cache->set(dictionary_key, cached_values);
292 }
293 }
294 }
295
296 if constexpr (has_mapped)
297 mapped_cache.resize(key_columns[0]->size());
298
299 VisitValue empty(VisitValue::Empty);
300 visit_cache.assign(key_columns[0]->size(), empty);
301
302 size_of_index_type = column->getSizeOfIndexType();
303 positions = column->getIndexesPtr().get();
304 }
305
306 ALWAYS_INLINE size_t getIndexAt(size_t row) const
307 {
308 switch (size_of_index_type)
309 {
310 case sizeof(UInt8): return assert_cast<const ColumnUInt8 *>(positions)->getElement(row);
311 case sizeof(UInt16): return assert_cast<const ColumnUInt16 *>(positions)->getElement(row);
312 case sizeof(UInt32): return assert_cast<const ColumnUInt32 *>(positions)->getElement(row);
313 case sizeof(UInt64): return assert_cast<const ColumnUInt64 *>(positions)->getElement(row);
314 default: throw Exception("Unexpected size of index type for low cardinality column.", ErrorCodes::LOGICAL_ERROR);
315 }
316 }
317
318 /// Get the key holder from the key columns for insertion into the hash table.
319 ALWAYS_INLINE auto getKeyHolder(size_t row, Arena & pool) const
320 {
321 return Base::getKeyHolder(getIndexAt(row), pool);
322 }
323
324 template <typename Data>
325 ALWAYS_INLINE EmplaceResult emplaceKey(Data & data, size_t row_, Arena & pool)
326 {
327 size_t row = getIndexAt(row_);
328
329 if (is_nullable && row == 0)
330 {
331 visit_cache[row] = VisitValue::Found;
332 bool has_null_key = data.hasNullKeyData();
333 data.hasNullKeyData() = true;
334
335 if constexpr (has_mapped)
336 return EmplaceResult(data.getNullKeyData(), mapped_cache[0], !has_null_key);
337 else
338 return EmplaceResult(!has_null_key);
339 }
340
341 if (visit_cache[row] == VisitValue::Found)
342 {
343 if constexpr (has_mapped)
344 return EmplaceResult(mapped_cache[row], mapped_cache[row], false);
345 else
346 return EmplaceResult(false);
347 }
348
349 auto key_holder = getKeyHolder(row_, pool);
350
351 bool inserted = false;
352 typename Data::LookupResult it;
353 if (saved_hash)
354 data.emplace(key_holder, it, inserted, saved_hash[row]);
355 else
356 data.emplace(key_holder, it, inserted);
357
358 visit_cache[row] = VisitValue::Found;
359
360 if constexpr (has_mapped)
361 {
362 auto & mapped = it->getMapped();
363 if (inserted)
364 {
365 new (&mapped) Mapped();
366 }
367 mapped_cache[row] = mapped;
368 return EmplaceResult(mapped, mapped_cache[row], inserted);
369 }
370 else
371 return EmplaceResult(inserted);
372 }
373
374 ALWAYS_INLINE bool isNullAt(size_t i)
375 {
376 if (!is_nullable)
377 return false;
378
379 return getIndexAt(i) == 0;
380 }
381
382 template <typename Data>
383 ALWAYS_INLINE FindResult findFromRow(Data & data, size_t row_, Arena & pool)
384 {
385 size_t row = getIndexAt(row_);
386
387 if (is_nullable && row == 0)
388 {
389 if constexpr (has_mapped)
390 return FindResult(data.hasNullKeyData() ? &data.getNullKeyData() : nullptr, data.hasNullKeyData());
391 else
392 return FindResult(data.hasNullKeyData());
393 }
394
395 if (visit_cache[row] != VisitValue::Empty)
396 {
397 if constexpr (has_mapped)
398 return FindResult(&mapped_cache[row], visit_cache[row] == VisitValue::Found);
399 else
400 return FindResult(visit_cache[row] == VisitValue::Found);
401 }
402
403 auto key_holder = getKeyHolder(row_, pool);
404
405 typename Data::iterator it;
406 if (saved_hash)
407 it = data.find(*key_holder, saved_hash[row]);
408 else
409 it = data.find(*key_holder);
410
411 bool found = it != data.end();
412 visit_cache[row] = found ? VisitValue::Found : VisitValue::NotFound;
413
414 if constexpr (has_mapped)
415 {
416 if (found)
417 mapped_cache[row] = it->second;
418 }
419
420 if constexpr (has_mapped)
421 return FindResult(&mapped_cache[row], found);
422 else
423 return FindResult(found);
424 }
425
426 template <typename Data>
427 ALWAYS_INLINE size_t getHash(const Data & data, size_t row, Arena & pool)
428 {
429 row = getIndexAt(row);
430 if (saved_hash)
431 return saved_hash[row];
432
433 return Base::getHash(data, row, pool);
434 }
435};
436
437
438// Optional mask for low cardinality columns.
439template <bool has_low_cardinality>
440struct LowCardinalityKeys
441{
442 ColumnRawPtrs nested_columns;
443 ColumnRawPtrs positions;
444 Sizes position_sizes;
445};
446
447template <>
448struct LowCardinalityKeys<false> {};
449
450/// For the case when all keys are of fixed length, and they fit in N (for example, 128) bits.
451template <typename Value, typename Key, typename Mapped, bool has_nullable_keys_ = false, bool has_low_cardinality_ = false, bool use_cache = true>
452struct HashMethodKeysFixed
453 : private columns_hashing_impl::BaseStateKeysFixed<Key, has_nullable_keys_>
454 , public columns_hashing_impl::HashMethodBase<HashMethodKeysFixed<Value, Key, Mapped, has_nullable_keys_, has_low_cardinality_, use_cache>, Value, Mapped, use_cache>
455{
456 using Self = HashMethodKeysFixed<Value, Key, Mapped, has_nullable_keys_, has_low_cardinality_, use_cache>;
457 using BaseHashed = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
458 using Base = columns_hashing_impl::BaseStateKeysFixed<Key, has_nullable_keys_>;
459
460 static constexpr bool has_nullable_keys = has_nullable_keys_;
461 static constexpr bool has_low_cardinality = has_low_cardinality_;
462
463 LowCardinalityKeys<has_low_cardinality> low_cardinality_keys;
464 Sizes key_sizes;
465 size_t keys_size;
466
467 HashMethodKeysFixed(const ColumnRawPtrs & key_columns, const Sizes & key_sizes_, const HashMethodContextPtr &)
468 : Base(key_columns), key_sizes(std::move(key_sizes_)), keys_size(key_columns.size())
469 {
470 if constexpr (has_low_cardinality)
471 {
472 low_cardinality_keys.nested_columns.resize(key_columns.size());
473 low_cardinality_keys.positions.assign(key_columns.size(), nullptr);
474 low_cardinality_keys.position_sizes.resize(key_columns.size());
475 for (size_t i = 0; i < key_columns.size(); ++i)
476 {
477 if (auto * low_cardinality_col = typeid_cast<const ColumnLowCardinality *>(key_columns[i]))
478 {
479 low_cardinality_keys.nested_columns[i] = low_cardinality_col->getDictionary().getNestedColumn().get();
480 low_cardinality_keys.positions[i] = &low_cardinality_col->getIndexes();
481 low_cardinality_keys.position_sizes[i] = low_cardinality_col->getSizeOfIndexType();
482 }
483 else
484 low_cardinality_keys.nested_columns[i] = key_columns[i];
485 }
486 }
487 }
488
489 ALWAYS_INLINE Key getKeyHolder(size_t row, Arena &) const
490 {
491 if constexpr (has_nullable_keys)
492 {
493 auto bitmap = Base::createBitmap(row);
494 return packFixed<Key>(row, keys_size, Base::getActualColumns(), key_sizes, bitmap);
495 }
496 else
497 {
498 if constexpr (has_low_cardinality)
499 return packFixed<Key, true>(row, keys_size, low_cardinality_keys.nested_columns, key_sizes,
500 &low_cardinality_keys.positions, &low_cardinality_keys.position_sizes);
501
502 return packFixed<Key>(row, keys_size, Base::getActualColumns(), key_sizes);
503 }
504 }
505};
506
507/** Hash by concatenating serialized key values.
508 * The serialized value differs in that it uniquely allows to deserialize it, having only the position with which it starts.
509 * That is, for example, for strings, it contains first the serialized length of the string, and then the bytes.
510 * Therefore, when aggregating by several strings, there is no ambiguity.
511 */
512template <typename Value, typename Mapped>
513struct HashMethodSerialized
514 : public columns_hashing_impl::HashMethodBase<HashMethodSerialized<Value, Mapped>, Value, Mapped, false>
515{
516 using Self = HashMethodSerialized<Value, Mapped>;
517 using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;
518
519 ColumnRawPtrs key_columns;
520 size_t keys_size;
521
522 HashMethodSerialized(const ColumnRawPtrs & key_columns_, const Sizes & /*key_sizes*/, const HashMethodContextPtr &)
523 : key_columns(key_columns_), keys_size(key_columns_.size()) {}
524
525protected:
526 friend class columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;
527
528 ALWAYS_INLINE SerializedKeyHolder getKeyHolder(size_t row, Arena & pool) const
529 {
530 return SerializedKeyHolder{
531 serializeKeysToPoolContiguous(row, keys_size, key_columns, pool),
532 pool};
533 }
534};
535
536/// For the case when there is one string key.
537template <typename Value, typename Mapped, bool use_cache = true>
538struct HashMethodHashed
539 : public columns_hashing_impl::HashMethodBase<HashMethodHashed<Value, Mapped, use_cache>, Value, Mapped, use_cache>
540{
541 using Key = UInt128;
542 using Self = HashMethodHashed<Value, Mapped, use_cache>;
543 using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
544
545 ColumnRawPtrs key_columns;
546
547 HashMethodHashed(ColumnRawPtrs key_columns_, const Sizes &, const HashMethodContextPtr &)
548 : key_columns(std::move(key_columns_)) {}
549
550 ALWAYS_INLINE Key getKeyHolder(size_t row, Arena &) const
551 {
552 return hash128(row, key_columns.size(), key_columns);
553 }
554};
555
556}
557}
558