1#pragma once
2
3#include <Common/HashTable/Hash.h>
4#include <Common/HashTable/HashTable.h>
5#include <Common/HashTable/HashTableAllocator.h>
6#include <Columns/ColumnString.h>
7#include <Columns/ColumnsNumber.h>
8#include <Common/assert_cast.h>
9#include <ext/range.h>
10#include <common/unaligned.h>
11
12
13namespace DB
14{
15
16namespace
17{
18 template <typename ColumnType, bool with_saved_hash, bool has_base_index>
19 struct ReverseIndexHashTableState;
20
21 template <typename ColumnType>
22 struct ReverseIndexHashTableState<ColumnType, /* with_saved_hash */ false, /* has_base_index */ false>
23 {
24 constexpr static bool with_saved_hash = false;
25 constexpr static bool has_base_index = false;
26
27 ColumnType * index_column;
28 };
29
30 template <typename ColumnType>
31 struct ReverseIndexHashTableState<ColumnType, /* with_saved_hash */ false, /* has_base_index */ true>
32 {
33 constexpr static bool with_saved_hash = false;
34 constexpr static bool has_base_index = true;
35
36 ColumnType * index_column;
37 size_t base_index;
38 };
39
40 template <typename ColumnType>
41 struct ReverseIndexHashTableState<ColumnType, /* with_saved_hash = */ true, /* has_base_index */ false>
42 {
43 constexpr static bool with_saved_hash = true;
44 constexpr static bool has_base_index = false;
45
46 ColumnType * index_column;
47 typename ColumnVector<UInt64>::Container * saved_hash_column;
48 };
49
50 template <typename ColumnType>
51 struct ReverseIndexHashTableState<ColumnType, /* with_saved_hash = */ true, /* has_base_index */ true>
52 {
53 constexpr static bool with_saved_hash = true;
54 constexpr static bool has_base_index = true;
55
56 ColumnType * index_column;
57 typename ColumnVector<UInt64>::Container * saved_hash_column;
58 size_t base_index;
59 };
60
61
62 struct ReverseIndexHash
63 {
64 template <typename T>
65 size_t operator()(T) const
66 {
67 throw Exception("operator()(key) is not implemented for ReverseIndexHash.", ErrorCodes::LOGICAL_ERROR);
68 }
69 };
70
71 template <typename IndexType, typename Hash, typename HashTable, typename ColumnType, bool string_hash, bool has_base_index>
72 struct ReverseIndexHashTableCell
73 : public HashTableCell<IndexType, Hash, ReverseIndexHashTableState<ColumnType, string_hash, has_base_index>>
74 {
75 using Base = HashTableCell<IndexType, Hash, ReverseIndexHashTableState<ColumnType, string_hash, has_base_index>>;
76 using State = typename Base::State;
77 using Base::Base;
78 using Base::key;
79 using Base::keyEquals;
80 using Base::isZero;
81
82 template <typename T>
83 static bool isZero(const T &, const State & /*state*/)
84 {
85 /// Careful: apparently this uses SFINAE to redefine isZero for all types
86 /// except the IndexType, for which the default ZeroTraits::isZero is used.
87 static_assert(!std::is_same_v<typename std::decay<T>::type, typename std::decay<IndexType>::type>);
88 return false;
89 }
90
91 /// Special case when we want to compare with something not in index_column.
92 /// When we compare something inside column default keyEquals checks only that row numbers are equal.
93 bool keyEquals(const StringRef & object, size_t hash_ [[maybe_unused]], const State & state) const
94 {
95 auto index = key;
96 if constexpr (has_base_index)
97 index -= state.base_index;
98
99 if constexpr (string_hash)
100 return hash_ == (*state.saved_hash_column)[index] && object == state.index_column->getDataAt(index);
101 else
102 return object == state.index_column->getDataAt(index);
103 }
104
105 size_t getHash(const Hash & hash) const
106 {
107 auto index = key;
108
109 /// Hack. HashTable is Hash itself.
110 const auto & state = static_cast<const State &>(static_cast<const HashTable &>(hash));
111
112 if constexpr (has_base_index)
113 index -= state.base_index;
114
115 if constexpr (string_hash)
116 return (*state.saved_hash_column)[index];
117 else
118 {
119 using ValueType = typename ColumnType::ValueType;
120 ValueType value = unalignedLoad<ValueType>(state.index_column->getDataAt(index).data);
121 return DefaultHash<ValueType>()(value);
122 }
123 }
124 };
125
126
127 /**
128 * ReverseIndexHashTableBase implements a special hash table interface for
129 * reverse index.
130 *
131 * The following requirements are different compared to a plain hash table:
132 *
133 * 1) Provide public access to 'hash table state' that contains
134 * additional data needed to calculate cell hashes.
135 *
136 * 2) Support emplace() and find() with a Key different from the resulting
137 * hash table key. This means emplace() accepts a different kind of object
138 * as a key, and then the real key can be read from the returned cell iterator.
139 *
140 * These requirements are unique to ReverseIndex and are in conflict with
141 * supporting hash tables that use alternative key storage, such as FixedHashMap
142 * or StringHashMap. Therefore, we implement an interface for ReverseIndex
143 * separately.
144 */
145 template <typename Key, typename Cell, typename Hash>
146 class ReverseIndexHashTableBase : public HashTable<Key, Cell, Hash, HashTableGrower<>, HashTableAllocator>
147 {
148 using State = typename Cell::State;
149 using Base = HashTable<Key, Cell, Hash, HashTableGrower<>, HashTableAllocator>;
150
151 public:
152 using Base::Base;
153 using iterator = typename Base::iterator;
154 using LookupResult = typename Base::LookupResult;
155 State & getState() { return *this; }
156
157
158 template <typename ObjectToCompareWith>
159 size_t ALWAYS_INLINE reverseIndexFindCell(const ObjectToCompareWith & x,
160 size_t hash_value, size_t place_value) const
161 {
162 while (!this->buf[place_value].isZero(*this)
163 && !this->buf[place_value].keyEquals(x, hash_value, *this))
164 {
165 place_value = this->grower.next(place_value);
166 }
167
168 return place_value;
169 }
170
171 template <typename ObjectToCompareWith>
172 void ALWAYS_INLINE reverseIndexEmplaceNonZero(const Key & key, LookupResult & it,
173 bool & inserted, size_t hash_value, const ObjectToCompareWith & object)
174 {
175 size_t place_value = reverseIndexFindCell(object, hash_value,
176 this->grower.place(hash_value));
177 // emplaceNonZeroImpl() might need to re-find the cell if the table grows,
178 // but it will find it correctly by the key alone, so we don't have to
179 // pass it the 'object'.
180 this->emplaceNonZeroImpl(place_value, key, it, inserted, hash_value);
181 }
182
183 /// Searches position by object.
184 template <typename ObjectToCompareWith>
185 void ALWAYS_INLINE reverseIndexEmplace(Key key, iterator & it, bool & inserted,
186 size_t hash_value, const ObjectToCompareWith& object)
187 {
188 LookupResult impl_it = nullptr;
189
190 if (!this->emplaceIfZero(key, impl_it, inserted, hash_value))
191 {
192 reverseIndexEmplaceNonZero(key, impl_it, inserted, hash_value, object);
193 }
194 assert(impl_it != nullptr);
195 it = iterator(this, impl_it);
196 }
197
198 template <typename ObjectToCompareWith>
199 iterator ALWAYS_INLINE reverseIndexFind(ObjectToCompareWith x, size_t hash_value)
200 {
201 if (Cell::isZero(x, *this))
202 return this->hasZero() ? this->iteratorToZero() : this->end();
203
204 size_t place_value = reverseIndexFindCell(x, hash_value,
205 this->grower.place(hash_value));
206 return !this->buf[place_value].isZero(*this)
207 ? iterator(this, &this->buf[place_value])
208 : this->end();
209 }
210 };
211
212 template <typename IndexType, typename ColumnType, bool has_base_index>
213 class ReverseIndexStringHashTable : public ReverseIndexHashTableBase<
214 IndexType,
215 ReverseIndexHashTableCell<
216 IndexType,
217 ReverseIndexHash,
218 ReverseIndexStringHashTable<IndexType, ColumnType, has_base_index>,
219 ColumnType,
220 true,
221 has_base_index>,
222 ReverseIndexHash>
223 {
224 using Base = ReverseIndexHashTableBase<
225 IndexType,
226 ReverseIndexHashTableCell<
227 IndexType,
228 ReverseIndexHash,
229 ReverseIndexStringHashTable<IndexType, ColumnType, has_base_index>,
230 ColumnType,
231 true,
232 has_base_index>,
233 ReverseIndexHash>;
234 public:
235 using Base::Base;
236 friend struct ReverseIndexHashTableCell<
237 IndexType,
238 ReverseIndexHash,
239 ReverseIndexStringHashTable<IndexType, ColumnType, has_base_index>,
240 ColumnType,
241 true,
242 has_base_index>;
243 };
244
245 template <typename IndexType, typename ColumnType, bool has_base_index>
246 class ReverseIndexNumberHashTable : public ReverseIndexHashTableBase<
247 IndexType,
248 ReverseIndexHashTableCell<
249 IndexType,
250 ReverseIndexHash,
251 ReverseIndexNumberHashTable<IndexType, ColumnType, has_base_index>,
252 ColumnType,
253 false,
254 has_base_index>,
255 ReverseIndexHash>
256 {
257 using Base = ReverseIndexHashTableBase<
258 IndexType,
259 ReverseIndexHashTableCell<
260 IndexType,
261 ReverseIndexHash,
262 ReverseIndexNumberHashTable<IndexType, ColumnType, has_base_index>,
263 ColumnType,
264 false,
265 has_base_index>,
266 ReverseIndexHash>;
267 public:
268 using Base::Base;
269 friend struct ReverseIndexHashTableCell<
270 IndexType,
271 ReverseIndexHash,
272 ReverseIndexNumberHashTable<IndexType, ColumnType, has_base_index>,
273 ColumnType,
274 false,
275 has_base_index>;
276 };
277
278
279 template <typename IndexType, typename ColumnType, bool has_base_index, bool is_numeric_column>
280 struct SelectReverseIndexHashTable;
281
282 template <typename IndexType, typename ColumnType, bool has_base_index>
283 struct SelectReverseIndexHashTable<IndexType, ColumnType, has_base_index, true>
284 {
285 using Type = ReverseIndexNumberHashTable<IndexType, ColumnType, has_base_index>;
286 };
287
288 template <typename IndexType, typename ColumnType, bool has_base_index>
289 struct SelectReverseIndexHashTable<IndexType, ColumnType, has_base_index, false>
290 {
291 using Type = ReverseIndexStringHashTable<IndexType, ColumnType, has_base_index>;
292 };
293
294
295 template <typename T>
296 constexpr bool isNumericColumn(const T *) { return false; }
297
298 template <typename T>
299 constexpr bool isNumericColumn(const ColumnVector<T> *) { return true; }
300
301 static_assert(isNumericColumn(static_cast<ColumnVector<UInt8> *>(nullptr)));
302 static_assert(!isNumericColumn(static_cast<ColumnString *>(nullptr)));
303
304
305 template <typename IndexType, typename ColumnType, bool has_base_index>
306 using ReverseIndexHashTable = typename SelectReverseIndexHashTable<IndexType, ColumnType, has_base_index,
307 isNumericColumn(static_cast<ColumnType *>(nullptr))>::Type;
308}
309
310
311template <typename IndexType, typename ColumnType>
312class ReverseIndex
313{
314public:
315 explicit ReverseIndex(UInt64 num_prefix_rows_to_skip_, UInt64 base_index_)
316 : num_prefix_rows_to_skip(num_prefix_rows_to_skip_), base_index(base_index_), saved_hash_ptr(nullptr) {}
317
318 void setColumn(ColumnType * column_);
319
320 static constexpr bool is_numeric_column = isNumericColumn(static_cast<ColumnType *>(nullptr));
321 static constexpr bool use_saved_hash = !is_numeric_column;
322
323 UInt64 insert(const StringRef & data);
324 UInt64 getInsertionPoint(const StringRef & data);
325 UInt64 lastInsertionPoint() const { return size() + base_index; }
326
327 ColumnType * getColumn() const { return column; }
328 size_t size() const;
329
330 const UInt64 * tryGetSavedHash() const
331 {
332 if (!use_saved_hash)
333 return nullptr;
334
335 UInt64 * ptr = saved_hash_ptr.load();
336 if (!ptr)
337 {
338 auto hash = calcHashes();
339 ptr = &hash->getData()[0];
340 UInt64 * expected = nullptr;
341 if (saved_hash_ptr.compare_exchange_strong(expected, ptr))
342 saved_hash = std::move(hash);
343 else
344 ptr = expected;
345 }
346
347 return ptr;
348 }
349
350 size_t allocatedBytes() const { return index ? index->getBufferSizeInBytes() : 0; }
351
352private:
353 ColumnType * column = nullptr;
354 UInt64 num_prefix_rows_to_skip; /// The number prefix tows in column which won't be sored at index.
355 UInt64 base_index; /// This values will be added to row number which is inserted into index.
356
357 using IndexMapType = ReverseIndexHashTable<IndexType, ColumnType, true>;
358
359 /// Lazy initialized.
360 std::unique_ptr<IndexMapType> index;
361 mutable ColumnUInt64::MutablePtr saved_hash;
362 mutable std::atomic<UInt64 *> saved_hash_ptr;
363
364 void buildIndex();
365
366 UInt64 getHash(const StringRef & ref) const
367 {
368 if constexpr (is_numeric_column)
369 {
370 using ValueType = typename ColumnType::ValueType;
371 ValueType value = unalignedLoad<ValueType>(ref.data);
372 return DefaultHash<ValueType>()(value);
373 }
374 else
375 return StringRefHash()(ref);
376 }
377
378 ColumnUInt64::MutablePtr calcHashes() const;
379};
380
381
382template <typename IndexType, typename ColumnType>
383void ReverseIndex<IndexType, ColumnType>:: setColumn(ColumnType * column_)
384{
385 if (column != column_)
386 {
387 index = nullptr;
388 saved_hash = nullptr;
389 }
390
391 column = column_;
392}
393
394template <typename IndexType, typename ColumnType>
395size_t ReverseIndex<IndexType, ColumnType>::size() const
396{
397 if (!column)
398 throw Exception("ReverseIndex has not size because index column wasn't set.", ErrorCodes::LOGICAL_ERROR);
399
400 return column->size();
401}
402
403template <typename IndexType, typename ColumnType>
404void ReverseIndex<IndexType, ColumnType>::buildIndex()
405{
406 if (index)
407 return;
408
409 if (!column)
410 throw Exception("ReverseIndex can't build index because index column wasn't set.", ErrorCodes::LOGICAL_ERROR);
411
412 auto size = column->size();
413 index = std::make_unique<IndexMapType>(size);
414
415 if constexpr (use_saved_hash)
416 saved_hash = calcHashes();
417
418 auto & state = index->getState();
419 state.index_column = column;
420 state.base_index = base_index;
421 if constexpr (use_saved_hash)
422 state.saved_hash_column = &saved_hash->getData();
423
424 using IteratorType = typename IndexMapType::iterator;
425 IteratorType iterator;
426 bool inserted;
427
428 for (auto row : ext::range(num_prefix_rows_to_skip, size))
429 {
430 UInt64 hash;
431 if constexpr (use_saved_hash)
432 hash = saved_hash->getElement(row);
433 else
434 hash = getHash(column->getDataAt(row));
435
436 index->reverseIndexEmplace(row + base_index, iterator, inserted, hash, column->getDataAt(row));
437
438 if (!inserted)
439 throw Exception("Duplicating keys found in ReverseIndex.", ErrorCodes::LOGICAL_ERROR);
440 }
441}
442
443template <typename IndexType, typename ColumnType>
444ColumnUInt64::MutablePtr ReverseIndex<IndexType, ColumnType>::calcHashes() const
445{
446 if (!column)
447 throw Exception("ReverseIndex can't build index because index column wasn't set.", ErrorCodes::LOGICAL_ERROR);
448
449 auto size = column->size();
450 auto hash = ColumnUInt64::create(size);
451
452 for (auto row : ext::range(0, size))
453 hash->getElement(row) = getHash(column->getDataAt(row));
454
455 return hash;
456}
457
458template <typename IndexType, typename ColumnType>
459UInt64 ReverseIndex<IndexType, ColumnType>::insert(const StringRef & data)
460{
461 if (!index)
462 buildIndex();
463
464 using IteratorType = typename IndexMapType::iterator;
465 IteratorType iterator;
466 bool inserted;
467
468 auto hash = getHash(data);
469 UInt64 num_rows = size();
470
471 if constexpr (use_saved_hash)
472 {
473 auto & column_data = saved_hash->getData();
474 if (column_data.size() <= num_rows)
475 column_data.resize(num_rows + 1);
476 column_data[num_rows] = hash;
477 }
478 else
479 column->insertData(data.data, data.size);
480
481 index->reverseIndexEmplace(num_rows + base_index, iterator, inserted, hash, data);
482
483 if constexpr (use_saved_hash)
484 {
485 if (inserted)
486 column->insertData(data.data, data.size);
487 }
488 else
489 {
490 if (!inserted)
491 column->popBack(1);
492 }
493
494 return iterator->getValue();
495}
496
497template <typename IndexType, typename ColumnType>
498UInt64 ReverseIndex<IndexType, ColumnType>::getInsertionPoint(const StringRef & data)
499{
500 if (!index)
501 buildIndex();
502
503 using IteratorType = typename IndexMapType::iterator;
504 IteratorType iterator;
505
506 auto hash = getHash(data);
507 iterator = index->reverseIndexFind(data, hash);
508
509 return iterator == index->end() ? size() + base_index : iterator->getValue();
510}
511
512}
513