1 | #pragma once |
2 | |
3 | #include <Common/HashTable/Hash.h> |
4 | #include <Common/HashTable/HashTable.h> |
5 | #include <Common/HashTable/HashTableAllocator.h> |
6 | #include <Columns/ColumnString.h> |
7 | #include <Columns/ColumnsNumber.h> |
8 | #include <Common/assert_cast.h> |
9 | #include <ext/range.h> |
10 | #include <common/unaligned.h> |
11 | |
12 | |
13 | namespace DB |
14 | { |
15 | |
16 | namespace |
17 | { |
18 | template <typename ColumnType, bool with_saved_hash, bool has_base_index> |
19 | struct ReverseIndexHashTableState; |
20 | |
21 | template <typename ColumnType> |
22 | struct ReverseIndexHashTableState<ColumnType, /* with_saved_hash */ false, /* has_base_index */ false> |
23 | { |
24 | constexpr static bool with_saved_hash = false; |
25 | constexpr static bool has_base_index = false; |
26 | |
27 | ColumnType * index_column; |
28 | }; |
29 | |
30 | template <typename ColumnType> |
31 | struct ReverseIndexHashTableState<ColumnType, /* with_saved_hash */ false, /* has_base_index */ true> |
32 | { |
33 | constexpr static bool with_saved_hash = false; |
34 | constexpr static bool has_base_index = true; |
35 | |
36 | ColumnType * index_column; |
37 | size_t base_index; |
38 | }; |
39 | |
40 | template <typename ColumnType> |
41 | struct ReverseIndexHashTableState<ColumnType, /* with_saved_hash = */ true, /* has_base_index */ false> |
42 | { |
43 | constexpr static bool with_saved_hash = true; |
44 | constexpr static bool has_base_index = false; |
45 | |
46 | ColumnType * index_column; |
47 | typename ColumnVector<UInt64>::Container * saved_hash_column; |
48 | }; |
49 | |
50 | template <typename ColumnType> |
51 | struct ReverseIndexHashTableState<ColumnType, /* with_saved_hash = */ true, /* has_base_index */ true> |
52 | { |
53 | constexpr static bool with_saved_hash = true; |
54 | constexpr static bool has_base_index = true; |
55 | |
56 | ColumnType * index_column; |
57 | typename ColumnVector<UInt64>::Container * saved_hash_column; |
58 | size_t base_index; |
59 | }; |
60 | |
61 | |
62 | struct ReverseIndexHash |
63 | { |
64 | template <typename T> |
65 | size_t operator()(T) const |
66 | { |
67 | throw Exception("operator()(key) is not implemented for ReverseIndexHash." , ErrorCodes::LOGICAL_ERROR); |
68 | } |
69 | }; |
70 | |
71 | template <typename IndexType, typename Hash, typename HashTable, typename ColumnType, bool string_hash, bool has_base_index> |
72 | struct ReverseIndexHashTableCell |
73 | : public HashTableCell<IndexType, Hash, ReverseIndexHashTableState<ColumnType, string_hash, has_base_index>> |
74 | { |
75 | using Base = HashTableCell<IndexType, Hash, ReverseIndexHashTableState<ColumnType, string_hash, has_base_index>>; |
76 | using State = typename Base::State; |
77 | using Base::Base; |
78 | using Base::key; |
79 | using Base::keyEquals; |
80 | using Base::isZero; |
81 | |
82 | template <typename T> |
83 | static bool isZero(const T &, const State & /*state*/) |
84 | { |
85 | /// Careful: apparently this uses SFINAE to redefine isZero for all types |
86 | /// except the IndexType, for which the default ZeroTraits::isZero is used. |
87 | static_assert(!std::is_same_v<typename std::decay<T>::type, typename std::decay<IndexType>::type>); |
88 | return false; |
89 | } |
90 | |
91 | /// Special case when we want to compare with something not in index_column. |
92 | /// When we compare something inside column default keyEquals checks only that row numbers are equal. |
93 | bool keyEquals(const StringRef & object, size_t hash_ [[maybe_unused]], const State & state) const |
94 | { |
95 | auto index = key; |
96 | if constexpr (has_base_index) |
97 | index -= state.base_index; |
98 | |
99 | if constexpr (string_hash) |
100 | return hash_ == (*state.saved_hash_column)[index] && object == state.index_column->getDataAt(index); |
101 | else |
102 | return object == state.index_column->getDataAt(index); |
103 | } |
104 | |
105 | size_t getHash(const Hash & hash) const |
106 | { |
107 | auto index = key; |
108 | |
109 | /// Hack. HashTable is Hash itself. |
110 | const auto & state = static_cast<const State &>(static_cast<const HashTable &>(hash)); |
111 | |
112 | if constexpr (has_base_index) |
113 | index -= state.base_index; |
114 | |
115 | if constexpr (string_hash) |
116 | return (*state.saved_hash_column)[index]; |
117 | else |
118 | { |
119 | using ValueType = typename ColumnType::ValueType; |
120 | ValueType value = unalignedLoad<ValueType>(state.index_column->getDataAt(index).data); |
121 | return DefaultHash<ValueType>()(value); |
122 | } |
123 | } |
124 | }; |
125 | |
126 | |
127 | /** |
128 | * ReverseIndexHashTableBase implements a special hash table interface for |
129 | * reverse index. |
130 | * |
131 | * The following requirements are different compared to a plain hash table: |
132 | * |
133 | * 1) Provide public access to 'hash table state' that contains |
134 | * additional data needed to calculate cell hashes. |
135 | * |
136 | * 2) Support emplace() and find() with a Key different from the resulting |
137 | * hash table key. This means emplace() accepts a different kind of object |
138 | * as a key, and then the real key can be read from the returned cell iterator. |
139 | * |
140 | * These requirements are unique to ReverseIndex and are in conflict with |
141 | * supporting hash tables that use alternative key storage, such as FixedHashMap |
142 | * or StringHashMap. Therefore, we implement an interface for ReverseIndex |
143 | * separately. |
144 | */ |
145 | template <typename Key, typename Cell, typename Hash> |
146 | class ReverseIndexHashTableBase : public HashTable<Key, Cell, Hash, HashTableGrower<>, HashTableAllocator> |
147 | { |
148 | using State = typename Cell::State; |
149 | using Base = HashTable<Key, Cell, Hash, HashTableGrower<>, HashTableAllocator>; |
150 | |
151 | public: |
152 | using Base::Base; |
153 | using iterator = typename Base::iterator; |
154 | using LookupResult = typename Base::LookupResult; |
155 | State & getState() { return *this; } |
156 | |
157 | |
158 | template <typename ObjectToCompareWith> |
159 | size_t ALWAYS_INLINE reverseIndexFindCell(const ObjectToCompareWith & x, |
160 | size_t hash_value, size_t place_value) const |
161 | { |
162 | while (!this->buf[place_value].isZero(*this) |
163 | && !this->buf[place_value].keyEquals(x, hash_value, *this)) |
164 | { |
165 | place_value = this->grower.next(place_value); |
166 | } |
167 | |
168 | return place_value; |
169 | } |
170 | |
171 | template <typename ObjectToCompareWith> |
172 | void ALWAYS_INLINE reverseIndexEmplaceNonZero(const Key & key, LookupResult & it, |
173 | bool & inserted, size_t hash_value, const ObjectToCompareWith & object) |
174 | { |
175 | size_t place_value = reverseIndexFindCell(object, hash_value, |
176 | this->grower.place(hash_value)); |
177 | // emplaceNonZeroImpl() might need to re-find the cell if the table grows, |
178 | // but it will find it correctly by the key alone, so we don't have to |
179 | // pass it the 'object'. |
180 | this->emplaceNonZeroImpl(place_value, key, it, inserted, hash_value); |
181 | } |
182 | |
183 | /// Searches position by object. |
184 | template <typename ObjectToCompareWith> |
185 | void ALWAYS_INLINE reverseIndexEmplace(Key key, iterator & it, bool & inserted, |
186 | size_t hash_value, const ObjectToCompareWith& object) |
187 | { |
188 | LookupResult impl_it = nullptr; |
189 | |
190 | if (!this->emplaceIfZero(key, impl_it, inserted, hash_value)) |
191 | { |
192 | reverseIndexEmplaceNonZero(key, impl_it, inserted, hash_value, object); |
193 | } |
194 | assert(impl_it != nullptr); |
195 | it = iterator(this, impl_it); |
196 | } |
197 | |
198 | template <typename ObjectToCompareWith> |
199 | iterator ALWAYS_INLINE reverseIndexFind(ObjectToCompareWith x, size_t hash_value) |
200 | { |
201 | if (Cell::isZero(x, *this)) |
202 | return this->hasZero() ? this->iteratorToZero() : this->end(); |
203 | |
204 | size_t place_value = reverseIndexFindCell(x, hash_value, |
205 | this->grower.place(hash_value)); |
206 | return !this->buf[place_value].isZero(*this) |
207 | ? iterator(this, &this->buf[place_value]) |
208 | : this->end(); |
209 | } |
210 | }; |
211 | |
212 | template <typename IndexType, typename ColumnType, bool has_base_index> |
213 | class ReverseIndexStringHashTable : public ReverseIndexHashTableBase< |
214 | IndexType, |
215 | ReverseIndexHashTableCell< |
216 | IndexType, |
217 | ReverseIndexHash, |
218 | ReverseIndexStringHashTable<IndexType, ColumnType, has_base_index>, |
219 | ColumnType, |
220 | true, |
221 | has_base_index>, |
222 | ReverseIndexHash> |
223 | { |
224 | using Base = ReverseIndexHashTableBase< |
225 | IndexType, |
226 | ReverseIndexHashTableCell< |
227 | IndexType, |
228 | ReverseIndexHash, |
229 | ReverseIndexStringHashTable<IndexType, ColumnType, has_base_index>, |
230 | ColumnType, |
231 | true, |
232 | has_base_index>, |
233 | ReverseIndexHash>; |
234 | public: |
235 | using Base::Base; |
236 | friend struct ReverseIndexHashTableCell< |
237 | IndexType, |
238 | ReverseIndexHash, |
239 | ReverseIndexStringHashTable<IndexType, ColumnType, has_base_index>, |
240 | ColumnType, |
241 | true, |
242 | has_base_index>; |
243 | }; |
244 | |
245 | template <typename IndexType, typename ColumnType, bool has_base_index> |
246 | class ReverseIndexNumberHashTable : public ReverseIndexHashTableBase< |
247 | IndexType, |
248 | ReverseIndexHashTableCell< |
249 | IndexType, |
250 | ReverseIndexHash, |
251 | ReverseIndexNumberHashTable<IndexType, ColumnType, has_base_index>, |
252 | ColumnType, |
253 | false, |
254 | has_base_index>, |
255 | ReverseIndexHash> |
256 | { |
257 | using Base = ReverseIndexHashTableBase< |
258 | IndexType, |
259 | ReverseIndexHashTableCell< |
260 | IndexType, |
261 | ReverseIndexHash, |
262 | ReverseIndexNumberHashTable<IndexType, ColumnType, has_base_index>, |
263 | ColumnType, |
264 | false, |
265 | has_base_index>, |
266 | ReverseIndexHash>; |
267 | public: |
268 | using Base::Base; |
269 | friend struct ReverseIndexHashTableCell< |
270 | IndexType, |
271 | ReverseIndexHash, |
272 | ReverseIndexNumberHashTable<IndexType, ColumnType, has_base_index>, |
273 | ColumnType, |
274 | false, |
275 | has_base_index>; |
276 | }; |
277 | |
278 | |
279 | template <typename IndexType, typename ColumnType, bool has_base_index, bool is_numeric_column> |
280 | struct SelectReverseIndexHashTable; |
281 | |
282 | template <typename IndexType, typename ColumnType, bool has_base_index> |
283 | struct SelectReverseIndexHashTable<IndexType, ColumnType, has_base_index, true> |
284 | { |
285 | using Type = ReverseIndexNumberHashTable<IndexType, ColumnType, has_base_index>; |
286 | }; |
287 | |
288 | template <typename IndexType, typename ColumnType, bool has_base_index> |
289 | struct SelectReverseIndexHashTable<IndexType, ColumnType, has_base_index, false> |
290 | { |
291 | using Type = ReverseIndexStringHashTable<IndexType, ColumnType, has_base_index>; |
292 | }; |
293 | |
294 | |
295 | template <typename T> |
296 | constexpr bool isNumericColumn(const T *) { return false; } |
297 | |
298 | template <typename T> |
299 | constexpr bool isNumericColumn(const ColumnVector<T> *) { return true; } |
300 | |
301 | static_assert(isNumericColumn(static_cast<ColumnVector<UInt8> *>(nullptr))); |
302 | static_assert(!isNumericColumn(static_cast<ColumnString *>(nullptr))); |
303 | |
304 | |
305 | template <typename IndexType, typename ColumnType, bool has_base_index> |
306 | using ReverseIndexHashTable = typename SelectReverseIndexHashTable<IndexType, ColumnType, has_base_index, |
307 | isNumericColumn(static_cast<ColumnType *>(nullptr))>::Type; |
308 | } |
309 | |
310 | |
311 | template <typename IndexType, typename ColumnType> |
312 | class ReverseIndex |
313 | { |
314 | public: |
315 | explicit ReverseIndex(UInt64 num_prefix_rows_to_skip_, UInt64 base_index_) |
316 | : num_prefix_rows_to_skip(num_prefix_rows_to_skip_), base_index(base_index_), saved_hash_ptr(nullptr) {} |
317 | |
318 | void setColumn(ColumnType * column_); |
319 | |
320 | static constexpr bool is_numeric_column = isNumericColumn(static_cast<ColumnType *>(nullptr)); |
321 | static constexpr bool use_saved_hash = !is_numeric_column; |
322 | |
323 | UInt64 insert(const StringRef & data); |
324 | UInt64 getInsertionPoint(const StringRef & data); |
325 | UInt64 lastInsertionPoint() const { return size() + base_index; } |
326 | |
327 | ColumnType * getColumn() const { return column; } |
328 | size_t size() const; |
329 | |
330 | const UInt64 * tryGetSavedHash() const |
331 | { |
332 | if (!use_saved_hash) |
333 | return nullptr; |
334 | |
335 | UInt64 * ptr = saved_hash_ptr.load(); |
336 | if (!ptr) |
337 | { |
338 | auto hash = calcHashes(); |
339 | ptr = &hash->getData()[0]; |
340 | UInt64 * expected = nullptr; |
341 | if (saved_hash_ptr.compare_exchange_strong(expected, ptr)) |
342 | saved_hash = std::move(hash); |
343 | else |
344 | ptr = expected; |
345 | } |
346 | |
347 | return ptr; |
348 | } |
349 | |
350 | size_t allocatedBytes() const { return index ? index->getBufferSizeInBytes() : 0; } |
351 | |
352 | private: |
353 | ColumnType * column = nullptr; |
354 | UInt64 num_prefix_rows_to_skip; /// The number prefix tows in column which won't be sored at index. |
355 | UInt64 base_index; /// This values will be added to row number which is inserted into index. |
356 | |
357 | using IndexMapType = ReverseIndexHashTable<IndexType, ColumnType, true>; |
358 | |
359 | /// Lazy initialized. |
360 | std::unique_ptr<IndexMapType> index; |
361 | mutable ColumnUInt64::MutablePtr saved_hash; |
362 | mutable std::atomic<UInt64 *> saved_hash_ptr; |
363 | |
364 | void buildIndex(); |
365 | |
366 | UInt64 getHash(const StringRef & ref) const |
367 | { |
368 | if constexpr (is_numeric_column) |
369 | { |
370 | using ValueType = typename ColumnType::ValueType; |
371 | ValueType value = unalignedLoad<ValueType>(ref.data); |
372 | return DefaultHash<ValueType>()(value); |
373 | } |
374 | else |
375 | return StringRefHash()(ref); |
376 | } |
377 | |
378 | ColumnUInt64::MutablePtr calcHashes() const; |
379 | }; |
380 | |
381 | |
382 | template <typename IndexType, typename ColumnType> |
383 | void ReverseIndex<IndexType, ColumnType>:: setColumn(ColumnType * column_) |
384 | { |
385 | if (column != column_) |
386 | { |
387 | index = nullptr; |
388 | saved_hash = nullptr; |
389 | } |
390 | |
391 | column = column_; |
392 | } |
393 | |
394 | template <typename IndexType, typename ColumnType> |
395 | size_t ReverseIndex<IndexType, ColumnType>::size() const |
396 | { |
397 | if (!column) |
398 | throw Exception("ReverseIndex has not size because index column wasn't set." , ErrorCodes::LOGICAL_ERROR); |
399 | |
400 | return column->size(); |
401 | } |
402 | |
403 | template <typename IndexType, typename ColumnType> |
404 | void ReverseIndex<IndexType, ColumnType>::buildIndex() |
405 | { |
406 | if (index) |
407 | return; |
408 | |
409 | if (!column) |
410 | throw Exception("ReverseIndex can't build index because index column wasn't set." , ErrorCodes::LOGICAL_ERROR); |
411 | |
412 | auto size = column->size(); |
413 | index = std::make_unique<IndexMapType>(size); |
414 | |
415 | if constexpr (use_saved_hash) |
416 | saved_hash = calcHashes(); |
417 | |
418 | auto & state = index->getState(); |
419 | state.index_column = column; |
420 | state.base_index = base_index; |
421 | if constexpr (use_saved_hash) |
422 | state.saved_hash_column = &saved_hash->getData(); |
423 | |
424 | using IteratorType = typename IndexMapType::iterator; |
425 | IteratorType iterator; |
426 | bool inserted; |
427 | |
428 | for (auto row : ext::range(num_prefix_rows_to_skip, size)) |
429 | { |
430 | UInt64 hash; |
431 | if constexpr (use_saved_hash) |
432 | hash = saved_hash->getElement(row); |
433 | else |
434 | hash = getHash(column->getDataAt(row)); |
435 | |
436 | index->reverseIndexEmplace(row + base_index, iterator, inserted, hash, column->getDataAt(row)); |
437 | |
438 | if (!inserted) |
439 | throw Exception("Duplicating keys found in ReverseIndex." , ErrorCodes::LOGICAL_ERROR); |
440 | } |
441 | } |
442 | |
443 | template <typename IndexType, typename ColumnType> |
444 | ColumnUInt64::MutablePtr ReverseIndex<IndexType, ColumnType>::calcHashes() const |
445 | { |
446 | if (!column) |
447 | throw Exception("ReverseIndex can't build index because index column wasn't set." , ErrorCodes::LOGICAL_ERROR); |
448 | |
449 | auto size = column->size(); |
450 | auto hash = ColumnUInt64::create(size); |
451 | |
452 | for (auto row : ext::range(0, size)) |
453 | hash->getElement(row) = getHash(column->getDataAt(row)); |
454 | |
455 | return hash; |
456 | } |
457 | |
458 | template <typename IndexType, typename ColumnType> |
459 | UInt64 ReverseIndex<IndexType, ColumnType>::insert(const StringRef & data) |
460 | { |
461 | if (!index) |
462 | buildIndex(); |
463 | |
464 | using IteratorType = typename IndexMapType::iterator; |
465 | IteratorType iterator; |
466 | bool inserted; |
467 | |
468 | auto hash = getHash(data); |
469 | UInt64 num_rows = size(); |
470 | |
471 | if constexpr (use_saved_hash) |
472 | { |
473 | auto & column_data = saved_hash->getData(); |
474 | if (column_data.size() <= num_rows) |
475 | column_data.resize(num_rows + 1); |
476 | column_data[num_rows] = hash; |
477 | } |
478 | else |
479 | column->insertData(data.data, data.size); |
480 | |
481 | index->reverseIndexEmplace(num_rows + base_index, iterator, inserted, hash, data); |
482 | |
483 | if constexpr (use_saved_hash) |
484 | { |
485 | if (inserted) |
486 | column->insertData(data.data, data.size); |
487 | } |
488 | else |
489 | { |
490 | if (!inserted) |
491 | column->popBack(1); |
492 | } |
493 | |
494 | return iterator->getValue(); |
495 | } |
496 | |
497 | template <typename IndexType, typename ColumnType> |
498 | UInt64 ReverseIndex<IndexType, ColumnType>::getInsertionPoint(const StringRef & data) |
499 | { |
500 | if (!index) |
501 | buildIndex(); |
502 | |
503 | using IteratorType = typename IndexMapType::iterator; |
504 | IteratorType iterator; |
505 | |
506 | auto hash = getHash(data); |
507 | iterator = index->reverseIndexFind(data, hash); |
508 | |
509 | return iterator == index->end() ? size() + base_index : iterator->getValue(); |
510 | } |
511 | |
512 | } |
513 | |