1 | #pragma once |
2 | |
3 | #include <string.h> |
4 | |
5 | #include <math.h> |
6 | |
7 | #include <utility> |
8 | |
9 | #include <boost/noncopyable.hpp> |
10 | |
11 | #include <common/likely.h> |
12 | |
13 | #include <Core/Defines.h> |
14 | #include <Core/Types.h> |
15 | #include <Common/Exception.h> |
16 | |
17 | #include <IO/WriteBuffer.h> |
18 | #include <IO/WriteHelpers.h> |
19 | #include <IO/ReadBuffer.h> |
20 | #include <IO/ReadHelpers.h> |
21 | #include <IO/VarInt.h> |
22 | |
23 | #include <Common/HashTable/HashTableAllocator.h> |
24 | #include <Common/HashTable/HashTableKeyHolder.h> |
25 | |
26 | #ifdef DBMS_HASH_MAP_DEBUG_RESIZES |
27 | #include <iostream> |
28 | #include <iomanip> |
29 | #include <Common/Stopwatch.h> |
30 | #endif |
31 | |
32 | /** NOTE HashTable could only be used for memmoveable (position independent) types. |
33 | * Example: std::string is not position independent in libstdc++ with C++11 ABI or in libc++. |
34 | * Also, key in hash table must be of type, that zero bytes is compared equals to zero key. |
35 | */ |
36 | |
37 | |
38 | namespace DB |
39 | { |
40 | namespace ErrorCodes |
41 | { |
42 | extern const int LOGICAL_ERROR; |
43 | extern const int NO_AVAILABLE_DATA; |
44 | } |
45 | } |
46 | |
47 | |
48 | /** The state of the hash table that affects the properties of its cells. |
49 | * Used as a template parameter. |
50 | * For example, there is an implementation of an instantly clearable hash table - ClearableHashMap. |
51 | * For it, each cell holds the version number, and in the hash table itself is the current version. |
52 | * When clearing, the current version simply increases; All cells with a mismatching version are considered empty. |
53 | * Another example: for an approximate calculation of the number of unique visitors, there is a hash table for UniquesHashSet. |
54 | * It has the concept of "degree". At each overflow, cells with keys that do not divide by the corresponding power of the two are deleted. |
55 | */ |
56 | struct HashTableNoState |
57 | { |
58 | /// Serialization, in binary and text form. |
59 | void write(DB::WriteBuffer &) const {} |
60 | void writeText(DB::WriteBuffer &) const {} |
61 | |
62 | /// Deserialization, in binary and text form. |
63 | void read(DB::ReadBuffer &) {} |
64 | void readText(DB::ReadBuffer &) {} |
65 | }; |
66 | |
67 | |
68 | /// These functions can be overloaded for custom types. |
69 | namespace ZeroTraits |
70 | { |
71 | |
72 | template <typename T> |
73 | bool check(const T x) { return x == 0; } |
74 | |
75 | template <typename T> |
76 | void set(T & x) { x = 0; } |
77 | |
78 | } |
79 | |
80 | /** |
81 | * getKey/Mapped -- methods to get key/"mapped" values from the LookupResult returned by find() and |
82 | * emplace() methods of HashTable. Must not be called for a null LookupResult. |
83 | * |
84 | * We don't use iterators for lookup result. Instead, LookupResult is a pointer of some kind. There |
85 | * are methods getKey/Mapped, that return references or values to key/"mapped" values. |
86 | * |
87 | * Different hash table implementations support this interface to a varying degree: |
88 | * |
89 | * 1) Hash tables that store neither the key in its original form, nor a "mapped" value: |
90 | * FixedHashTable or StringHashTable. Neither GetKey nor GetMapped are supported, the only valid |
91 | * operation is checking LookupResult for null. |
92 | * |
93 | * 2) Hash maps that do not store the key, e.g. FixedHashMap or StringHashMap. Only GetMapped is |
94 | * supported. |
95 | * |
96 | * 3) Hash tables that store the key and do not have a "mapped" value, e.g. the normal HashTable. |
97 | * GetKey returns the key, and GetMapped returns a zero void pointer. This simplifies generic |
98 | * code that works with mapped values: it can overload on the return type of GetMapped(), and |
99 | * doesn't need other parameters. One example is insertSetMapped() function. |
100 | * |
101 | * 4) Hash tables that store both the key and the "mapped" value, e.g. HashMap. Both GetKey and |
102 | * GetMapped are supported. |
103 | * |
104 | * The implementation side goes as follows: |
105 | * |
106 | * for (1), LookupResult->getKey = const VoidKey, LookupResult->getMapped = VoidMapped; |
107 | * |
108 | * for (2), LookupResult->getKey = const VoidKey, LookupResult->getMapped = Mapped &; |
109 | * |
110 | * for (3) and (4), LookupResult->getKey = const Key [&], LookupResult->getMapped = Mapped &; |
111 | * VoidKey and VoidMapped may have specialized function overloads for generic code. |
112 | */ |
113 | |
114 | struct VoidKey {}; |
115 | struct VoidMapped |
116 | { |
117 | template <typename T> |
118 | auto & operator=(const T &) |
119 | { |
120 | return *this; |
121 | } |
122 | }; |
123 | |
124 | /** Compile-time interface for cell of the hash table. |
125 | * Different cell types are used to implement different hash tables. |
126 | * The cell must contain a key. |
127 | * It can also contain a value and arbitrary additional data |
128 | * (example: the stored hash value; version number for ClearableHashMap). |
129 | */ |
130 | template <typename Key, typename Hash, typename TState = HashTableNoState> |
131 | struct HashTableCell |
132 | { |
133 | using State = TState; |
134 | |
135 | using key_type = Key; |
136 | using value_type = Key; |
137 | using mapped_type = VoidMapped; |
138 | |
139 | Key key; |
140 | |
141 | HashTableCell() {} |
142 | |
143 | /// Create a cell with the given key / key and value. |
144 | HashTableCell(const Key & key_, const State &) : key(key_) {} |
145 | |
146 | /// Get the key (externally). |
147 | const Key & getKey() const { return key; } |
148 | VoidMapped getMapped() const { return {}; } |
149 | const value_type & getValue() const { return key; } |
150 | |
151 | /// Get the key (internally). |
152 | static const Key & getKey(const value_type & value) { return value; } |
153 | |
154 | /// Are the keys at the cells equal? |
155 | bool keyEquals(const Key & key_) const { return key == key_; } |
156 | bool keyEquals(const Key & key_, size_t /*hash_*/) const { return key == key_; } |
157 | bool keyEquals(const Key & key_, size_t /*hash_*/, const State & /*state*/) const { return key == key_; } |
158 | |
159 | /// If the cell can remember the value of the hash function, then remember it. |
160 | void setHash(size_t /*hash_value*/) {} |
161 | |
162 | /// If the cell can store the hash value in itself, then return the stored value. |
163 | /// It must be at least once calculated before. |
164 | /// If storing the hash value is not provided, then just compute the hash. |
165 | size_t getHash(const Hash & hash) const { return hash(key); } |
166 | |
167 | /// Whether the key is zero. In the main buffer, cells with a zero key are considered empty. |
168 | /// If zero keys can be inserted into the table, then the cell for the zero key is stored separately, not in the main buffer. |
169 | /// Zero keys must be such that the zeroed-down piece of memory is a zero key. |
170 | bool isZero(const State & state) const { return isZero(key, state); } |
171 | static bool isZero(const Key & key, const State & /*state*/) { return ZeroTraits::check(key); } |
172 | |
173 | /// Set the key value to zero. |
174 | void setZero() { ZeroTraits::set(key); } |
175 | |
176 | /// Do the hash table need to store the zero key separately (that is, can a zero key be inserted into the hash table). |
177 | static constexpr bool need_zero_value_storage = true; |
178 | |
179 | /// Whether the cell is deleted. |
180 | bool isDeleted() const { return false; } |
181 | |
182 | /// Set the mapped value, if any (for HashMap), to the corresponding `value`. |
183 | void setMapped(const value_type & /*value*/) {} |
184 | |
185 | /// Serialization, in binary and text form. |
186 | void write(DB::WriteBuffer & wb) const { DB::writeBinary(key, wb); } |
187 | void writeText(DB::WriteBuffer & wb) const { DB::writeDoubleQuoted(key, wb); } |
188 | |
189 | /// Deserialization, in binary and text form. |
190 | void read(DB::ReadBuffer & rb) { DB::readBinary(key, rb); } |
191 | void readText(DB::ReadBuffer & rb) { DB::readDoubleQuoted(key, rb); } |
192 | }; |
193 | |
194 | /** |
195 | * A helper function for HashTable::insert() to set the "mapped" value. |
196 | * Overloaded on the mapped type, does nothing if it's VoidMapped. |
197 | */ |
198 | template <typename ValueType> |
199 | void insertSetMapped(VoidMapped /* dest */, const ValueType & /* src */) {} |
200 | |
201 | template <typename MappedType, typename ValueType> |
202 | void insertSetMapped(MappedType & dest, const ValueType & src) { dest = src.second; } |
203 | |
204 | |
205 | /** Determines the size of the hash table, and when and how much it should be resized. |
206 | */ |
207 | template <size_t initial_size_degree = 8> |
208 | struct HashTableGrower |
209 | { |
210 | /// The state of this structure is enough to get the buffer size of the hash table. |
211 | |
212 | UInt8 size_degree = initial_size_degree; |
213 | |
214 | /// The size of the hash table in the cells. |
215 | size_t bufSize() const { return 1ULL << size_degree; } |
216 | |
217 | size_t maxFill() const { return 1ULL << (size_degree - 1); } |
218 | size_t mask() const { return bufSize() - 1; } |
219 | |
220 | /// From the hash value, get the cell number in the hash table. |
221 | size_t place(size_t x) const { return x & mask(); } |
222 | |
223 | /// The next cell in the collision resolution chain. |
224 | size_t next(size_t pos) const { ++pos; return pos & mask(); } |
225 | |
226 | /// Whether the hash table is sufficiently full. You need to increase the size of the hash table, or remove something unnecessary from it. |
227 | bool overflow(size_t elems) const { return elems > maxFill(); } |
228 | |
229 | /// Increase the size of the hash table. |
230 | void increaseSize() |
231 | { |
232 | size_degree += size_degree >= 23 ? 1 : 2; |
233 | } |
234 | |
235 | /// Set the buffer size by the number of elements in the hash table. Used when deserializing a hash table. |
236 | void set(size_t num_elems) |
237 | { |
238 | size_degree = num_elems <= 1 |
239 | ? initial_size_degree |
240 | : ((initial_size_degree > static_cast<size_t>(log2(num_elems - 1)) + 2) |
241 | ? initial_size_degree |
242 | : (static_cast<size_t>(log2(num_elems - 1)) + 2)); |
243 | } |
244 | |
245 | void setBufSize(size_t buf_size_) |
246 | { |
247 | size_degree = static_cast<size_t>(log2(buf_size_ - 1) + 1); |
248 | } |
249 | }; |
250 | |
251 | |
252 | /** When used as a Grower, it turns a hash table into something like a lookup table. |
253 | * It remains non-optimal - the cells store the keys. |
254 | * Also, the compiler can not completely remove the code of passing through the collision resolution chain, although it is not needed. |
255 | * NOTE: Better to use FixedHashTable instead. |
256 | */ |
257 | template <size_t key_bits> |
258 | struct HashTableFixedGrower |
259 | { |
260 | size_t bufSize() const { return 1ULL << key_bits; } |
261 | size_t place(size_t x) const { return x; } |
262 | /// You could write __builtin_unreachable(), but the compiler does not optimize everything, and it turns out less efficiently. |
263 | size_t next(size_t pos) const { return pos + 1; } |
264 | bool overflow(size_t /*elems*/) const { return false; } |
265 | |
266 | void increaseSize() { __builtin_unreachable(); } |
267 | void set(size_t /*num_elems*/) {} |
268 | void setBufSize(size_t /*buf_size_*/) {} |
269 | }; |
270 | |
271 | |
272 | /** If you want to store the zero key separately - a place to store it. */ |
273 | template <bool need_zero_value_storage, typename Cell> |
274 | struct ZeroValueStorage; |
275 | |
276 | template <typename Cell> |
277 | struct ZeroValueStorage<true, Cell> |
278 | { |
279 | private: |
280 | bool has_zero = false; |
281 | std::aligned_storage_t<sizeof(Cell), alignof(Cell)> zero_value_storage; /// Storage of element with zero key. |
282 | |
283 | public: |
284 | bool hasZero() const { return has_zero; } |
285 | |
286 | void setHasZero() |
287 | { |
288 | has_zero = true; |
289 | new (zeroValue()) Cell(); |
290 | } |
291 | |
292 | void clearHasZero() |
293 | { |
294 | has_zero = false; |
295 | zeroValue()->~Cell(); |
296 | } |
297 | |
298 | Cell * zeroValue() { return reinterpret_cast<Cell*>(&zero_value_storage); } |
299 | const Cell * zeroValue() const { return reinterpret_cast<const Cell*>(&zero_value_storage); } |
300 | }; |
301 | |
302 | template <typename Cell> |
303 | struct ZeroValueStorage<false, Cell> |
304 | { |
305 | bool hasZero() const { return false; } |
306 | void setHasZero() { throw DB::Exception("HashTable: logical error" , DB::ErrorCodes::LOGICAL_ERROR); } |
307 | void clearHasZero() {} |
308 | |
309 | Cell * zeroValue() { return nullptr; } |
310 | const Cell * zeroValue() const { return nullptr; } |
311 | }; |
312 | |
313 | |
314 | template |
315 | < |
316 | typename Key, |
317 | typename Cell, |
318 | typename Hash, |
319 | typename Grower, |
320 | typename Allocator |
321 | > |
322 | class HashTable : |
323 | private boost::noncopyable, |
324 | protected Hash, |
325 | protected Allocator, |
326 | protected Cell::State, |
327 | protected ZeroValueStorage<Cell::need_zero_value_storage, Cell> /// empty base optimization |
328 | { |
329 | protected: |
330 | friend class const_iterator; |
331 | friend class iterator; |
332 | friend class Reader; |
333 | |
334 | template <typename, typename, typename, typename, typename, typename, size_t> |
335 | friend class TwoLevelHashTable; |
336 | |
337 | template <typename, typename, size_t> |
338 | friend class TwoLevelStringHashTable; |
339 | |
340 | template <typename SubMaps> |
341 | friend class StringHashTable; |
342 | |
343 | using HashValue = size_t; |
344 | using Self = HashTable; |
345 | |
346 | size_t m_size = 0; /// Amount of elements |
347 | Cell * buf; /// A piece of memory for all elements except the element with zero key. |
348 | Grower grower; |
349 | |
350 | #ifdef DBMS_HASH_MAP_COUNT_COLLISIONS |
351 | mutable size_t collisions = 0; |
352 | #endif |
353 | |
354 | /// Find a cell with the same key or an empty cell, starting from the specified position and further along the collision resolution chain. |
355 | size_t ALWAYS_INLINE findCell(const Key & x, size_t hash_value, size_t place_value) const |
356 | { |
357 | while (!buf[place_value].isZero(*this) && !buf[place_value].keyEquals(x, hash_value, *this)) |
358 | { |
359 | place_value = grower.next(place_value); |
360 | #ifdef DBMS_HASH_MAP_COUNT_COLLISIONS |
361 | ++collisions; |
362 | #endif |
363 | } |
364 | |
365 | return place_value; |
366 | } |
367 | |
368 | |
369 | /// Find an empty cell, starting with the specified position and further along the collision resolution chain. |
370 | size_t ALWAYS_INLINE findEmptyCell(size_t place_value) const |
371 | { |
372 | while (!buf[place_value].isZero(*this)) |
373 | { |
374 | place_value = grower.next(place_value); |
375 | #ifdef DBMS_HASH_MAP_COUNT_COLLISIONS |
376 | ++collisions; |
377 | #endif |
378 | } |
379 | |
380 | return place_value; |
381 | } |
382 | |
383 | void alloc(const Grower & new_grower) |
384 | { |
385 | buf = reinterpret_cast<Cell *>(Allocator::alloc(new_grower.bufSize() * sizeof(Cell))); |
386 | grower = new_grower; |
387 | } |
388 | |
389 | void free() |
390 | { |
391 | if (buf) |
392 | { |
393 | Allocator::free(buf, getBufferSizeInBytes()); |
394 | buf = nullptr; |
395 | } |
396 | } |
397 | |
398 | |
399 | /// Increase the size of the buffer. |
400 | void resize(size_t for_num_elems = 0, size_t for_buf_size = 0) |
401 | { |
402 | #ifdef DBMS_HASH_MAP_DEBUG_RESIZES |
403 | Stopwatch watch; |
404 | #endif |
405 | |
406 | size_t old_size = grower.bufSize(); |
407 | |
408 | /** In case of exception for the object to remain in the correct state, |
409 | * changing the variable `grower` (which determines the buffer size of the hash table) |
410 | * is postponed for a moment after a real buffer change. |
411 | * The temporary variable `new_grower` is used to determine the new size. |
412 | */ |
413 | Grower new_grower = grower; |
414 | |
415 | if (for_num_elems) |
416 | { |
417 | new_grower.set(for_num_elems); |
418 | if (new_grower.bufSize() <= old_size) |
419 | return; |
420 | } |
421 | else if (for_buf_size) |
422 | { |
423 | new_grower.setBufSize(for_buf_size); |
424 | if (new_grower.bufSize() <= old_size) |
425 | return; |
426 | } |
427 | else |
428 | new_grower.increaseSize(); |
429 | |
430 | /// Expand the space. |
431 | buf = reinterpret_cast<Cell *>(Allocator::realloc(buf, getBufferSizeInBytes(), new_grower.bufSize() * sizeof(Cell))); |
432 | grower = new_grower; |
433 | |
434 | /** Now some items may need to be moved to a new location. |
435 | * The element can stay in place, or move to a new location "on the right", |
436 | * or move to the left of the collision resolution chain, because the elements to the left of it have been moved to the new "right" location. |
437 | */ |
438 | size_t i = 0; |
439 | for (; i < old_size; ++i) |
440 | if (!buf[i].isZero(*this) && !buf[i].isDeleted()) |
441 | reinsert(buf[i], buf[i].getHash(*this)); |
442 | |
443 | /** There is also a special case: |
444 | * if the element was to be at the end of the old buffer, [ x] |
445 | * but is at the beginning because of the collision resolution chain, [o x] |
446 | * then after resizing, it will first be out of place again, [ xo ] |
447 | * and in order to transfer it where necessary, |
448 | * after transferring all the elements from the old halves you need to [ o x ] |
449 | * process tail from the collision resolution chain immediately after it [ o x ] |
450 | */ |
451 | for (; !buf[i].isZero(*this) && !buf[i].isDeleted(); ++i) |
452 | reinsert(buf[i], buf[i].getHash(*this)); |
453 | |
454 | #ifdef DBMS_HASH_MAP_DEBUG_RESIZES |
455 | watch.stop(); |
456 | std::cerr << std::fixed << std::setprecision(3) |
457 | << "Resize from " << old_size << " to " << grower.bufSize() << " took " << watch.elapsedSeconds() << " sec." |
458 | << std::endl; |
459 | #endif |
460 | } |
461 | |
462 | |
463 | /** Paste into the new buffer the value that was in the old buffer. |
464 | * Used when increasing the buffer size. |
465 | */ |
466 | void reinsert(Cell & x, size_t hash_value) |
467 | { |
468 | size_t place_value = grower.place(hash_value); |
469 | |
470 | /// If the element is in its place. |
471 | if (&x == &buf[place_value]) |
472 | return; |
473 | |
474 | /// Compute a new location, taking into account the collision resolution chain. |
475 | place_value = findCell(Cell::getKey(x.getValue()), hash_value, place_value); |
476 | |
477 | /// If the item remains in its place in the old collision resolution chain. |
478 | if (!buf[place_value].isZero(*this)) |
479 | return; |
480 | |
481 | /// Copy to a new location and zero the old one. |
482 | x.setHash(hash_value); |
483 | memcpy(static_cast<void*>(&buf[place_value]), &x, sizeof(x)); |
484 | x.setZero(); |
485 | |
486 | /// Then the elements that previously were in collision with this can move to the old place. |
487 | } |
488 | |
489 | |
490 | void destroyElements() |
491 | { |
492 | if (!std::is_trivially_destructible_v<Cell>) |
493 | for (iterator it = begin(), it_end = end(); it != it_end; ++it) |
494 | it.ptr->~Cell(); |
495 | } |
496 | |
497 | |
498 | template <typename Derived, bool is_const> |
499 | class iterator_base |
500 | { |
501 | using Container = std::conditional_t<is_const, const Self, Self>; |
502 | using cell_type = std::conditional_t<is_const, const Cell, Cell>; |
503 | |
504 | Container * container; |
505 | cell_type * ptr; |
506 | |
507 | friend class HashTable; |
508 | |
509 | public: |
510 | iterator_base() {} |
511 | iterator_base(Container * container_, cell_type * ptr_) : container(container_), ptr(ptr_) {} |
512 | |
513 | bool operator== (const iterator_base & rhs) const { return ptr == rhs.ptr; } |
514 | bool operator!= (const iterator_base & rhs) const { return ptr != rhs.ptr; } |
515 | |
516 | Derived & operator++() |
517 | { |
518 | /// If iterator was pointed to ZeroValueStorage, move it to the beginning of the main buffer. |
519 | if (unlikely(ptr->isZero(*container))) |
520 | ptr = container->buf; |
521 | else |
522 | ++ptr; |
523 | |
524 | /// Skip empty cells in the main buffer. |
525 | auto buf_end = container->buf + container->grower.bufSize(); |
526 | while (ptr < buf_end && ptr->isZero(*container)) |
527 | ++ptr; |
528 | |
529 | return static_cast<Derived &>(*this); |
530 | } |
531 | |
532 | auto & operator* () const { return *ptr; } |
533 | auto * operator->() const { return ptr; } |
534 | |
535 | auto getPtr() const { return ptr; } |
536 | size_t getHash() const { return ptr->getHash(*container); } |
537 | |
538 | size_t getCollisionChainLength() const |
539 | { |
540 | return container->grower.place((ptr - container->buf) - container->grower.place(getHash())); |
541 | } |
542 | |
543 | /** |
544 | * A hack for HashedDictionary. |
545 | * |
546 | * The problem: std-like find() returns an iterator, which has to be |
547 | * compared to end(). On the other hand, HashMap::find() returns |
548 | * LookupResult, which is compared to nullptr. HashedDictionary has to |
549 | * support both hash maps with the same code, hence the need for this |
550 | * hack. |
551 | * |
552 | * The proper way would be to remove iterator interface from our |
553 | * HashMap completely, change all its users to the existing internal |
554 | * iteration interface, and redefine end() to return LookupResult for |
555 | * compatibility with std find(). Unfortunately, now is not the time to |
556 | * do this. |
557 | */ |
558 | operator Cell * () const { return nullptr; } |
559 | }; |
560 | |
561 | |
562 | public: |
563 | using key_type = Key; |
564 | using mapped_type = typename Cell::mapped_type; |
565 | using value_type = typename Cell::value_type; |
566 | using cell_type = Cell; |
567 | |
568 | using LookupResult = Cell *; |
569 | using ConstLookupResult = const Cell *; |
570 | |
571 | size_t hash(const Key & x) const { return Hash::operator()(x); } |
572 | |
573 | |
574 | HashTable() |
575 | { |
576 | if (Cell::need_zero_value_storage) |
577 | this->zeroValue()->setZero(); |
578 | alloc(grower); |
579 | } |
580 | |
581 | HashTable(size_t reserve_for_num_elements) |
582 | { |
583 | if (Cell::need_zero_value_storage) |
584 | this->zeroValue()->setZero(); |
585 | grower.set(reserve_for_num_elements); |
586 | alloc(grower); |
587 | } |
588 | |
589 | HashTable(HashTable && rhs) |
590 | : buf(nullptr) |
591 | { |
592 | *this = std::move(rhs); |
593 | } |
594 | |
595 | ~HashTable() |
596 | { |
597 | destroyElements(); |
598 | free(); |
599 | } |
600 | |
601 | HashTable & operator= (HashTable && rhs) |
602 | { |
603 | destroyElements(); |
604 | free(); |
605 | |
606 | std::swap(buf, rhs.buf); |
607 | std::swap(m_size, rhs.m_size); |
608 | std::swap(grower, rhs.grower); |
609 | |
610 | Hash::operator=(std::move(rhs)); |
611 | Allocator::operator=(std::move(rhs)); |
612 | Cell::State::operator=(std::move(rhs)); |
613 | ZeroValueStorage<Cell::need_zero_value_storage, Cell>::operator=(std::move(rhs)); |
614 | |
615 | return *this; |
616 | } |
617 | |
618 | class Reader final : private Cell::State |
619 | { |
620 | public: |
621 | Reader(DB::ReadBuffer & in_) |
622 | : in(in_) |
623 | { |
624 | } |
625 | |
626 | Reader(const Reader &) = delete; |
627 | Reader & operator=(const Reader &) = delete; |
628 | |
629 | bool next() |
630 | { |
631 | if (!is_initialized) |
632 | { |
633 | Cell::State::read(in); |
634 | DB::readVarUInt(size, in); |
635 | is_initialized = true; |
636 | } |
637 | |
638 | if (read_count == size) |
639 | { |
640 | is_eof = true; |
641 | return false; |
642 | } |
643 | |
644 | cell.read(in); |
645 | ++read_count; |
646 | |
647 | return true; |
648 | } |
649 | |
650 | inline const value_type & get() const |
651 | { |
652 | if (!is_initialized || is_eof) |
653 | throw DB::Exception("No available data" , DB::ErrorCodes::NO_AVAILABLE_DATA); |
654 | |
655 | return cell.getValue(); |
656 | } |
657 | |
658 | private: |
659 | DB::ReadBuffer & in; |
660 | Cell cell; |
661 | size_t read_count = 0; |
662 | size_t size = 0; |
663 | bool is_eof = false; |
664 | bool is_initialized = false; |
665 | }; |
666 | |
667 | |
668 | class iterator : public iterator_base<iterator, false> |
669 | { |
670 | public: |
671 | using iterator_base<iterator, false>::iterator_base; |
672 | }; |
673 | |
674 | class const_iterator : public iterator_base<const_iterator, true> |
675 | { |
676 | public: |
677 | using iterator_base<const_iterator, true>::iterator_base; |
678 | }; |
679 | |
680 | |
681 | const_iterator begin() const |
682 | { |
683 | if (!buf) |
684 | return end(); |
685 | |
686 | if (this->hasZero()) |
687 | return iteratorToZero(); |
688 | |
689 | const Cell * ptr = buf; |
690 | auto buf_end = buf + grower.bufSize(); |
691 | while (ptr < buf_end && ptr->isZero(*this)) |
692 | ++ptr; |
693 | |
694 | return const_iterator(this, ptr); |
695 | } |
696 | |
697 | const_iterator cbegin() const { return begin(); } |
698 | |
699 | iterator begin() |
700 | { |
701 | if (!buf) |
702 | return end(); |
703 | |
704 | if (this->hasZero()) |
705 | return iteratorToZero(); |
706 | |
707 | Cell * ptr = buf; |
708 | auto buf_end = buf + grower.bufSize(); |
709 | while (ptr < buf_end && ptr->isZero(*this)) |
710 | ++ptr; |
711 | |
712 | return iterator(this, ptr); |
713 | } |
714 | |
715 | const_iterator end() const { return const_iterator(this, buf + grower.bufSize()); } |
716 | const_iterator cend() const { return end(); } |
717 | iterator end() { return iterator(this, buf + grower.bufSize()); } |
718 | |
719 | |
720 | protected: |
721 | const_iterator iteratorTo(const Cell * ptr) const { return const_iterator(this, ptr); } |
722 | iterator iteratorTo(Cell * ptr) { return iterator(this, ptr); } |
723 | const_iterator iteratorToZero() const { return iteratorTo(this->zeroValue()); } |
724 | iterator iteratorToZero() { return iteratorTo(this->zeroValue()); } |
725 | |
726 | |
727 | /// If the key is zero, insert it into a special place and return true. |
728 | /// We don't have to persist a zero key, because it's not actually inserted. |
729 | /// That's why we just take a Key by value, an not a key holder. |
730 | bool ALWAYS_INLINE emplaceIfZero(const Key & x, LookupResult & it, bool & inserted, size_t hash_value) |
731 | { |
732 | /// If it is claimed that the zero key can not be inserted into the table. |
733 | if (!Cell::need_zero_value_storage) |
734 | return false; |
735 | |
736 | if (Cell::isZero(x, *this)) |
737 | { |
738 | it = this->zeroValue(); |
739 | |
740 | if (!this->hasZero()) |
741 | { |
742 | ++m_size; |
743 | this->setHasZero(); |
744 | this->zeroValue()->setHash(hash_value); |
745 | inserted = true; |
746 | } |
747 | else |
748 | inserted = false; |
749 | |
750 | return true; |
751 | } |
752 | |
753 | return false; |
754 | } |
755 | |
756 | template <typename KeyHolder> |
757 | void ALWAYS_INLINE emplaceNonZeroImpl(size_t place_value, KeyHolder && key_holder, |
758 | LookupResult & it, bool & inserted, size_t hash_value) |
759 | { |
760 | it = &buf[place_value]; |
761 | |
762 | if (!buf[place_value].isZero(*this)) |
763 | { |
764 | keyHolderDiscardKey(key_holder); |
765 | inserted = false; |
766 | return; |
767 | } |
768 | |
769 | keyHolderPersistKey(key_holder); |
770 | const auto & key = keyHolderGetKey(key_holder); |
771 | |
772 | new (&buf[place_value]) Cell(key, *this); |
773 | buf[place_value].setHash(hash_value); |
774 | inserted = true; |
775 | ++m_size; |
776 | |
777 | if (unlikely(grower.overflow(m_size))) |
778 | { |
779 | try |
780 | { |
781 | resize(); |
782 | } |
783 | catch (...) |
784 | { |
785 | /** If we have not resized successfully, then there will be problems. |
786 | * There remains a key, but uninitialized mapped-value, |
787 | * which, perhaps, can not even be called a destructor. |
788 | */ |
789 | --m_size; |
790 | buf[place_value].setZero(); |
791 | throw; |
792 | } |
793 | |
794 | // The hash table was rehashed, so we have to re-find the key. |
795 | size_t new_place = findCell(key, hash_value, grower.place(hash_value)); |
796 | assert(!buf[new_place].isZero(*this)); |
797 | it = &buf[new_place]; |
798 | } |
799 | } |
800 | |
801 | /// Only for non-zero keys. Find the right place, insert the key there, if it does not already exist. Set iterator to the cell in output parameter. |
802 | template <typename KeyHolder> |
803 | void ALWAYS_INLINE emplaceNonZero(KeyHolder && key_holder, LookupResult & it, |
804 | bool & inserted, size_t hash_value) |
805 | { |
806 | const auto & key = keyHolderGetKey(key_holder); |
807 | size_t place_value = findCell(key, hash_value, grower.place(hash_value)); |
808 | emplaceNonZeroImpl(place_value, key_holder, it, inserted, hash_value); |
809 | } |
810 | |
811 | |
812 | public: |
813 | /// Insert a value. In the case of any more complex values, it is better to use the `emplace` function. |
814 | std::pair<LookupResult, bool> ALWAYS_INLINE insert(const value_type & x) |
815 | { |
816 | std::pair<LookupResult, bool> res; |
817 | |
818 | size_t hash_value = hash(Cell::getKey(x)); |
819 | if (!emplaceIfZero(Cell::getKey(x), res.first, res.second, hash_value)) |
820 | { |
821 | emplaceNonZero(Cell::getKey(x), res.first, res.second, hash_value); |
822 | } |
823 | |
824 | if (res.second) |
825 | insertSetMapped(res.first->getMapped(), x); |
826 | |
827 | return res; |
828 | } |
829 | |
830 | |
831 | /// Reinsert node pointed to by iterator |
832 | void ALWAYS_INLINE reinsert(iterator & it, size_t hash_value) |
833 | { |
834 | reinsert(*it.getPtr(), hash_value); |
835 | } |
836 | |
837 | |
838 | /** Insert the key. |
839 | * Return values: |
840 | * 'it' -- a LookupResult pointing to the corresponding key/mapped pair. |
841 | * 'inserted' -- whether a new key was inserted. |
842 | * |
843 | * You have to make `placement new` of value if you inserted a new key, |
844 | * since when destroying a hash table, it will call the destructor! |
845 | * |
846 | * Example usage: |
847 | * |
848 | * Map::LookupResult it; |
849 | * bool inserted; |
850 | * map.emplace(key, it, inserted); |
851 | * if (inserted) |
852 | * new (&it->getMapped()) Mapped(value); |
853 | */ |
854 | template <typename KeyHolder> |
855 | void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it, bool & inserted) |
856 | { |
857 | const auto & key = keyHolderGetKey(key_holder); |
858 | emplace(key_holder, it, inserted, hash(key)); |
859 | } |
860 | |
861 | template <typename KeyHolder> |
862 | void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it, |
863 | bool & inserted, size_t hash_value) |
864 | { |
865 | const auto & key = keyHolderGetKey(key_holder); |
866 | if (!emplaceIfZero(key, it, inserted, hash_value)) |
867 | emplaceNonZero(key_holder, it, inserted, hash_value); |
868 | } |
869 | |
870 | /// Copy the cell from another hash table. It is assumed that the cell is not zero, and also that there was no such key in the table yet. |
871 | void ALWAYS_INLINE insertUniqueNonZero(const Cell * cell, size_t hash_value) |
872 | { |
873 | size_t place_value = findEmptyCell(grower.place(hash_value)); |
874 | |
875 | memcpy(static_cast<void*>(&buf[place_value]), cell, sizeof(*cell)); |
876 | ++m_size; |
877 | |
878 | if (unlikely(grower.overflow(m_size))) |
879 | resize(); |
880 | } |
881 | |
882 | LookupResult ALWAYS_INLINE find(const Key & x) |
883 | { |
884 | if (Cell::isZero(x, *this)) |
885 | return this->hasZero() ? this->zeroValue() : nullptr; |
886 | |
887 | size_t hash_value = hash(x); |
888 | size_t place_value = findCell(x, hash_value, grower.place(hash_value)); |
889 | return !buf[place_value].isZero(*this) ? &buf[place_value] : nullptr; |
890 | } |
891 | |
892 | ConstLookupResult ALWAYS_INLINE find(const Key & x) const |
893 | { |
894 | return const_cast<std::decay_t<decltype(*this)> *>(this)->find(x); |
895 | } |
896 | |
897 | LookupResult ALWAYS_INLINE find(const Key & x, size_t hash_value) |
898 | { |
899 | if (Cell::isZero(x, *this)) |
900 | return this->hasZero() ? this->zeroValue() : nullptr; |
901 | |
902 | size_t place_value = findCell(x, hash_value, grower.place(hash_value)); |
903 | return !buf[place_value].isZero(*this) ? &buf[place_value] : nullptr; |
904 | } |
905 | |
906 | ConstLookupResult ALWAYS_INLINE find(const Key & x, size_t hash_value) const |
907 | { |
908 | return const_cast<std::decay_t<decltype(*this)> *>(this)->find(x, hash_value); |
909 | } |
910 | |
911 | bool ALWAYS_INLINE has(const Key & x) const |
912 | { |
913 | if (Cell::isZero(x, *this)) |
914 | return this->hasZero(); |
915 | |
916 | size_t hash_value = hash(x); |
917 | size_t place_value = findCell(x, hash_value, grower.place(hash_value)); |
918 | return !buf[place_value].isZero(*this); |
919 | } |
920 | |
921 | |
922 | bool ALWAYS_INLINE has(const Key & x, size_t hash_value) const |
923 | { |
924 | if (Cell::isZero(x, *this)) |
925 | return this->hasZero(); |
926 | |
927 | size_t place_value = findCell(x, hash_value, grower.place(hash_value)); |
928 | return !buf[place_value].isZero(*this); |
929 | } |
930 | |
931 | |
932 | void write(DB::WriteBuffer & wb) const |
933 | { |
934 | Cell::State::write(wb); |
935 | DB::writeVarUInt(m_size, wb); |
936 | |
937 | if (this->hasZero()) |
938 | this->zeroValue()->write(wb); |
939 | |
940 | for (auto ptr = buf, buf_end = buf + grower.bufSize(); ptr < buf_end; ++ptr) |
941 | if (!ptr->isZero(*this)) |
942 | ptr->write(wb); |
943 | } |
944 | |
945 | void writeText(DB::WriteBuffer & wb) const |
946 | { |
947 | Cell::State::writeText(wb); |
948 | DB::writeText(m_size, wb); |
949 | |
950 | if (this->hasZero()) |
951 | { |
952 | DB::writeChar(',', wb); |
953 | this->zeroValue()->writeText(wb); |
954 | } |
955 | |
956 | for (auto ptr = buf, buf_end = buf + grower.bufSize(); ptr < buf_end; ++ptr) |
957 | { |
958 | if (!ptr->isZero(*this)) |
959 | { |
960 | DB::writeChar(',', wb); |
961 | ptr->writeText(wb); |
962 | } |
963 | } |
964 | } |
965 | |
966 | void read(DB::ReadBuffer & rb) |
967 | { |
968 | Cell::State::read(rb); |
969 | |
970 | destroyElements(); |
971 | this->clearHasZero(); |
972 | m_size = 0; |
973 | |
974 | size_t new_size = 0; |
975 | DB::readVarUInt(new_size, rb); |
976 | |
977 | free(); |
978 | Grower new_grower = grower; |
979 | new_grower.set(new_size); |
980 | alloc(new_grower); |
981 | |
982 | for (size_t i = 0; i < new_size; ++i) |
983 | { |
984 | Cell x; |
985 | x.read(rb); |
986 | insert(Cell::getKey(x.getValue())); |
987 | } |
988 | } |
989 | |
990 | void readText(DB::ReadBuffer & rb) |
991 | { |
992 | Cell::State::readText(rb); |
993 | |
994 | destroyElements(); |
995 | this->clearHasZero(); |
996 | m_size = 0; |
997 | |
998 | size_t new_size = 0; |
999 | DB::readText(new_size, rb); |
1000 | |
1001 | free(); |
1002 | Grower new_grower = grower; |
1003 | new_grower.set(new_size); |
1004 | alloc(new_grower); |
1005 | |
1006 | for (size_t i = 0; i < new_size; ++i) |
1007 | { |
1008 | Cell x; |
1009 | DB::assertChar(',', rb); |
1010 | x.readText(rb); |
1011 | insert(Cell::getKey(x.getValue())); |
1012 | } |
1013 | } |
1014 | |
1015 | |
1016 | size_t size() const |
1017 | { |
1018 | return m_size; |
1019 | } |
1020 | |
1021 | bool empty() const |
1022 | { |
1023 | return 0 == m_size; |
1024 | } |
1025 | |
1026 | void clear() |
1027 | { |
1028 | destroyElements(); |
1029 | this->clearHasZero(); |
1030 | m_size = 0; |
1031 | |
1032 | memset(static_cast<void*>(buf), 0, grower.bufSize() * sizeof(*buf)); |
1033 | } |
1034 | |
1035 | /// After executing this function, the table can only be destroyed, |
1036 | /// and also you can use the methods `size`, `empty`, `begin`, `end`. |
1037 | void clearAndShrink() |
1038 | { |
1039 | destroyElements(); |
1040 | this->clearHasZero(); |
1041 | m_size = 0; |
1042 | free(); |
1043 | } |
1044 | |
1045 | size_t getBufferSizeInBytes() const |
1046 | { |
1047 | return grower.bufSize() * sizeof(Cell); |
1048 | } |
1049 | |
1050 | size_t getBufferSizeInCells() const |
1051 | { |
1052 | return grower.bufSize(); |
1053 | } |
1054 | |
1055 | #ifdef DBMS_HASH_MAP_COUNT_COLLISIONS |
1056 | size_t getCollisions() const |
1057 | { |
1058 | return collisions; |
1059 | } |
1060 | #endif |
1061 | }; |
1062 | |