HashTable.h source code [ClickHouse/dbms/src/Common/HashTable/HashTable.h]

1	#pragma once
2
3	#include <string.h>
4
5	#include <math.h>
6
7	#include <utility>
8
9	#include <boost/noncopyable.hpp>
10
11	#include <common/likely.h>
12
13	#include <Core/Defines.h>
14	#include <Core/Types.h>
15	#include <Common/Exception.h>
16
17	#include <IO/WriteBuffer.h>
18	#include <IO/WriteHelpers.h>
19	#include <IO/ReadBuffer.h>
20	#include <IO/ReadHelpers.h>
21	#include <IO/VarInt.h>
22
23	#include <Common/HashTable/HashTableAllocator.h>
24	#include <Common/HashTable/HashTableKeyHolder.h>
25
26	#ifdef DBMS_HASH_MAP_DEBUG_RESIZES
27	#include <iostream>
28	#include <iomanip>
29	#include <Common/Stopwatch.h>
30	#endif
31
32	/* NOTE HashTable could only be used for memmoveable (position independent) types.*
33	* Example: std::string is not position independent in libstdc++ with C++11 ABI or in libc++.
34	* Also, key in hash table must be of type, that zero bytes is compared equals to zero key.
35	*/
36
37
38	namespace DB
39	{
40	namespace ErrorCodes
41	{
42	extern const int LOGICAL_ERROR;
43	extern const int NO_AVAILABLE_DATA;
44	}
45	}
46
47
48	/* The state of the hash table that affects the properties of its cells.*
49	* Used as a template parameter.
50	* For example, there is an implementation of an instantly clearable hash table - ClearableHashMap.
51	* For it, each cell holds the version number, and in the hash table itself is the current version.
52	* When clearing, the current version simply increases; All cells with a mismatching version are considered empty.
53	* Another example: for an approximate calculation of the number of unique visitors, there is a hash table for UniquesHashSet.
54	* It has the concept of "degree". At each overflow, cells with keys that do not divide by the corresponding power of the two are deleted.
55	*/
56	struct HashTableNoState
57	{
58	/// Serialization, in binary and text form.
59	void write(DB::WriteBuffer &) const {}
60	void writeText(DB::WriteBuffer &) const {}
61
62	/// Deserialization, in binary and text form.
63	void read(DB::ReadBuffer &) {}
64	void readText(DB::ReadBuffer &) {}
65	};
66
67
68	/// These functions can be overloaded for custom types.
69	namespace ZeroTraits
70	{
71
72	template <typename T>
73	bool check(const T x) { return x == `0`; }
74
75	template <typename T>
76	void set(T & x) { x = `0`; }
77
78	}
79
80	/**
81	* getKey/Mapped -- methods to get key/"mapped" values from the LookupResult returned by find() and
82	* emplace() methods of HashTable. Must not be called for a null LookupResult.
83	*
84	* We don't use iterators for lookup result. Instead, LookupResult is a pointer of some kind. There
85	* are methods getKey/Mapped, that return references or values to key/"mapped" values.
86	*
87	* Different hash table implementations support this interface to a varying degree:
88	*
89	* 1) Hash tables that store neither the key in its original form, nor a "mapped" value:
90	* FixedHashTable or StringHashTable. Neither GetKey nor GetMapped are supported, the only valid
91	* operation is checking LookupResult for null.
92	*
93	* 2) Hash maps that do not store the key, e.g. FixedHashMap or StringHashMap. Only GetMapped is
94	* supported.
95	*
96	* 3) Hash tables that store the key and do not have a "mapped" value, e.g. the normal HashTable.
97	* GetKey returns the key, and GetMapped returns a zero void pointer. This simplifies generic
98	* code that works with mapped values: it can overload on the return type of GetMapped(), and
99	* doesn't need other parameters. One example is insertSetMapped() function.
100	*
101	* 4) Hash tables that store both the key and the "mapped" value, e.g. HashMap. Both GetKey and
102	* GetMapped are supported.
103	*
104	* The implementation side goes as follows:
105	*
106	* for (1), LookupResult->getKey = const VoidKey, LookupResult->getMapped = VoidMapped;
107	*
108	* for (2), LookupResult->getKey = const VoidKey, LookupResult->getMapped = Mapped &;
109	*
110	* for (3) and (4), LookupResult->getKey = const Key [&], LookupResult->getMapped = Mapped &;
111	* VoidKey and VoidMapped may have specialized function overloads for generic code.
112	*/
113
114	struct VoidKey {};
115	struct VoidMapped
116	{
117	template <typename T>
118	auto & operator=(const T &)
119	{
120	return *this;
121	}
122	};
123
124	/* Compile-time interface for cell of the hash table.*
125	* Different cell types are used to implement different hash tables.
126	* The cell must contain a key.
127	* It can also contain a value and arbitrary additional data
128	* (example: the stored hash value; version number for ClearableHashMap).
129	*/
130	template <typename Key, typename Hash, typename TState = HashTableNoState>
131	struct HashTableCell
132	{
133	using State = TState;
134
135	using key_type = Key;
136	using value_type = Key;
137	using mapped_type = VoidMapped;
138
139	Key key;
140
141	HashTableCell() {}
142
143	/// Create a cell with the given key / key and value.
144	HashTableCell(const Key & key_, const State &) : key(key_) {}
145
146	/// Get the key (externally).
147	const Key & getKey() const { return key; }
148	VoidMapped getMapped() const { return {}; }
149	const value_type & getValue() const { return key; }
150
151	/// Get the key (internally).
152	static const Key & getKey(const value_type & value) { return value; }
153
154	/// Are the keys at the cells equal?
155	bool keyEquals(const Key & key_) const { return key == key_; }
156	bool keyEquals(const Key & key_, size_t /hash_/) const { return key == key_; }
157	bool keyEquals(const Key & key_, size_t /hash_/, const State & /state/) const { return key == key_; }
158
159	/// If the cell can remember the value of the hash function, then remember it.
160	void setHash(size_t /hash_value/) {}
161
162	/// If the cell can store the hash value in itself, then return the stored value.
163	/// It must be at least once calculated before.
164	/// If storing the hash value is not provided, then just compute the hash.
165	size_t getHash(const Hash & hash) const { return hash(key); }
166
167	/// Whether the key is zero. In the main buffer, cells with a zero key are considered empty.
168	/// If zero keys can be inserted into the table, then the cell for the zero key is stored separately, not in the main buffer.
169	/// Zero keys must be such that the zeroed-down piece of memory is a zero key.
170	bool isZero(const State & state) const { return isZero(key, state); }
171	static bool isZero(const Key & key, const State & /state/) { return ZeroTraits::check(key); }
172
173	/// Set the key value to zero.
174	void setZero() { ZeroTraits::set(key); }
175
176	/// Do the hash table need to store the zero key separately (that is, can a zero key be inserted into the hash table).
177	static constexpr bool need_zero_value_storage = true;
178
179	/// Whether the cell is deleted.
180	bool isDeleted() const { return false; }
181
182	/// Set the mapped value, if any (for HashMap), to the corresponding `value`.
183	void setMapped(const value_type & /value/) {}
184
185	/// Serialization, in binary and text form.
186	void write(DB::WriteBuffer & wb) const { DB::writeBinary(key, wb); }
187	void writeText(DB::WriteBuffer & wb) const { DB::writeDoubleQuoted(key, wb); }
188
189	/// Deserialization, in binary and text form.
190	void read(DB::ReadBuffer & rb) { DB::readBinary(key, rb); }
191	void readText(DB::ReadBuffer & rb) { DB::readDoubleQuoted(key, rb); }
192	};
193
194	/**
195	* A helper function for HashTable::insert() to set the "mapped" value.
196	* Overloaded on the mapped type, does nothing if it's VoidMapped.
197	*/
198	template <typename ValueType>
199	void insertSetMapped(VoidMapped / dest /, const ValueType & / src /) {}
200
201	template <typename MappedType, typename ValueType>
202	void insertSetMapped(MappedType & dest, const ValueType & src) { dest = src.second; }
203
204
205	/* Determines the size of the hash table, and when and how much it should be resized.*
206	*/
207	template <size_t initial_size_degree = `8`>
208	struct HashTableGrower
209	{
210	/// The state of this structure is enough to get the buffer size of the hash table.
211
212	UInt8 size_degree = initial_size_degree;
213
214	/// The size of the hash table in the cells.
215	size_t bufSize() const { return `1ULL` << size_degree; }
216
217	size_t maxFill() const { return `1ULL` << (size_degree - `1`); }
218	size_t mask() const { return bufSize() - `1`; }
219
220	/// From the hash value, get the cell number in the hash table.
221	size_t place(size_t x) const { return x & mask(); }
222
223	/// The next cell in the collision resolution chain.
224	size_t next(size_t pos) const { ++pos; return pos & mask(); }
225
226	/// Whether the hash table is sufficiently full. You need to increase the size of the hash table, or remove something unnecessary from it.
227	bool overflow(size_t elems) const { return elems > maxFill(); }
228
229	/// Increase the size of the hash table.
230	void increaseSize()
231	{
232	size_degree += size_degree >= `23` ? `1` : `2`;
233	}
234
235	/// Set the buffer size by the number of elements in the hash table. Used when deserializing a hash table.
236	void set(size_t num_elems)
237	{
238	size_degree = num_elems <= `1`
239	? initial_size_degree
240	: ((initial_size_degree > static_cast<size_t>(log2(num_elems - `1`)) + `2`)
241	? initial_size_degree
242	: (static_cast<size_t>(log2(num_elems - `1`)) + `2`));
243	}
244
245	void setBufSize(size_t buf_size_)
246	{
247	size_degree = static_cast<size_t>(log2(buf_size_ - `1`) + `1`);
248	}
249	};
250
251
252	/* When used as a Grower, it turns a hash table into something like a lookup table.*
253	* It remains non-optimal - the cells store the keys.
254	* Also, the compiler can not completely remove the code of passing through the collision resolution chain, although it is not needed.
255	* NOTE: Better to use FixedHashTable instead.
256	*/
257	template <size_t key_bits>
258	struct HashTableFixedGrower
259	{
260	size_t bufSize() const { return `1ULL` << key_bits; }
261	size_t place(size_t x) const { return x; }
262	/// You could write __builtin_unreachable(), but the compiler does not optimize everything, and it turns out less efficiently.
263	size_t next(size_t pos) const { return pos + `1`; }
264	bool overflow(size_t /elems/) const { return false; }
265
266	void increaseSize() { __builtin_unreachable(); }
267	void set(size_t /num_elems/) {}
268	void setBufSize(size_t /buf_size_/) {}
269	};
270
271
272	/* If you want to store the zero key separately - a place to store it. /
273	template <bool need_zero_value_storage, typename Cell>
274	struct ZeroValueStorage;
275
276	template <typename Cell>
277	struct ZeroValueStorage<true, Cell>
278	{
279	private:
280	bool has_zero = false;
281	std::aligned_storage_t<sizeof(Cell), alignof(Cell)> zero_value_storage; /// Storage of element with zero key.
282
283	public:
284	bool hasZero() const { return has_zero; }
285
286	void setHasZero()
287	{
288	has_zero = true;
289	new (zeroValue()) Cell();
290	}
291
292	void clearHasZero()
293	{
294	has_zero = false;
295	zeroValue()->~Cell();
296	}
297
298	Cell * zeroValue() { return reinterpret_cast<Cell*>(&zero_value_storage); }
299	const Cell * zeroValue() const { return reinterpret_cast<const Cell*>(&zero_value_storage); }
300	};
301
302	template <typename Cell>
303	struct ZeroValueStorage<false, Cell>
304	{
305	bool hasZero() const { return false; }
306	void setHasZero() { throw DB::Exception ("HashTable: logical error", DB::ErrorCodes::LOGICAL_ERROR); }
307	void clearHasZero() {}
308
309	Cell * zeroValue() { return nullptr; }
310	const Cell * zeroValue() const { return nullptr; }
311	};
312
313
314	template
315	<
316	typename Key,
317	typename Cell,
318	typename Hash,
319	typename Grower,
320	typename Allocator
321	>
322	class HashTable :
323	private boost::noncopyable,
324	protected Hash,
325	protected Allocator,
326	protected Cell::State,
327	protected ZeroValueStorage<Cell::need_zero_value_storage, Cell> /// empty base optimization
328	{
329	protected:
330	friend class const_iterator;
331	friend class iterator;
332	friend class Reader;
333
334	template <typename, typename, typename, typename, typename, typename, size_t>
335	friend class TwoLevelHashTable;
336
337	template <typename, typename, size_t>
338	friend class TwoLevelStringHashTable;
339
340	template <typename SubMaps>
341	friend class StringHashTable;
342
343	using HashValue = size_t;
344	using Self = HashTable;
345
346	size_t m_size = `0`; /// Amount of elements
347	Cell * buf; /// A piece of memory for all elements except the element with zero key.
348	Grower grower;
349
350	#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
351	mutable size_t collisions = `0`;
352	#endif
353
354	/// Find a cell with the same key or an empty cell, starting from the specified position and further along the collision resolution chain.
355	size_t ALWAYS_INLINE findCell(const Key & x, size_t hash_value, size_t place_value) const
356	{
357	while (!buf[place_value].isZero(*this) && !buf[place_value].keyEquals(x, hash_value, *this))
358	{
359	place_value = grower.next(place_value);
360	#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
361	++collisions;
362	#endif
363	}
364
365	return place_value;
366	}
367
368
369	/// Find an empty cell, starting with the specified position and further along the collision resolution chain.
370	size_t ALWAYS_INLINE findEmptyCell(size_t place_value) const
371	{
372	while (!buf[place_value].isZero(*this))
373	{
374	place_value = grower.next(place_value);
375	#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
376	++collisions;
377	#endif
378	}
379
380	return place_value;
381	}
382
383	void alloc(const Grower & new_grower)
384	{
385	buf = reinterpret_cast<Cell >(Allocator::alloc(new_grower.bufSize() sizeof(Cell)));
386	grower = new_grower;
387	}
388
389	void free()
390	{
391	if (buf)
392	{
393	Allocator::free(buf, getBufferSizeInBytes());
394	buf = nullptr;
395	}
396	}
397
398
399	/// Increase the size of the buffer.
400	void resize(size_t for_num_elems = `0`, size_t for_buf_size = `0`)
401	{
402	#ifdef DBMS_HASH_MAP_DEBUG_RESIZES
403	Stopwatch watch;
404	#endif
405
406	size_t old_size = grower.bufSize();
407
408	/* In case of exception for the object to remain in the correct state,*
409	* changing the variable `grower` (which determines the buffer size of the hash table)
410	* is postponed for a moment after a real buffer change.
411	* The temporary variable `new_grower` is used to determine the new size.
412	*/
413	Grower new_grower = grower;
414
415	if (for_num_elems)
416	{
417	new_grower.set(for_num_elems);
418	if (new_grower.bufSize() <= old_size)
419	return;
420	}
421	else if (for_buf_size)
422	{
423	new_grower.setBufSize(for_buf_size);
424	if (new_grower.bufSize() <= old_size)
425	return;
426	}
427	else
428	new_grower.increaseSize();
429
430	/// Expand the space.
431	buf = reinterpret_cast<Cell >(Allocator::realloc(buf, getBufferSizeInBytes(), new_grower.bufSize() sizeof(Cell)));
432	grower = new_grower;
433
434	/* Now some items may need to be moved to a new location.*
435	* The element can stay in place, or move to a new location "on the right",
436	* or move to the left of the collision resolution chain, because the elements to the left of it have been moved to the new "right" location.
437	*/
438	size_t i = `0`;
439	for (; i < old_size; ++i)
440	if (!buf[i].isZero(*this) && !buf[i].isDeleted())
441	reinsert(buf[i], buf[i].getHash(*this));
442
443	/* There is also a special case:*
444	* if the element was to be at the end of the old buffer, [ x]
445	* but is at the beginning because of the collision resolution chain, [o x]
446	* then after resizing, it will first be out of place again, [ xo ]
447	* and in order to transfer it where necessary,
448	* after transferring all the elements from the old halves you need to [ o x ]
449	* process tail from the collision resolution chain immediately after it [ o x ]
450	*/
451	for (; !buf[i].isZero(*this) && !buf[i].isDeleted(); ++i)
452	reinsert(buf[i], buf[i].getHash(*this));
453
454	#ifdef DBMS_HASH_MAP_DEBUG_RESIZES
455	watch.stop();
456	std::cerr << std::fixed << std::setprecision(`3`)
457	<< "Resize from " << old_size << " to " << grower.bufSize() << " took " << watch.elapsedSeconds() << " sec."
458	<< std::endl;
459	#endif
460	}
461
462
463	/* Paste into the new buffer the value that was in the old buffer.*
464	* Used when increasing the buffer size.
465	*/
466	void reinsert(Cell & x, size_t hash_value)
467	{
468	size_t place_value = grower.place(hash_value);
469
470	/// If the element is in its place.
471	if (&x == &buf[place_value])
472	return;
473
474	/// Compute a new location, taking into account the collision resolution chain.
475	place_value = findCell(Cell::getKey(x.getValue()), hash_value, place_value);
476
477	/// If the item remains in its place in the old collision resolution chain.
478	if (!buf[place_value].isZero(*this))
479	return;
480
481	/// Copy to a new location and zero the old one.
482	x.setHash(hash_value);
483	memcpy(static_cast<void>(&buf[place_value]), &x, sizeof*(x));
484	x.setZero();
485
486	/// Then the elements that previously were in collision with this can move to the old place.
487	}
488
489
490	void destroyElements()
491	{
492	if (!std::is_trivially_destructible_v<Cell>)
493	for (iterator it = begin(), it_end = end(); it != it_end; ++it)
494	it.ptr->~Cell();
495	}
496
497
498	template <typename Derived, bool is_const>
499	class iterator_base
500	{
501	using Container = std::conditional_t<is_const, const Self, Self>;
502	using cell_type = std::conditional_t<is_const, const Cell, Cell>;
503
504	Container * container;
505	cell_type * ptr;
506
507	friend class HashTable;
508
509	public:
510	iterator_base() {}
511	iterator_base(Container * container_, cell_type * ptr_) : container(container_), ptr(ptr_) {}
512
513	bool operator== (const iterator_base & rhs) const { return ptr == rhs.ptr; }
514	bool operator!= (const iterator_base & rhs) const { return ptr != rhs.ptr; }
515
516	Derived & operator++()
517	{
518	/// If iterator was pointed to ZeroValueStorage, move it to the beginning of the main buffer.
519	if (unlikely(ptr->isZero(*container)))
520	ptr = container->buf;
521	else
522	++ptr;
523
524	/// Skip empty cells in the main buffer.
525	auto buf_end = container->buf + container->grower.bufSize();
526	while (ptr < buf_end && ptr->isZero(*container))
527	++ptr;
528
529	return static_cast<Derived &>(*this);
530	}
531
532	auto & operator* () const { return *ptr; }
533	auto * operator->() const { return ptr; }
534
535	auto getPtr() const { return ptr; }
536	size_t getHash() const { return ptr->getHash(*container); }
537
538	size_t getCollisionChainLength() const
539	{
540	return container->grower.place((ptr - container->buf) - container->grower.place(getHash()));
541	}
542
543	/**
544	* A hack for HashedDictionary.
545	*
546	* The problem: std-like find() returns an iterator, which has to be
547	* compared to end(). On the other hand, HashMap::find() returns
548	* LookupResult, which is compared to nullptr. HashedDictionary has to
549	* support both hash maps with the same code, hence the need for this
550	* hack.
551	*
552	* The proper way would be to remove iterator interface from our
553	* HashMap completely, change all its users to the existing internal
554	* iteration interface, and redefine end() to return LookupResult for
555	* compatibility with std find(). Unfortunately, now is not the time to
556	* do this.
557	*/
558	operator Cell * () const { return nullptr; }
559	};
560
561
562	public:
563	using key_type = Key;
564	using mapped_type = typename Cell::mapped_type;
565	using value_type = typename Cell::value_type;
566	using cell_type = Cell;
567
568	using LookupResult = Cell *;
569	using ConstLookupResult = const Cell *;
570
571	size_t hash(const Key & x) const { return Hash::operator()(x); }
572
573
574	HashTable()
575	{
576	if (Cell::need_zero_value_storage)
577	this->zeroValue()->setZero();
578	alloc(grower);
579	}
580
581	HashTable(size_t reserve_for_num_elements)
582	{
583	if (Cell::need_zero_value_storage)
584	this->zeroValue()->setZero();
585	grower.set(reserve_for_num_elements);
586	alloc(grower);
587	}
588
589	HashTable(HashTable && rhs)
590	: buf(nullptr)
591	{
592	*this = std::move(rhs);
593	}
594
595	~HashTable()
596	{
597	destroyElements();
598	free();
599	}
600
601	HashTable & operator= (HashTable && rhs)
602	{
603	destroyElements();
604	free();
605
606	std::swap(buf, rhs.buf);
607	std::swap(m_size, rhs.m_size);
608	std::swap(grower, rhs.grower);
609
610	Hash::operator=(std::move(rhs));
611	Allocator::operator=(std::move(rhs));
612	Cell::State::operator=(std::move(rhs));
613	ZeroValueStorage<Cell::need_zero_value_storage, Cell>::operator=(std::move(rhs));
614
615	return *this;
616	}
617
618	class Reader final : private Cell::State
619	{
620	public:
621	Reader(DB::ReadBuffer & in_)
622	: in(in_)
623	{
624	}
625
626	Reader(const Reader &) = delete;
627	Reader & operator=(const Reader &) = delete;
628
629	bool next()
630	{
631	if (!is_initialized)
632	{
633	Cell::State::read(in);
634	DB::readVarUInt(size, in);
635	is_initialized = true;
636	}
637
638	if (read_count == size)
639	{
640	is_eof = true;
641	return false;
642	}
643
644	cell.read(in);
645	++read_count;
646
647	return true;
648	}
649
650	inline const value_type & get() const
651	{
652	if (!is_initialized \|\| is_eof)
653	throw DB::Exception ("No available data", DB::ErrorCodes::NO_AVAILABLE_DATA);
654
655	return cell.getValue();
656	}
657
658	private:
659	DB::ReadBuffer & in;
660	Cell cell;
661	size_t read_count = `0`;
662	size_t size = `0`;
663	bool is_eof = false;
664	bool is_initialized = false;
665	};
666
667
668	class iterator : public iterator_base<iterator, false>
669	{
670	public:
671	using iterator_base<iterator, false>::iterator_base;
672	};
673
674	class const_iterator : public iterator_base<const_iterator, true>
675	{
676	public:
677	using iterator_base<const_iterator, true>::iterator_base;
678	};
679
680
681	const_iterator begin() const
682	{
683	if (!buf)
684	return end();
685
686	if (this->hasZero())
687	return iteratorToZero();
688
689	const Cell * ptr = buf;
690	auto buf_end = buf + grower.bufSize();
691	while (ptr < buf_end && ptr->isZero(*this))
692	++ptr;
693
694	return const_iterator(this, ptr);
695	}
696
697	const_iterator cbegin() const { return begin(); }
698
699	iterator begin()
700	{
701	if (!buf)
702	return end();
703
704	if (this->hasZero())
705	return iteratorToZero();
706
707	Cell * ptr = buf;
708	auto buf_end = buf + grower.bufSize();
709	while (ptr < buf_end && ptr->isZero(*this))
710	++ptr;
711
712	return iterator(this, ptr);
713	}
714
715	const_iterator end() const { return const_iterator(this, buf + grower.bufSize()); }
716	const_iterator cend() const { return end(); }
717	iterator end() { return iterator(this, buf + grower.bufSize()); }
718
719
720	protected:
721	const_iterator iteratorTo(const Cell * ptr) const { return const_iterator(this, ptr); }
722	iterator iteratorTo(Cell * ptr) { return iterator(this, ptr); }
723	const_iterator iteratorToZero() const { return iteratorTo(this->zeroValue()); }
724	iterator iteratorToZero() { return iteratorTo(this->zeroValue()); }
725
726
727	/// If the key is zero, insert it into a special place and return true.
728	/// We don't have to persist a zero key, because it's not actually inserted.
729	/// That's why we just take a Key by value, an not a key holder.
730	bool ALWAYS_INLINE emplaceIfZero(const Key & x, LookupResult & it, bool & inserted, size_t hash_value)
731	{
732	/// If it is claimed that the zero key can not be inserted into the table.
733	if (!Cell::need_zero_value_storage)
734	return false;
735
736	if (Cell::isZero(x, *this))
737	{
738	it = this->zeroValue();
739
740	if (!this->hasZero())
741	{
742	++m_size;
743	this->setHasZero();
744	this->zeroValue()->setHash(hash_value);
745	inserted = true;
746	}
747	else
748	inserted = false;
749
750	return true;
751	}
752
753	return false;
754	}
755
756	template <typename KeyHolder>
757	void ALWAYS_INLINE emplaceNonZeroImpl(size_t place_value, KeyHolder && key_holder,
758	LookupResult & it, bool & inserted, size_t hash_value)
759	{
760	it = &buf[place_value];
761
762	if (!buf[place_value].isZero(*this))
763	{
764	keyHolderDiscardKey(key_holder);
765	inserted = false;
766	return;
767	}
768
769	keyHolderPersistKey(key_holder);
770	const auto & key = keyHolderGetKey(key_holder);
771
772	new (&buf[place_value]) Cell(key, *this);
773	buf[place_value].setHash(hash_value);
774	inserted = true;
775	++m_size;
776
777	if (unlikely(grower.overflow(m_size)))
778	{
779	try
780	{
781	resize();
782	}
783	catch (...)
784	{
785	/* If we have not resized successfully, then there will be problems.*
786	* There remains a key, but uninitialized mapped-value,
787	* which, perhaps, can not even be called a destructor.
788	*/
789	--m_size;
790	buf[place_value].setZero();
791	throw;
792	}
793
794	// The hash table was rehashed, so we have to re-find the key.
795	size_t new_place = findCell(key, hash_value, grower.place(hash_value));
796	assert(!buf[new_place].isZero(*this));
797	it = &buf[new_place];
798	}
799	}
800
801	/// Only for non-zero keys. Find the right place, insert the key there, if it does not already exist. Set iterator to the cell in output parameter.
802	template <typename KeyHolder>
803	void ALWAYS_INLINE emplaceNonZero(KeyHolder && key_holder, LookupResult & it,
804	bool & inserted, size_t hash_value)
805	{
806	const auto & key = keyHolderGetKey(key_holder);
807	size_t place_value = findCell(key, hash_value, grower.place(hash_value));
808	emplaceNonZeroImpl(place_value, key_holder, it, inserted, hash_value);
809	}
810
811
812	public:
813	/// Insert a value. In the case of any more complex values, it is better to use the `emplace` function.
814	std::pair<LookupResult, bool> ALWAYS_INLINE insert(const value_type & x)
815	{
816	std::pair<LookupResult, bool> res;
817
818	size_t hash_value = hash(Cell::getKey(x));
819	if (!emplaceIfZero(Cell::getKey(x), res.first, res.second, hash_value))
820	{
821	emplaceNonZero(Cell::getKey(x), res.first, res.second, hash_value);
822	}
823
824	if (res.second)
825	insertSetMapped(res.first->getMapped(), x);
826
827	return res;
828	}
829
830
831	/// Reinsert node pointed to by iterator
832	void ALWAYS_INLINE reinsert(iterator & it, size_t hash_value)
833	{
834	reinsert(*it.getPtr(), hash_value);
835	}
836
837
838	/* Insert the key.*
839	* Return values:
840	* 'it' -- a LookupResult pointing to the corresponding key/mapped pair.
841	* 'inserted' -- whether a new key was inserted.
842	*
843	* You have to make `placement new` of value if you inserted a new key,
844	* since when destroying a hash table, it will call the destructor!
845	*
846	* Example usage:
847	*
848	* Map::LookupResult it;
849	* bool inserted;
850	* map.emplace(key, it, inserted);
851	* if (inserted)
852	* new (&it->getMapped()) Mapped(value);
853	*/
854	template <typename KeyHolder>
855	void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it, bool & inserted)
856	{
857	const auto & key = keyHolderGetKey(key_holder);
858	emplace(key_holder, it, inserted, hash(key));
859	}
860
861	template <typename KeyHolder>
862	void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it,
863	bool & inserted, size_t hash_value)
864	{
865	const auto & key = keyHolderGetKey(key_holder);
866	if (!emplaceIfZero(key, it, inserted, hash_value))
867	emplaceNonZero(key_holder, it, inserted, hash_value);
868	}
869
870	/// Copy the cell from another hash table. It is assumed that the cell is not zero, and also that there was no such key in the table yet.
871	void ALWAYS_INLINE insertUniqueNonZero(const Cell * cell, size_t hash_value)
872	{
873	size_t place_value = findEmptyCell(grower.place(hash_value));
874
875	memcpy(static_cast<void>(&buf[place_value]), cell, sizeof(cell));
876	++m_size;
877
878	if (unlikely(grower.overflow(m_size)))
879	resize();
880	}
881
882	LookupResult ALWAYS_INLINE find(const Key & x)
883	{
884	if (Cell::isZero(x, *this))
885	return this->hasZero() ? this->zeroValue() : nullptr;
886
887	size_t hash_value = hash(x);
888	size_t place_value = findCell(x, hash_value, grower.place(hash_value));
889	return !buf[place_value].isZero(*this) ? &buf[place_value] : nullptr;
890	}
891
892	ConstLookupResult ALWAYS_INLINE find(const Key & x) const
893	{
894	return const_cast<std::decay_t<decltype(*this)> >(this*)->find(x);
895	}
896
897	LookupResult ALWAYS_INLINE find(const Key & x, size_t hash_value)
898	{
899	if (Cell::isZero(x, *this))
900	return this->hasZero() ? this->zeroValue() : nullptr;
901
902	size_t place_value = findCell(x, hash_value, grower.place(hash_value));
903	return !buf[place_value].isZero(*this) ? &buf[place_value] : nullptr;
904	}
905
906	ConstLookupResult ALWAYS_INLINE find(const Key & x, size_t hash_value) const
907	{
908	return const_cast<std::decay_t<decltype(*this)> >(this*)->find(x, hash_value);
909	}
910
911	bool ALWAYS_INLINE has(const Key & x) const
912	{
913	if (Cell::isZero(x, *this))
914	return this->hasZero();
915
916	size_t hash_value = hash(x);
917	size_t place_value = findCell(x, hash_value, grower.place(hash_value));
918	return !buf[place_value].isZero(*this);
919	}
920
921
922	bool ALWAYS_INLINE has(const Key & x, size_t hash_value) const
923	{
924	if (Cell::isZero(x, *this))
925	return this->hasZero();
926
927	size_t place_value = findCell(x, hash_value, grower.place(hash_value));
928	return !buf[place_value].isZero(*this);
929	}
930
931
932	void write(DB::WriteBuffer & wb) const
933	{
934	Cell::State::write(wb);
935	DB::writeVarUInt(m_size, wb);
936
937	if (this->hasZero())
938	this->zeroValue()->write(wb);
939
940	for (auto ptr = buf, buf_end = buf + grower.bufSize(); ptr < buf_end; ++ptr)
941	if (!ptr->isZero(*this))
942	ptr->write(wb);
943	}
944
945	void writeText(DB::WriteBuffer & wb) const
946	{
947	Cell::State::writeText(wb);
948	DB::writeText(m_size, wb);
949
950	if (this->hasZero())
951	{
952	DB::writeChar(`','`, wb);
953	this->zeroValue()->writeText(wb);
954	}
955
956	for (auto ptr = buf, buf_end = buf + grower.bufSize(); ptr < buf_end; ++ptr)
957	{
958	if (!ptr->isZero(*this))
959	{
960	DB::writeChar(`','`, wb);
961	ptr->writeText(wb);
962	}
963	}
964	}
965
966	void read(DB::ReadBuffer & rb)
967	{
968	Cell::State::read(rb);
969
970	destroyElements();
971	this->clearHasZero();
972	m_size = `0`;
973
974	size_t new_size = `0`;
975	DB::readVarUInt(new_size, rb);
976
977	free();
978	Grower new_grower = grower;
979	new_grower.set(new_size);
980	alloc(new_grower);
981
982	for (size_t i = `0`; i < new_size; ++i)
983	{
984	Cell x;
985	x.read(rb);
986	insert(Cell::getKey(x.getValue()));
987	}
988	}
989
990	void readText(DB::ReadBuffer & rb)
991	{
992	Cell::State::readText(rb);
993
994	destroyElements();
995	this->clearHasZero();
996	m_size = `0`;
997
998	size_t new_size = `0`;
999	DB::readText(new_size, rb);
1000
1001	free();
1002	Grower new_grower = grower;
1003	new_grower.set(new_size);
1004	alloc(new_grower);
1005
1006	for (size_t i = `0`; i < new_size; ++i)
1007	{
1008	Cell x;
1009	DB::assertChar(`','`, rb);
1010	x.readText(rb);
1011	insert(Cell::getKey(x.getValue()));
1012	}
1013	}
1014
1015
1016	size_t size() const
1017	{
1018	return m_size;
1019	}
1020
1021	bool empty() const
1022	{
1023	return `0` == m_size;
1024	}
1025
1026	void clear()
1027	{
1028	destroyElements();
1029	this->clearHasZero();
1030	m_size = `0`;
1031
1032	memset(static_cast<void>(buf), `0`, grower.bufSize() sizeof(*buf));
1033	}
1034
1035	/// After executing this function, the table can only be destroyed,
1036	/// and also you can use the methods `size`, `empty`, `begin`, `end`.
1037	void clearAndShrink()
1038	{
1039	destroyElements();
1040	this->clearHasZero();
1041	m_size = `0`;
1042	free();
1043	}
1044
1045	size_t getBufferSizeInBytes() const
1046	{
1047	return grower.bufSize() * sizeof(Cell);
1048	}
1049
1050	size_t getBufferSizeInCells() const
1051	{
1052	return grower.bufSize();
1053	}
1054
1055	#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
1056	size_t getCollisions() const
1057	{
1058	return collisions;
1059	}
1060	#endif
1061	};
1062

Browse the source code of ClickHouse/dbms/src/Common/HashTable/HashTable.h