TwoLevelHashTable.h source code [ClickHouse/dbms/src/Common/HashTable/TwoLevelHashTable.h]

1	#pragma once
2
3	#include <Common/HashTable/HashTable.h>
4
5
6	/* Two-level hash table.*
7	* Represents 256 (or 1ULL << BITS_FOR_BUCKET) small hash tables (buckets of the first level).
8	* To determine which one to use, one of the bytes of the hash function is taken.
9	*
10	* Usually works a little slower than a simple hash table.
11	* However, it has advantages in some cases:
12	* - if you need to merge two hash tables together, then you can easily parallelize it by buckets;
13	* - delay during resizes is amortized, since the small hash tables will be resized separately;
14	* - in theory, resizes are cache-local in a larger range of sizes.
15	*/
16
17	template <size_t initial_size_degree = `8`>
18	struct TwoLevelHashTableGrower : public HashTableGrower<initial_size_degree>
19	{
20	/// Increase the size of the hash table.
21	void increaseSize()
22	{
23	this->size_degree += this->size_degree >= `15` ? `1` : `2`;
24	}
25	};
26
27	template
28	<
29	typename Key,
30	typename Cell,
31	typename Hash,
32	typename Grower,
33	typename Allocator, /// TODO WithStackMemory
34	typename ImplTable = HashTable<Key, Cell, Hash, Grower, Allocator>,
35	size_t BITS_FOR_BUCKET = `8`
36	>
37	class TwoLevelHashTable :
38	private boost::noncopyable,
39	protected Hash /// empty base optimization
40	{
41	protected:
42	friend class const_iterator;
43	friend class iterator;
44
45	using HashValue = size_t;
46	using Self = TwoLevelHashTable;
47	public:
48	using Impl = ImplTable;
49
50	static constexpr size_t NUM_BUCKETS = `1ULL` << BITS_FOR_BUCKET;
51	static constexpr size_t MAX_BUCKET = NUM_BUCKETS - `1`;
52
53	size_t hash(const Key & x) const { return Hash::operator()(x); }
54
55	/// NOTE Bad for hash tables with more than 2^32 cells.
56	static size_t getBucketFromHash(size_t hash_value) { return (hash_value >> (`32` - BITS_FOR_BUCKET)) & MAX_BUCKET; }
57
58	protected:
59	typename Impl::iterator beginOfNextNonEmptyBucket(size_t & bucket)
60	{
61	while (bucket != NUM_BUCKETS && impls[bucket].empty())
62	++bucket;
63
64	if (bucket != NUM_BUCKETS)
65	return impls[bucket].begin();
66
67	--bucket;
68	return impls[MAX_BUCKET].end();
69	}
70
71	typename Impl::const_iterator beginOfNextNonEmptyBucket(size_t & bucket) const
72	{
73	while (bucket != NUM_BUCKETS && impls[bucket].empty())
74	++bucket;
75
76	if (bucket != NUM_BUCKETS)
77	return impls[bucket].begin();
78
79	--bucket;
80	return impls[MAX_BUCKET].end();
81	}
82
83	public:
84	using key_type = typename Impl::key_type;
85	using mapped_type = typename Impl::mapped_type;
86	using value_type = typename Impl::value_type;
87	using cell_type = typename Impl::cell_type;
88
89	using LookupResult = typename Impl::LookupResult;
90	using ConstLookupResult = typename Impl::ConstLookupResult;
91
92	Impl impls[NUM_BUCKETS];
93
94
95	TwoLevelHashTable() {}
96
97	/// Copy the data from another (normal) hash table. It should have the same hash function.
98	template <typename Source>
99	TwoLevelHashTable(const Source & src)
100	{
101	typename Source::const_iterator it = src.begin();
102
103	/// It is assumed that the zero key (stored separately) is first in iteration order.
104	if (it != src.end() && it.getPtr()->isZero(src))
105	{
106	insert(it->getValue());
107	++it;
108	}
109
110	for (; it != src.end(); ++it)
111	{
112	const Cell * cell = it.getPtr();
113	size_t hash_value = cell->getHash(src);
114	size_t buck = getBucketFromHash(hash_value);
115	impls[buck].insertUniqueNonZero(cell, hash_value);
116	}
117	}
118
119
120	class iterator
121	{
122	Self * container;
123	size_t bucket;
124	typename Impl::iterator current_it;
125
126	friend class TwoLevelHashTable;
127
128	iterator(Self * container_, size_t bucket_, typename Impl::iterator current_it_)
129	: container(container_), bucket(bucket_), current_it(current_it_) {}
130
131	public:
132	iterator() {}
133
134	bool operator== (const iterator & rhs) const { return bucket == rhs.bucket && current_it == rhs.current_it; }
135	bool operator!= (const iterator & rhs) const { return !(*this == rhs); }
136
137	iterator & operator++()
138	{
139	++current_it;
140	if (current_it == container->impls[bucket].end())
141	{
142	++bucket;
143	current_it = container->beginOfNextNonEmptyBucket(bucket);
144	}
145
146	return *this;
147	}
148
149	Cell & operator* () const { return *current_it; }
150	Cell * operator->() const { return current_it.getPtr(); }
151
152	Cell * getPtr() const { return current_it.getPtr(); }
153	size_t getHash() const { return current_it.getHash(); }
154	};
155
156
157	class const_iterator
158	{
159	Self * container;
160	size_t bucket;
161	typename Impl::const_iterator current_it;
162
163	friend class TwoLevelHashTable;
164
165	const_iterator(Self * container_, size_t bucket_, typename Impl::const_iterator current_it_)
166	: container(container_), bucket(bucket_), current_it(current_it_) {}
167
168	public:
169	const_iterator() {}
170	const_iterator(const iterator & rhs) : container(rhs.container), bucket(rhs.bucket), current_it(rhs.current_it) {}
171
172	bool operator== (const const_iterator & rhs) const { return bucket == rhs.bucket && current_it == rhs.current_it; }
173	bool operator!= (const const_iterator & rhs) const { return !(*this == rhs); }
174
175	const_iterator & operator++()
176	{
177	++current_it;
178	if (current_it == container->impls[bucket].end())
179	{
180	++bucket;
181	current_it = container->beginOfNextNonEmptyBucket(bucket);
182	}
183
184	return *this;
185	}
186
187	const Cell & operator* () const { return *current_it; }
188	const Cell * operator->() const { return current_it->getPtr(); }
189
190	const Cell * getPtr() const { return current_it.getPtr(); }
191	size_t getHash() const { return current_it.getHash(); }
192	};
193
194
195	const_iterator begin() const
196	{
197	size_t buck = `0`;
198	typename Impl::const_iterator impl_it = beginOfNextNonEmptyBucket(buck);
199	return { this, buck, impl_it };
200	}
201
202	iterator begin()
203	{
204	size_t buck = `0`;
205	typename Impl::iterator impl_it = beginOfNextNonEmptyBucket(buck);
206	return { this, buck, impl_it };
207	}
208
209	const_iterator end() const { return { this, MAX_BUCKET, impls[MAX_BUCKET].end() }; }
210	iterator end() { return { this, MAX_BUCKET, impls[MAX_BUCKET].end() }; }
211
212
213	/// Insert a value. In the case of any more complex values, it is better to use the `emplace` function.
214	std::pair<LookupResult, bool> ALWAYS_INLINE insert(const value_type & x)
215	{
216	size_t hash_value = hash(Cell::getKey(x));
217
218	std::pair<LookupResult, bool> res;
219	emplace(Cell::getKey(x), res.first, res.second, hash_value);
220
221	if (res.second)
222	insertSetMapped(res.first->getMapped(), x);
223
224	return res;
225	}
226
227
228	/* Insert the key,*
229	* return an iterator to a position that can be used for `placement new` of value,
230	* as well as the flag - whether a new key was inserted.
231	*
232	* You have to make `placement new` values if you inserted a new key,
233	* since when destroying a hash table, the destructor will be invoked for it!
234	*
235	* Example usage:
236	*
237	* Map::iterator it;
238	* bool inserted;
239	* map.emplace(key, it, inserted);
240	* if (inserted)
241	* new(&it->second) Mapped(value);
242	*/
243	template <typename KeyHolder>
244	void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it, bool & inserted)
245	{
246	size_t hash_value = hash(keyHolderGetKey(key_holder));
247	emplace(key_holder, it, inserted, hash_value);
248	}
249
250
251	/// Same, but with a precalculated values of hash function.
252	template <typename KeyHolder>
253	void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it,
254	bool & inserted, size_t hash_value)
255	{
256	size_t buck = getBucketFromHash(hash_value);
257	impls[buck].emplace(key_holder, it, inserted, hash_value);
258	}
259
260	LookupResult ALWAYS_INLINE find(Key x, size_t hash_value)
261	{
262	size_t buck = getBucketFromHash(hash_value);
263	return impls[buck].find(x, hash_value);
264	}
265
266	ConstLookupResult ALWAYS_INLINE find(Key x, size_t hash_value) const
267	{
268	return const_cast<std::decay_t<decltype(*this)> >(this*)->find(x, hash_value);
269	}
270
271	LookupResult ALWAYS_INLINE find(Key x) { return find(x, hash(x)); }
272
273	ConstLookupResult ALWAYS_INLINE find(Key x) const { return find(x, hash(x)); }
274
275
276	void write(DB::WriteBuffer & wb) const
277	{
278	for (size_t i = `0`; i < NUM_BUCKETS; ++i)
279	impls[i].write(wb);
280	}
281
282	void writeText(DB::WriteBuffer & wb) const
283	{
284	for (size_t i = `0`; i < NUM_BUCKETS; ++i)
285	{
286	if (i != `0`)
287	DB::writeChar(`','`, wb);
288	impls[i].writeText(wb);
289	}
290	}
291
292	void read(DB::ReadBuffer & rb)
293	{
294	for (size_t i = `0`; i < NUM_BUCKETS; ++i)
295	impls[i].read(rb);
296	}
297
298	void readText(DB::ReadBuffer & rb)
299	{
300	for (size_t i = `0`; i < NUM_BUCKETS; ++i)
301	{
302	if (i != `0`)
303	DB::assertChar(`','`, rb);
304	impls[i].readText(rb);
305	}
306	}
307
308
309	size_t size() const
310	{
311	size_t res = `0`;
312	for (size_t i = `0`; i < NUM_BUCKETS; ++i)
313	res += impls[i].size();
314
315	return res;
316	}
317
318	bool empty() const
319	{
320	for (size_t i = `0`; i < NUM_BUCKETS; ++i)
321	if (!impls[i].empty())
322	return false;
323
324	return true;
325	}
326
327	size_t getBufferSizeInBytes() const
328	{
329	size_t res = `0`;
330	for (size_t i = `0`; i < NUM_BUCKETS; ++i)
331	res += impls[i].getBufferSizeInBytes();
332
333	return res;
334	}
335	};
336

Browse the source code of ClickHouse/dbms/src/Common/HashTable/TwoLevelHashTable.h