1#pragma once
2
3#include <Common/HashTable/HashTable.h>
4
5
6/** Two-level hash table.
7 * Represents 256 (or 1ULL << BITS_FOR_BUCKET) small hash tables (buckets of the first level).
8 * To determine which one to use, one of the bytes of the hash function is taken.
9 *
10 * Usually works a little slower than a simple hash table.
11 * However, it has advantages in some cases:
12 * - if you need to merge two hash tables together, then you can easily parallelize it by buckets;
13 * - delay during resizes is amortized, since the small hash tables will be resized separately;
14 * - in theory, resizes are cache-local in a larger range of sizes.
15 */
16
17template <size_t initial_size_degree = 8>
18struct TwoLevelHashTableGrower : public HashTableGrower<initial_size_degree>
19{
20 /// Increase the size of the hash table.
21 void increaseSize()
22 {
23 this->size_degree += this->size_degree >= 15 ? 1 : 2;
24 }
25};
26
27template
28<
29 typename Key,
30 typename Cell,
31 typename Hash,
32 typename Grower,
33 typename Allocator, /// TODO WithStackMemory
34 typename ImplTable = HashTable<Key, Cell, Hash, Grower, Allocator>,
35 size_t BITS_FOR_BUCKET = 8
36>
37class TwoLevelHashTable :
38 private boost::noncopyable,
39 protected Hash /// empty base optimization
40{
41protected:
42 friend class const_iterator;
43 friend class iterator;
44
45 using HashValue = size_t;
46 using Self = TwoLevelHashTable;
47public:
48 using Impl = ImplTable;
49
50 static constexpr size_t NUM_BUCKETS = 1ULL << BITS_FOR_BUCKET;
51 static constexpr size_t MAX_BUCKET = NUM_BUCKETS - 1;
52
53 size_t hash(const Key & x) const { return Hash::operator()(x); }
54
55 /// NOTE Bad for hash tables with more than 2^32 cells.
56 static size_t getBucketFromHash(size_t hash_value) { return (hash_value >> (32 - BITS_FOR_BUCKET)) & MAX_BUCKET; }
57
58protected:
59 typename Impl::iterator beginOfNextNonEmptyBucket(size_t & bucket)
60 {
61 while (bucket != NUM_BUCKETS && impls[bucket].empty())
62 ++bucket;
63
64 if (bucket != NUM_BUCKETS)
65 return impls[bucket].begin();
66
67 --bucket;
68 return impls[MAX_BUCKET].end();
69 }
70
71 typename Impl::const_iterator beginOfNextNonEmptyBucket(size_t & bucket) const
72 {
73 while (bucket != NUM_BUCKETS && impls[bucket].empty())
74 ++bucket;
75
76 if (bucket != NUM_BUCKETS)
77 return impls[bucket].begin();
78
79 --bucket;
80 return impls[MAX_BUCKET].end();
81 }
82
83public:
84 using key_type = typename Impl::key_type;
85 using mapped_type = typename Impl::mapped_type;
86 using value_type = typename Impl::value_type;
87 using cell_type = typename Impl::cell_type;
88
89 using LookupResult = typename Impl::LookupResult;
90 using ConstLookupResult = typename Impl::ConstLookupResult;
91
92 Impl impls[NUM_BUCKETS];
93
94
95 TwoLevelHashTable() {}
96
97 /// Copy the data from another (normal) hash table. It should have the same hash function.
98 template <typename Source>
99 TwoLevelHashTable(const Source & src)
100 {
101 typename Source::const_iterator it = src.begin();
102
103 /// It is assumed that the zero key (stored separately) is first in iteration order.
104 if (it != src.end() && it.getPtr()->isZero(src))
105 {
106 insert(it->getValue());
107 ++it;
108 }
109
110 for (; it != src.end(); ++it)
111 {
112 const Cell * cell = it.getPtr();
113 size_t hash_value = cell->getHash(src);
114 size_t buck = getBucketFromHash(hash_value);
115 impls[buck].insertUniqueNonZero(cell, hash_value);
116 }
117 }
118
119
120 class iterator
121 {
122 Self * container;
123 size_t bucket;
124 typename Impl::iterator current_it;
125
126 friend class TwoLevelHashTable;
127
128 iterator(Self * container_, size_t bucket_, typename Impl::iterator current_it_)
129 : container(container_), bucket(bucket_), current_it(current_it_) {}
130
131 public:
132 iterator() {}
133
134 bool operator== (const iterator & rhs) const { return bucket == rhs.bucket && current_it == rhs.current_it; }
135 bool operator!= (const iterator & rhs) const { return !(*this == rhs); }
136
137 iterator & operator++()
138 {
139 ++current_it;
140 if (current_it == container->impls[bucket].end())
141 {
142 ++bucket;
143 current_it = container->beginOfNextNonEmptyBucket(bucket);
144 }
145
146 return *this;
147 }
148
149 Cell & operator* () const { return *current_it; }
150 Cell * operator->() const { return current_it.getPtr(); }
151
152 Cell * getPtr() const { return current_it.getPtr(); }
153 size_t getHash() const { return current_it.getHash(); }
154 };
155
156
157 class const_iterator
158 {
159 Self * container;
160 size_t bucket;
161 typename Impl::const_iterator current_it;
162
163 friend class TwoLevelHashTable;
164
165 const_iterator(Self * container_, size_t bucket_, typename Impl::const_iterator current_it_)
166 : container(container_), bucket(bucket_), current_it(current_it_) {}
167
168 public:
169 const_iterator() {}
170 const_iterator(const iterator & rhs) : container(rhs.container), bucket(rhs.bucket), current_it(rhs.current_it) {}
171
172 bool operator== (const const_iterator & rhs) const { return bucket == rhs.bucket && current_it == rhs.current_it; }
173 bool operator!= (const const_iterator & rhs) const { return !(*this == rhs); }
174
175 const_iterator & operator++()
176 {
177 ++current_it;
178 if (current_it == container->impls[bucket].end())
179 {
180 ++bucket;
181 current_it = container->beginOfNextNonEmptyBucket(bucket);
182 }
183
184 return *this;
185 }
186
187 const Cell & operator* () const { return *current_it; }
188 const Cell * operator->() const { return current_it->getPtr(); }
189
190 const Cell * getPtr() const { return current_it.getPtr(); }
191 size_t getHash() const { return current_it.getHash(); }
192 };
193
194
195 const_iterator begin() const
196 {
197 size_t buck = 0;
198 typename Impl::const_iterator impl_it = beginOfNextNonEmptyBucket(buck);
199 return { this, buck, impl_it };
200 }
201
202 iterator begin()
203 {
204 size_t buck = 0;
205 typename Impl::iterator impl_it = beginOfNextNonEmptyBucket(buck);
206 return { this, buck, impl_it };
207 }
208
209 const_iterator end() const { return { this, MAX_BUCKET, impls[MAX_BUCKET].end() }; }
210 iterator end() { return { this, MAX_BUCKET, impls[MAX_BUCKET].end() }; }
211
212
213 /// Insert a value. In the case of any more complex values, it is better to use the `emplace` function.
214 std::pair<LookupResult, bool> ALWAYS_INLINE insert(const value_type & x)
215 {
216 size_t hash_value = hash(Cell::getKey(x));
217
218 std::pair<LookupResult, bool> res;
219 emplace(Cell::getKey(x), res.first, res.second, hash_value);
220
221 if (res.second)
222 insertSetMapped(res.first->getMapped(), x);
223
224 return res;
225 }
226
227
228 /** Insert the key,
229 * return an iterator to a position that can be used for `placement new` of value,
230 * as well as the flag - whether a new key was inserted.
231 *
232 * You have to make `placement new` values if you inserted a new key,
233 * since when destroying a hash table, the destructor will be invoked for it!
234 *
235 * Example usage:
236 *
237 * Map::iterator it;
238 * bool inserted;
239 * map.emplace(key, it, inserted);
240 * if (inserted)
241 * new(&it->second) Mapped(value);
242 */
243 template <typename KeyHolder>
244 void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it, bool & inserted)
245 {
246 size_t hash_value = hash(keyHolderGetKey(key_holder));
247 emplace(key_holder, it, inserted, hash_value);
248 }
249
250
251 /// Same, but with a precalculated values of hash function.
252 template <typename KeyHolder>
253 void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it,
254 bool & inserted, size_t hash_value)
255 {
256 size_t buck = getBucketFromHash(hash_value);
257 impls[buck].emplace(key_holder, it, inserted, hash_value);
258 }
259
260 LookupResult ALWAYS_INLINE find(Key x, size_t hash_value)
261 {
262 size_t buck = getBucketFromHash(hash_value);
263 return impls[buck].find(x, hash_value);
264 }
265
266 ConstLookupResult ALWAYS_INLINE find(Key x, size_t hash_value) const
267 {
268 return const_cast<std::decay_t<decltype(*this)> *>(this)->find(x, hash_value);
269 }
270
271 LookupResult ALWAYS_INLINE find(Key x) { return find(x, hash(x)); }
272
273 ConstLookupResult ALWAYS_INLINE find(Key x) const { return find(x, hash(x)); }
274
275
276 void write(DB::WriteBuffer & wb) const
277 {
278 for (size_t i = 0; i < NUM_BUCKETS; ++i)
279 impls[i].write(wb);
280 }
281
282 void writeText(DB::WriteBuffer & wb) const
283 {
284 for (size_t i = 0; i < NUM_BUCKETS; ++i)
285 {
286 if (i != 0)
287 DB::writeChar(',', wb);
288 impls[i].writeText(wb);
289 }
290 }
291
292 void read(DB::ReadBuffer & rb)
293 {
294 for (size_t i = 0; i < NUM_BUCKETS; ++i)
295 impls[i].read(rb);
296 }
297
298 void readText(DB::ReadBuffer & rb)
299 {
300 for (size_t i = 0; i < NUM_BUCKETS; ++i)
301 {
302 if (i != 0)
303 DB::assertChar(',', rb);
304 impls[i].readText(rb);
305 }
306 }
307
308
309 size_t size() const
310 {
311 size_t res = 0;
312 for (size_t i = 0; i < NUM_BUCKETS; ++i)
313 res += impls[i].size();
314
315 return res;
316 }
317
318 bool empty() const
319 {
320 for (size_t i = 0; i < NUM_BUCKETS; ++i)
321 if (!impls[i].empty())
322 return false;
323
324 return true;
325 }
326
327 size_t getBufferSizeInBytes() const
328 {
329 size_t res = 0;
330 for (size_t i = 0; i < NUM_BUCKETS; ++i)
331 res += impls[i].getBufferSizeInBytes();
332
333 return res;
334 }
335};
336