1#pragma once
2
3#include <Common/HashTable/HashTable.h>
4
5template <typename Key, typename TState = HashTableNoState>
6struct FixedHashTableCell
7{
8 using State = TState;
9
10 using value_type = Key;
11 using mapped_type = VoidMapped;
12 bool full;
13
14 FixedHashTableCell() {}
15 FixedHashTableCell(const Key &, const State &) : full(true) {}
16
17 const VoidKey getKey() const { return {}; }
18 VoidMapped getMapped() const { return {}; }
19
20 bool isZero(const State &) const { return !full; }
21 void setZero() { full = false; }
22 static constexpr bool need_zero_value_storage = false;
23
24 /// This Cell is only stored inside an iterator. It's used to accomodate the fact
25 /// that the iterator based API always provide a reference to a continuous memory
26 /// containing the Key. As a result, we have to instantiate a real Key field.
27 /// All methods that return a mutable reference to the Key field are named with
28 /// -Mutable suffix, indicating this is uncommon usage. As this is only for lookup
29 /// tables, it's totally fine to discard the Key mutations.
30 struct CellExt
31 {
32 Key key;
33
34 const VoidKey getKey() const { return {}; }
35 VoidMapped getMapped() const { return {}; }
36 const value_type & getValue() const { return key; }
37 void update(Key && key_, FixedHashTableCell *) { key = key_; }
38 };
39};
40
41
42/** Used as a lookup table for small keys such as UInt8, UInt16. It's different
43 * than a HashTable in that keys are not stored in the Cell buf, but inferred
44 * inside each iterator. There are a bunch of to make it faster than using
45 * HashTable: a) It doesn't have a conflict chain; b) There is no key
46 * comparision; c) The number of cycles for checking cell empty is halved; d)
47 * Memory layout is tighter, especially the Clearable variants.
48 *
49 * NOTE: For Set variants this should always be better. For Map variants
50 * however, as we need to assemble the real cell inside each iterator, there
51 * might be some cases we fall short.
52 *
53 * TODO: Deprecate the cell API so that end users don't rely on the structure
54 * of cell. Instead iterator should be used for operations such as cell
55 * transfer, key updates (f.g. StringRef) and serde. This will allow
56 * TwoLevelHashSet(Map) to contain different type of sets(maps).
57 */
58template <typename Key, typename Cell, typename Allocator>
59class FixedHashTable : private boost::noncopyable, protected Allocator, protected Cell::State
60{
61 static constexpr size_t NUM_CELLS = 1ULL << (sizeof(Key) * 8);
62
63protected:
64 friend class const_iterator;
65 friend class iterator;
66 friend class Reader;
67
68 using Self = FixedHashTable;
69
70 size_t m_size = 0; /// Amount of elements
71 Cell * buf; /// A piece of memory for all elements.
72
73 void alloc() { buf = reinterpret_cast<Cell *>(Allocator::alloc(NUM_CELLS * sizeof(Cell))); }
74
75 void free()
76 {
77 if (buf)
78 {
79 Allocator::free(buf, getBufferSizeInBytes());
80 buf = nullptr;
81 }
82 }
83
84 void destroyElements()
85 {
86 if (!std::is_trivially_destructible_v<Cell>)
87 for (iterator it = begin(), it_end = end(); it != it_end; ++it)
88 it.ptr->~Cell();
89 }
90
91
92 template <typename Derived, bool is_const>
93 class iterator_base
94 {
95 using Container = std::conditional_t<is_const, const Self, Self>;
96 using cell_type = std::conditional_t<is_const, const Cell, Cell>;
97
98 Container * container;
99 cell_type * ptr;
100
101 friend class FixedHashTable;
102
103 public:
104 iterator_base() {}
105 iterator_base(Container * container_, cell_type * ptr_) : container(container_), ptr(ptr_)
106 {
107 cell.update(ptr - container->buf, ptr);
108 }
109
110 bool operator==(const iterator_base & rhs) const { return ptr == rhs.ptr; }
111 bool operator!=(const iterator_base & rhs) const { return ptr != rhs.ptr; }
112
113 Derived & operator++()
114 {
115 ++ptr;
116
117 /// Skip empty cells in the main buffer.
118 auto buf_end = container->buf + container->NUM_CELLS;
119 while (ptr < buf_end && ptr->isZero(*container))
120 ++ptr;
121
122 return static_cast<Derived &>(*this);
123 }
124
125 auto & operator*()
126 {
127 if (cell.key != ptr - container->buf)
128 cell.update(ptr - container->buf, ptr);
129 return cell;
130 }
131 auto * operator-> ()
132 {
133 if (cell.key != ptr - container->buf)
134 cell.update(ptr - container->buf, ptr);
135 return &cell;
136 }
137
138 auto getPtr() const { return ptr; }
139 size_t getHash() const { return ptr - container->buf; }
140 size_t getCollisionChainLength() const { return 0; }
141 typename cell_type::CellExt cell;
142 };
143
144
145public:
146 using key_type = Key;
147 using mapped_type = typename Cell::mapped_type;
148 using value_type = typename Cell::value_type;
149 using cell_type = Cell;
150
151 using LookupResult = Cell *;
152 using ConstLookupResult = const Cell *;
153
154
155 size_t hash(const Key & x) const { return x; }
156
157 FixedHashTable() { alloc(); }
158
159 FixedHashTable(FixedHashTable && rhs) : buf(nullptr) { *this = std::move(rhs); }
160
161 ~FixedHashTable()
162 {
163 destroyElements();
164 free();
165 }
166
167 FixedHashTable & operator=(FixedHashTable && rhs)
168 {
169 destroyElements();
170 free();
171
172 std::swap(buf, rhs.buf);
173 std::swap(m_size, rhs.m_size);
174
175 Allocator::operator=(std::move(rhs));
176 Cell::State::operator=(std::move(rhs));
177
178 return *this;
179 }
180
181 class Reader final : private Cell::State
182 {
183 public:
184 Reader(DB::ReadBuffer & in_) : in(in_) {}
185
186 Reader(const Reader &) = delete;
187 Reader & operator=(const Reader &) = delete;
188
189 bool next()
190 {
191 if (!is_initialized)
192 {
193 Cell::State::read(in);
194 DB::readVarUInt(size, in);
195 is_initialized = true;
196 }
197
198 if (read_count == size)
199 {
200 is_eof = true;
201 return false;
202 }
203
204 cell.read(in);
205 ++read_count;
206
207 return true;
208 }
209
210 inline const value_type & get() const
211 {
212 if (!is_initialized || is_eof)
213 throw DB::Exception("No available data", DB::ErrorCodes::NO_AVAILABLE_DATA);
214
215 return cell.getValue();
216 }
217
218 private:
219 DB::ReadBuffer & in;
220 Cell cell;
221 size_t read_count = 0;
222 size_t size;
223 bool is_eof = false;
224 bool is_initialized = false;
225 };
226
227
228 class iterator : public iterator_base<iterator, false>
229 {
230 public:
231 using iterator_base<iterator, false>::iterator_base;
232 };
233
234 class const_iterator : public iterator_base<const_iterator, true>
235 {
236 public:
237 using iterator_base<const_iterator, true>::iterator_base;
238 };
239
240
241 const_iterator begin() const
242 {
243 if (!buf)
244 return end();
245
246 const Cell * ptr = buf;
247 auto buf_end = buf + NUM_CELLS;
248 while (ptr < buf_end && ptr->isZero(*this))
249 ++ptr;
250
251 return const_iterator(this, ptr);
252 }
253
254 const_iterator cbegin() const { return begin(); }
255
256 iterator begin()
257 {
258 if (!buf)
259 return end();
260
261 Cell * ptr = buf;
262 auto buf_end = buf + NUM_CELLS;
263 while (ptr < buf_end && ptr->isZero(*this))
264 ++ptr;
265
266 return iterator(this, ptr);
267 }
268
269 const_iterator end() const { return const_iterator(this, buf + NUM_CELLS); }
270 const_iterator cend() const { return end(); }
271 iterator end() { return iterator(this, buf + NUM_CELLS); }
272
273
274public:
275 /// The last parameter is unused but exists for compatibility with HashTable interface.
276 void ALWAYS_INLINE emplace(const Key & x, LookupResult & it, bool & inserted, size_t /* hash */ = 0)
277 {
278 it = &buf[x];
279
280 if (!buf[x].isZero(*this))
281 {
282 inserted = false;
283 return;
284 }
285
286 new (&buf[x]) Cell(x, *this);
287 inserted = true;
288 ++m_size;
289 }
290
291 std::pair<LookupResult, bool> ALWAYS_INLINE insert(const value_type & x)
292 {
293 std::pair<LookupResult, bool> res;
294 emplace(Cell::getKey(x), res.first, res.second);
295 if (res.second)
296 insertSetMapped(res.first->getMapped(), x);
297
298 return res;
299 }
300
301 LookupResult ALWAYS_INLINE find(const Key & x) { return !buf[x].isZero(*this) ? &buf[x] : nullptr; }
302
303 ConstLookupResult ALWAYS_INLINE find(const Key & x) const { return const_cast<std::decay_t<decltype(*this)> *>(this)->find(x); }
304
305 LookupResult ALWAYS_INLINE find(const Key &, size_t hash_value) { return !buf[hash_value].isZero(*this) ? &buf[hash_value] : nullptr; }
306
307 ConstLookupResult ALWAYS_INLINE find(const Key & key, size_t hash_value) const
308 {
309 return const_cast<std::decay_t<decltype(*this)> *>(this)->find(key, hash_value);
310 }
311
312 bool ALWAYS_INLINE has(const Key & x) const { return !buf[x].isZero(*this); }
313 bool ALWAYS_INLINE has(const Key &, size_t hash_value) const { return !buf[hash_value].isZero(*this); }
314
315 void write(DB::WriteBuffer & wb) const
316 {
317 Cell::State::write(wb);
318 DB::writeVarUInt(m_size, wb);
319
320 for (auto ptr = buf, buf_end = buf + NUM_CELLS; ptr < buf_end; ++ptr)
321 if (!ptr->isZero(*this))
322 {
323 DB::writeVarUInt(ptr - buf);
324 ptr->write(wb);
325 }
326 }
327
328 void writeText(DB::WriteBuffer & wb) const
329 {
330 Cell::State::writeText(wb);
331 DB::writeText(m_size, wb);
332
333 for (auto ptr = buf, buf_end = buf + NUM_CELLS; ptr < buf_end; ++ptr)
334 {
335 if (!ptr->isZero(*this))
336 {
337 DB::writeChar(',', wb);
338 DB::writeText(ptr - buf, wb);
339 DB::writeChar(',', wb);
340 ptr->writeText(wb);
341 }
342 }
343 }
344
345 void read(DB::ReadBuffer & rb)
346 {
347 Cell::State::read(rb);
348 destroyElements();
349 DB::readVarUInt(m_size, rb);
350 free();
351 alloc();
352
353 for (size_t i = 0; i < m_size; ++i)
354 {
355 size_t place_value = 0;
356 DB::readVarUInt(place_value, rb);
357 Cell x;
358 x.read(rb);
359 new (&buf[place_value]) Cell(x, *this);
360 }
361 }
362
363 void readText(DB::ReadBuffer & rb)
364 {
365 Cell::State::readText(rb);
366 destroyElements();
367 DB::readText(m_size, rb);
368 free();
369 alloc();
370
371 for (size_t i = 0; i < m_size; ++i)
372 {
373 size_t place_value = 0;
374 DB::assertChar(',', rb);
375 DB::readText(place_value, rb);
376 Cell x;
377 DB::assertChar(',', rb);
378 x.readText(rb);
379 new (&buf[place_value]) Cell(x, *this);
380 }
381 }
382
383 size_t size() const { return m_size; }
384
385 bool empty() const { return 0 == m_size; }
386
387 void clear()
388 {
389 destroyElements();
390 m_size = 0;
391
392 memset(static_cast<void *>(buf), 0, NUM_CELLS * sizeof(*buf));
393 }
394
395 /// After executing this function, the table can only be destroyed,
396 /// and also you can use the methods `size`, `empty`, `begin`, `end`.
397 void clearAndShrink()
398 {
399 destroyElements();
400 m_size = 0;
401 free();
402 }
403
404 size_t getBufferSizeInBytes() const { return NUM_CELLS * sizeof(Cell); }
405
406 size_t getBufferSizeInCells() const { return NUM_CELLS; }
407
408#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
409 size_t getCollisions() const { return 0; }
410#endif
411};
412