1 | #pragma once |
2 | |
3 | #include <Common/HashTable/HashTable.h> |
4 | |
5 | template <typename Key, typename TState = HashTableNoState> |
6 | struct FixedHashTableCell |
7 | { |
8 | using State = TState; |
9 | |
10 | using value_type = Key; |
11 | using mapped_type = VoidMapped; |
12 | bool full; |
13 | |
14 | FixedHashTableCell() {} |
15 | FixedHashTableCell(const Key &, const State &) : full(true) {} |
16 | |
17 | const VoidKey getKey() const { return {}; } |
18 | VoidMapped getMapped() const { return {}; } |
19 | |
20 | bool isZero(const State &) const { return !full; } |
21 | void setZero() { full = false; } |
22 | static constexpr bool need_zero_value_storage = false; |
23 | |
24 | /// This Cell is only stored inside an iterator. It's used to accomodate the fact |
25 | /// that the iterator based API always provide a reference to a continuous memory |
26 | /// containing the Key. As a result, we have to instantiate a real Key field. |
27 | /// All methods that return a mutable reference to the Key field are named with |
28 | /// -Mutable suffix, indicating this is uncommon usage. As this is only for lookup |
29 | /// tables, it's totally fine to discard the Key mutations. |
30 | struct CellExt |
31 | { |
32 | Key key; |
33 | |
34 | const VoidKey getKey() const { return {}; } |
35 | VoidMapped getMapped() const { return {}; } |
36 | const value_type & getValue() const { return key; } |
37 | void update(Key && key_, FixedHashTableCell *) { key = key_; } |
38 | }; |
39 | }; |
40 | |
41 | |
42 | /** Used as a lookup table for small keys such as UInt8, UInt16. It's different |
43 | * than a HashTable in that keys are not stored in the Cell buf, but inferred |
44 | * inside each iterator. There are a bunch of to make it faster than using |
45 | * HashTable: a) It doesn't have a conflict chain; b) There is no key |
46 | * comparision; c) The number of cycles for checking cell empty is halved; d) |
47 | * Memory layout is tighter, especially the Clearable variants. |
48 | * |
49 | * NOTE: For Set variants this should always be better. For Map variants |
50 | * however, as we need to assemble the real cell inside each iterator, there |
51 | * might be some cases we fall short. |
52 | * |
53 | * TODO: Deprecate the cell API so that end users don't rely on the structure |
54 | * of cell. Instead iterator should be used for operations such as cell |
55 | * transfer, key updates (f.g. StringRef) and serde. This will allow |
56 | * TwoLevelHashSet(Map) to contain different type of sets(maps). |
57 | */ |
58 | template <typename Key, typename Cell, typename Allocator> |
59 | class FixedHashTable : private boost::noncopyable, protected Allocator, protected Cell::State |
60 | { |
61 | static constexpr size_t NUM_CELLS = 1ULL << (sizeof(Key) * 8); |
62 | |
63 | protected: |
64 | friend class const_iterator; |
65 | friend class iterator; |
66 | friend class Reader; |
67 | |
68 | using Self = FixedHashTable; |
69 | |
70 | size_t m_size = 0; /// Amount of elements |
71 | Cell * buf; /// A piece of memory for all elements. |
72 | |
73 | void alloc() { buf = reinterpret_cast<Cell *>(Allocator::alloc(NUM_CELLS * sizeof(Cell))); } |
74 | |
75 | void free() |
76 | { |
77 | if (buf) |
78 | { |
79 | Allocator::free(buf, getBufferSizeInBytes()); |
80 | buf = nullptr; |
81 | } |
82 | } |
83 | |
84 | void destroyElements() |
85 | { |
86 | if (!std::is_trivially_destructible_v<Cell>) |
87 | for (iterator it = begin(), it_end = end(); it != it_end; ++it) |
88 | it.ptr->~Cell(); |
89 | } |
90 | |
91 | |
92 | template <typename Derived, bool is_const> |
93 | class iterator_base |
94 | { |
95 | using Container = std::conditional_t<is_const, const Self, Self>; |
96 | using cell_type = std::conditional_t<is_const, const Cell, Cell>; |
97 | |
98 | Container * container; |
99 | cell_type * ptr; |
100 | |
101 | friend class FixedHashTable; |
102 | |
103 | public: |
104 | iterator_base() {} |
105 | iterator_base(Container * container_, cell_type * ptr_) : container(container_), ptr(ptr_) |
106 | { |
107 | cell.update(ptr - container->buf, ptr); |
108 | } |
109 | |
110 | bool operator==(const iterator_base & rhs) const { return ptr == rhs.ptr; } |
111 | bool operator!=(const iterator_base & rhs) const { return ptr != rhs.ptr; } |
112 | |
113 | Derived & operator++() |
114 | { |
115 | ++ptr; |
116 | |
117 | /// Skip empty cells in the main buffer. |
118 | auto buf_end = container->buf + container->NUM_CELLS; |
119 | while (ptr < buf_end && ptr->isZero(*container)) |
120 | ++ptr; |
121 | |
122 | return static_cast<Derived &>(*this); |
123 | } |
124 | |
125 | auto & operator*() |
126 | { |
127 | if (cell.key != ptr - container->buf) |
128 | cell.update(ptr - container->buf, ptr); |
129 | return cell; |
130 | } |
131 | auto * operator-> () |
132 | { |
133 | if (cell.key != ptr - container->buf) |
134 | cell.update(ptr - container->buf, ptr); |
135 | return &cell; |
136 | } |
137 | |
138 | auto getPtr() const { return ptr; } |
139 | size_t getHash() const { return ptr - container->buf; } |
140 | size_t getCollisionChainLength() const { return 0; } |
141 | typename cell_type::CellExt cell; |
142 | }; |
143 | |
144 | |
145 | public: |
146 | using key_type = Key; |
147 | using mapped_type = typename Cell::mapped_type; |
148 | using value_type = typename Cell::value_type; |
149 | using cell_type = Cell; |
150 | |
151 | using LookupResult = Cell *; |
152 | using ConstLookupResult = const Cell *; |
153 | |
154 | |
155 | size_t hash(const Key & x) const { return x; } |
156 | |
157 | FixedHashTable() { alloc(); } |
158 | |
159 | FixedHashTable(FixedHashTable && rhs) : buf(nullptr) { *this = std::move(rhs); } |
160 | |
161 | ~FixedHashTable() |
162 | { |
163 | destroyElements(); |
164 | free(); |
165 | } |
166 | |
167 | FixedHashTable & operator=(FixedHashTable && rhs) |
168 | { |
169 | destroyElements(); |
170 | free(); |
171 | |
172 | std::swap(buf, rhs.buf); |
173 | std::swap(m_size, rhs.m_size); |
174 | |
175 | Allocator::operator=(std::move(rhs)); |
176 | Cell::State::operator=(std::move(rhs)); |
177 | |
178 | return *this; |
179 | } |
180 | |
181 | class Reader final : private Cell::State |
182 | { |
183 | public: |
184 | Reader(DB::ReadBuffer & in_) : in(in_) {} |
185 | |
186 | Reader(const Reader &) = delete; |
187 | Reader & operator=(const Reader &) = delete; |
188 | |
189 | bool next() |
190 | { |
191 | if (!is_initialized) |
192 | { |
193 | Cell::State::read(in); |
194 | DB::readVarUInt(size, in); |
195 | is_initialized = true; |
196 | } |
197 | |
198 | if (read_count == size) |
199 | { |
200 | is_eof = true; |
201 | return false; |
202 | } |
203 | |
204 | cell.read(in); |
205 | ++read_count; |
206 | |
207 | return true; |
208 | } |
209 | |
210 | inline const value_type & get() const |
211 | { |
212 | if (!is_initialized || is_eof) |
213 | throw DB::Exception("No available data" , DB::ErrorCodes::NO_AVAILABLE_DATA); |
214 | |
215 | return cell.getValue(); |
216 | } |
217 | |
218 | private: |
219 | DB::ReadBuffer & in; |
220 | Cell cell; |
221 | size_t read_count = 0; |
222 | size_t size; |
223 | bool is_eof = false; |
224 | bool is_initialized = false; |
225 | }; |
226 | |
227 | |
228 | class iterator : public iterator_base<iterator, false> |
229 | { |
230 | public: |
231 | using iterator_base<iterator, false>::iterator_base; |
232 | }; |
233 | |
234 | class const_iterator : public iterator_base<const_iterator, true> |
235 | { |
236 | public: |
237 | using iterator_base<const_iterator, true>::iterator_base; |
238 | }; |
239 | |
240 | |
241 | const_iterator begin() const |
242 | { |
243 | if (!buf) |
244 | return end(); |
245 | |
246 | const Cell * ptr = buf; |
247 | auto buf_end = buf + NUM_CELLS; |
248 | while (ptr < buf_end && ptr->isZero(*this)) |
249 | ++ptr; |
250 | |
251 | return const_iterator(this, ptr); |
252 | } |
253 | |
254 | const_iterator cbegin() const { return begin(); } |
255 | |
256 | iterator begin() |
257 | { |
258 | if (!buf) |
259 | return end(); |
260 | |
261 | Cell * ptr = buf; |
262 | auto buf_end = buf + NUM_CELLS; |
263 | while (ptr < buf_end && ptr->isZero(*this)) |
264 | ++ptr; |
265 | |
266 | return iterator(this, ptr); |
267 | } |
268 | |
269 | const_iterator end() const { return const_iterator(this, buf + NUM_CELLS); } |
270 | const_iterator cend() const { return end(); } |
271 | iterator end() { return iterator(this, buf + NUM_CELLS); } |
272 | |
273 | |
274 | public: |
275 | /// The last parameter is unused but exists for compatibility with HashTable interface. |
276 | void ALWAYS_INLINE emplace(const Key & x, LookupResult & it, bool & inserted, size_t /* hash */ = 0) |
277 | { |
278 | it = &buf[x]; |
279 | |
280 | if (!buf[x].isZero(*this)) |
281 | { |
282 | inserted = false; |
283 | return; |
284 | } |
285 | |
286 | new (&buf[x]) Cell(x, *this); |
287 | inserted = true; |
288 | ++m_size; |
289 | } |
290 | |
291 | std::pair<LookupResult, bool> ALWAYS_INLINE insert(const value_type & x) |
292 | { |
293 | std::pair<LookupResult, bool> res; |
294 | emplace(Cell::getKey(x), res.first, res.second); |
295 | if (res.second) |
296 | insertSetMapped(res.first->getMapped(), x); |
297 | |
298 | return res; |
299 | } |
300 | |
301 | LookupResult ALWAYS_INLINE find(const Key & x) { return !buf[x].isZero(*this) ? &buf[x] : nullptr; } |
302 | |
303 | ConstLookupResult ALWAYS_INLINE find(const Key & x) const { return const_cast<std::decay_t<decltype(*this)> *>(this)->find(x); } |
304 | |
305 | LookupResult ALWAYS_INLINE find(const Key &, size_t hash_value) { return !buf[hash_value].isZero(*this) ? &buf[hash_value] : nullptr; } |
306 | |
307 | ConstLookupResult ALWAYS_INLINE find(const Key & key, size_t hash_value) const |
308 | { |
309 | return const_cast<std::decay_t<decltype(*this)> *>(this)->find(key, hash_value); |
310 | } |
311 | |
312 | bool ALWAYS_INLINE has(const Key & x) const { return !buf[x].isZero(*this); } |
313 | bool ALWAYS_INLINE has(const Key &, size_t hash_value) const { return !buf[hash_value].isZero(*this); } |
314 | |
315 | void write(DB::WriteBuffer & wb) const |
316 | { |
317 | Cell::State::write(wb); |
318 | DB::writeVarUInt(m_size, wb); |
319 | |
320 | for (auto ptr = buf, buf_end = buf + NUM_CELLS; ptr < buf_end; ++ptr) |
321 | if (!ptr->isZero(*this)) |
322 | { |
323 | DB::writeVarUInt(ptr - buf); |
324 | ptr->write(wb); |
325 | } |
326 | } |
327 | |
328 | void writeText(DB::WriteBuffer & wb) const |
329 | { |
330 | Cell::State::writeText(wb); |
331 | DB::writeText(m_size, wb); |
332 | |
333 | for (auto ptr = buf, buf_end = buf + NUM_CELLS; ptr < buf_end; ++ptr) |
334 | { |
335 | if (!ptr->isZero(*this)) |
336 | { |
337 | DB::writeChar(',', wb); |
338 | DB::writeText(ptr - buf, wb); |
339 | DB::writeChar(',', wb); |
340 | ptr->writeText(wb); |
341 | } |
342 | } |
343 | } |
344 | |
345 | void read(DB::ReadBuffer & rb) |
346 | { |
347 | Cell::State::read(rb); |
348 | destroyElements(); |
349 | DB::readVarUInt(m_size, rb); |
350 | free(); |
351 | alloc(); |
352 | |
353 | for (size_t i = 0; i < m_size; ++i) |
354 | { |
355 | size_t place_value = 0; |
356 | DB::readVarUInt(place_value, rb); |
357 | Cell x; |
358 | x.read(rb); |
359 | new (&buf[place_value]) Cell(x, *this); |
360 | } |
361 | } |
362 | |
363 | void readText(DB::ReadBuffer & rb) |
364 | { |
365 | Cell::State::readText(rb); |
366 | destroyElements(); |
367 | DB::readText(m_size, rb); |
368 | free(); |
369 | alloc(); |
370 | |
371 | for (size_t i = 0; i < m_size; ++i) |
372 | { |
373 | size_t place_value = 0; |
374 | DB::assertChar(',', rb); |
375 | DB::readText(place_value, rb); |
376 | Cell x; |
377 | DB::assertChar(',', rb); |
378 | x.readText(rb); |
379 | new (&buf[place_value]) Cell(x, *this); |
380 | } |
381 | } |
382 | |
383 | size_t size() const { return m_size; } |
384 | |
385 | bool empty() const { return 0 == m_size; } |
386 | |
387 | void clear() |
388 | { |
389 | destroyElements(); |
390 | m_size = 0; |
391 | |
392 | memset(static_cast<void *>(buf), 0, NUM_CELLS * sizeof(*buf)); |
393 | } |
394 | |
395 | /// After executing this function, the table can only be destroyed, |
396 | /// and also you can use the methods `size`, `empty`, `begin`, `end`. |
397 | void clearAndShrink() |
398 | { |
399 | destroyElements(); |
400 | m_size = 0; |
401 | free(); |
402 | } |
403 | |
404 | size_t getBufferSizeInBytes() const { return NUM_CELLS * sizeof(Cell); } |
405 | |
406 | size_t getBufferSizeInCells() const { return NUM_CELLS; } |
407 | |
408 | #ifdef DBMS_HASH_MAP_COUNT_COLLISIONS |
409 | size_t getCollisions() const { return 0; } |
410 | #endif |
411 | }; |
412 | |