1 | #pragma once |
2 | |
3 | #include <Common/HashTable/HashMap.h> |
4 | #include <Common/HashTable/HashTable.h> |
5 | |
6 | #include <variant> |
7 | |
8 | using StringKey8 = UInt64; |
9 | using StringKey16 = DB::UInt128; |
10 | struct StringKey24 |
11 | { |
12 | UInt64 a; |
13 | UInt64 b; |
14 | UInt64 c; |
15 | |
16 | bool operator==(const StringKey24 rhs) const { return a == rhs.a && b == rhs.b && c == rhs.c; } |
17 | }; |
18 | |
19 | inline StringRef ALWAYS_INLINE toStringRef(const StringKey8 & n) |
20 | { |
21 | return {reinterpret_cast<const char *>(&n), 8ul - (__builtin_clzll(n) >> 3)}; |
22 | } |
23 | inline StringRef ALWAYS_INLINE toStringRef(const StringKey16 & n) |
24 | { |
25 | return {reinterpret_cast<const char *>(&n), 16ul - (__builtin_clzll(n.high) >> 3)}; |
26 | } |
27 | inline StringRef ALWAYS_INLINE toStringRef(const StringKey24 & n) |
28 | { |
29 | return {reinterpret_cast<const char *>(&n), 24ul - (__builtin_clzll(n.c) >> 3)}; |
30 | } |
31 | |
32 | struct StringHashTableHash |
33 | { |
34 | #if defined(__SSE4_2__) |
35 | size_t ALWAYS_INLINE operator()(StringKey8 key) const |
36 | { |
37 | size_t res = -1ULL; |
38 | res = _mm_crc32_u64(res, key); |
39 | return res; |
40 | } |
41 | size_t ALWAYS_INLINE operator()(StringKey16 key) const |
42 | { |
43 | size_t res = -1ULL; |
44 | res = _mm_crc32_u64(res, key.low); |
45 | res = _mm_crc32_u64(res, key.high); |
46 | return res; |
47 | } |
48 | size_t ALWAYS_INLINE operator()(StringKey24 key) const |
49 | { |
50 | size_t res = -1ULL; |
51 | res = _mm_crc32_u64(res, key.a); |
52 | res = _mm_crc32_u64(res, key.b); |
53 | res = _mm_crc32_u64(res, key.c); |
54 | return res; |
55 | } |
56 | #else |
57 | size_t ALWAYS_INLINE operator()(StringKey8 key) const |
58 | { |
59 | return CityHash_v1_0_2::CityHash64(reinterpret_cast<const char *>(&key), 8); |
60 | } |
61 | size_t ALWAYS_INLINE operator()(StringKey16 key) const |
62 | { |
63 | return CityHash_v1_0_2::CityHash64(reinterpret_cast<const char *>(&key), 16); |
64 | } |
65 | size_t ALWAYS_INLINE operator()(StringKey24 key) const |
66 | { |
67 | return CityHash_v1_0_2::CityHash64(reinterpret_cast<const char *>(&key), 24); |
68 | } |
69 | #endif |
70 | size_t ALWAYS_INLINE operator()(StringRef key) const |
71 | { |
72 | return StringRefHash()(key); |
73 | } |
74 | }; |
75 | |
76 | template <typename Cell> |
77 | struct StringHashTableEmpty |
78 | { |
79 | using Self = StringHashTableEmpty; |
80 | |
81 | bool has_zero = false; |
82 | std::aligned_storage_t<sizeof(Cell), alignof(Cell)> zero_value_storage; /// Storage of element with zero key. |
83 | |
84 | public: |
85 | bool hasZero() const { return has_zero; } |
86 | |
87 | void setHasZero() |
88 | { |
89 | has_zero = true; |
90 | new (zeroValue()) Cell(); |
91 | } |
92 | |
93 | void setHasZero(const Cell & other) |
94 | { |
95 | has_zero = true; |
96 | new (zeroValue()) Cell(other); |
97 | } |
98 | |
99 | void clearHasZero() |
100 | { |
101 | has_zero = false; |
102 | if (!std::is_trivially_destructible_v<Cell>) |
103 | zeroValue()->~Cell(); |
104 | } |
105 | |
106 | Cell * zeroValue() { return reinterpret_cast<Cell *>(&zero_value_storage); } |
107 | const Cell * zeroValue() const { return reinterpret_cast<const Cell *>(&zero_value_storage); } |
108 | |
109 | using LookupResult = Cell *; |
110 | using ConstLookupResult = const Cell *; |
111 | |
112 | template <typename KeyHolder> |
113 | void ALWAYS_INLINE emplace(KeyHolder &&, LookupResult & it, bool & inserted, size_t = 0) |
114 | { |
115 | if (!hasZero()) |
116 | { |
117 | setHasZero(); |
118 | inserted = true; |
119 | } |
120 | else |
121 | inserted = false; |
122 | it = zeroValue(); |
123 | } |
124 | |
125 | template <typename Key> |
126 | LookupResult ALWAYS_INLINE find(const Key &, size_t = 0) |
127 | { |
128 | return hasZero() ? zeroValue() : nullptr; |
129 | } |
130 | |
131 | template <typename Key> |
132 | ConstLookupResult ALWAYS_INLINE find(const Key &, size_t = 0) const |
133 | { |
134 | return hasZero() ? zeroValue() : nullptr; |
135 | } |
136 | |
137 | void write(DB::WriteBuffer & wb) const { zeroValue()->write(wb); } |
138 | void writeText(DB::WriteBuffer & wb) const { zeroValue()->writeText(wb); } |
139 | void read(DB::ReadBuffer & rb) { zeroValue()->read(rb); } |
140 | void readText(DB::ReadBuffer & rb) { zeroValue()->readText(rb); } |
141 | size_t size() const { return hasZero() ? 1 : 0; } |
142 | bool empty() const { return !hasZero(); } |
143 | size_t getBufferSizeInBytes() const { return sizeof(Cell); } |
144 | size_t getCollisions() const { return 0; } |
145 | }; |
146 | |
147 | template <size_t initial_size_degree = 8> |
148 | struct StringHashTableGrower : public HashTableGrower<initial_size_degree> |
149 | { |
150 | // Smooth growing for string maps |
151 | void increaseSize() { this->size_degree += 1; } |
152 | }; |
153 | |
154 | template <typename Mapped> |
155 | struct StringHashTableLookupResult |
156 | { |
157 | Mapped * mapped_ptr; |
158 | StringHashTableLookupResult() {} |
159 | StringHashTableLookupResult(Mapped * mapped_ptr_) : mapped_ptr(mapped_ptr_) {} |
160 | StringHashTableLookupResult(std::nullptr_t) {} |
161 | const VoidKey getKey() const { return {}; } |
162 | auto & getMapped() { return *mapped_ptr; } |
163 | auto & operator*() { return *this; } |
164 | auto & operator*() const { return *this; } |
165 | auto * operator->() { return this; } |
166 | auto * operator->() const { return this; } |
167 | operator bool() const { return mapped_ptr; } |
168 | friend bool operator==(const StringHashTableLookupResult & a, const std::nullptr_t &) { return !a.mapped_ptr; } |
169 | friend bool operator==(const std::nullptr_t &, const StringHashTableLookupResult & b) { return !b.mapped_ptr; } |
170 | friend bool operator!=(const StringHashTableLookupResult & a, const std::nullptr_t &) { return a.mapped_ptr; } |
171 | friend bool operator!=(const std::nullptr_t &, const StringHashTableLookupResult & b) { return b.mapped_ptr; } |
172 | }; |
173 | |
174 | template <typename SubMaps> |
175 | class StringHashTable : private boost::noncopyable |
176 | { |
177 | protected: |
178 | static constexpr size_t NUM_MAPS = 5; |
179 | // Map for storing empty string |
180 | using T0 = typename SubMaps::T0; |
181 | |
182 | // Short strings are stored as numbers |
183 | using T1 = typename SubMaps::T1; |
184 | using T2 = typename SubMaps::T2; |
185 | using T3 = typename SubMaps::T3; |
186 | |
187 | // Long strings are stored as StringRef along with saved hash |
188 | using Ts = typename SubMaps::Ts; |
189 | using Self = StringHashTable; |
190 | |
191 | template <typename, typename, size_t> |
192 | friend class TwoLevelStringHashTable; |
193 | |
194 | T0 m0; |
195 | T1 m1; |
196 | T2 m2; |
197 | T3 m3; |
198 | Ts ms; |
199 | |
200 | public: |
201 | using Key = StringRef; |
202 | using key_type = Key; |
203 | using mapped_type = typename Ts::mapped_type; |
204 | using value_type = typename Ts::value_type; |
205 | using cell_type = typename Ts::cell_type; |
206 | |
207 | using LookupResult = StringHashTableLookupResult<typename cell_type::mapped_type>; |
208 | using ConstLookupResult = StringHashTableLookupResult<const typename cell_type::mapped_type>; |
209 | |
210 | StringHashTable() {} |
211 | |
212 | StringHashTable(size_t reserve_for_num_elements) |
213 | : m1{reserve_for_num_elements / 4} |
214 | , m2{reserve_for_num_elements / 4} |
215 | , m3{reserve_for_num_elements / 4} |
216 | , ms{reserve_for_num_elements / 4} |
217 | { |
218 | } |
219 | |
220 | StringHashTable(StringHashTable && rhs) { *this = std::move(rhs); } |
221 | ~StringHashTable() {} |
222 | |
223 | public: |
224 | // Dispatch is written in a way that maximizes the performance: |
225 | // 1. Always memcpy 8 times bytes |
226 | // 2. Use switch case extension to generate fast dispatching table |
227 | // 3. Funcs are named callables that can be force_inlined |
228 | // NOTE: It relies on Little Endianness |
229 | template <typename Self, typename KeyHolder, typename Func> |
230 | static auto ALWAYS_INLINE dispatch(Self & self, KeyHolder && key_holder, Func && func) |
231 | { |
232 | const StringRef & x = keyHolderGetKey(key_holder); |
233 | const size_t sz = x.size; |
234 | if (sz == 0) |
235 | { |
236 | keyHolderDiscardKey(key_holder); |
237 | return func(self.m0, VoidKey{}, 0); |
238 | } |
239 | |
240 | const char * p = x.data; |
241 | // pending bits that needs to be shifted out |
242 | const char s = (-sz & 7) * 8; |
243 | union |
244 | { |
245 | StringKey8 k8; |
246 | StringKey16 k16; |
247 | StringKey24 k24; |
248 | UInt64 n[3]; |
249 | }; |
250 | StringHashTableHash hash; |
251 | switch ((sz - 1) >> 3) |
252 | { |
253 | case 0: // 1..8 bytes |
254 | { |
255 | // first half page |
256 | if ((reinterpret_cast<uintptr_t>(p) & 2048) == 0) |
257 | { |
258 | memcpy(&n[0], p, 8); |
259 | n[0] &= -1ul >> s; |
260 | } |
261 | else |
262 | { |
263 | const char * lp = x.data + x.size - 8; |
264 | memcpy(&n[0], lp, 8); |
265 | n[0] >>= s; |
266 | } |
267 | keyHolderDiscardKey(key_holder); |
268 | return func(self.m1, k8, hash(k8)); |
269 | } |
270 | case 1: // 9..16 bytes |
271 | { |
272 | memcpy(&n[0], p, 8); |
273 | const char * lp = x.data + x.size - 8; |
274 | memcpy(&n[1], lp, 8); |
275 | n[1] >>= s; |
276 | keyHolderDiscardKey(key_holder); |
277 | return func(self.m2, k16, hash(k16)); |
278 | } |
279 | case 2: // 17..24 bytes |
280 | { |
281 | memcpy(&n[0], p, 16); |
282 | const char * lp = x.data + x.size - 8; |
283 | memcpy(&n[2], lp, 8); |
284 | n[2] >>= s; |
285 | keyHolderDiscardKey(key_holder); |
286 | return func(self.m3, k24, hash(k24)); |
287 | } |
288 | default: // >= 25 bytes |
289 | { |
290 | return func(self.ms, std::forward<KeyHolder>(key_holder), hash(x)); |
291 | } |
292 | } |
293 | } |
294 | |
295 | struct EmplaceCallable |
296 | { |
297 | LookupResult & mapped; |
298 | bool & inserted; |
299 | |
300 | EmplaceCallable(LookupResult & mapped_, bool & inserted_) |
301 | : mapped(mapped_), inserted(inserted_) {} |
302 | |
303 | template <typename Map, typename KeyHolder> |
304 | void ALWAYS_INLINE operator()(Map & map, KeyHolder && key_holder, size_t hash) |
305 | { |
306 | typename Map::LookupResult result; |
307 | map.emplace(key_holder, result, inserted, hash); |
308 | mapped = &result->getMapped(); |
309 | } |
310 | }; |
311 | |
312 | template <typename KeyHolder> |
313 | void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it, bool & inserted) |
314 | { |
315 | this->dispatch(*this, key_holder, EmplaceCallable(it, inserted)); |
316 | } |
317 | |
318 | struct FindCallable |
319 | { |
320 | // find() doesn't need any key memory management, so we don't work with |
321 | // any key holders here, only with normal keys. The key type is still |
322 | // different for every subtable, this is why it is a template parameter. |
323 | template <typename Submap, typename SubmapKey> |
324 | auto ALWAYS_INLINE operator()(Submap & map, const SubmapKey & key, size_t hash) |
325 | { |
326 | return &map.find(key, hash)->getMapped(); |
327 | } |
328 | }; |
329 | |
330 | LookupResult ALWAYS_INLINE find(const Key & x) |
331 | { |
332 | return dispatch(*this, x, FindCallable{}); |
333 | } |
334 | |
335 | ConstLookupResult ALWAYS_INLINE find(const Key & x) const |
336 | { |
337 | return dispatch(*this, x, FindCallable{}); |
338 | } |
339 | |
340 | bool ALWAYS_INLINE has(const Key & x, size_t = 0) const |
341 | { |
342 | return dispatch(*this, x, FindCallable{}) != nullptr; |
343 | } |
344 | |
345 | void write(DB::WriteBuffer & wb) const |
346 | { |
347 | m0.write(wb); |
348 | m1.write(wb); |
349 | m2.write(wb); |
350 | m3.write(wb); |
351 | ms.write(wb); |
352 | } |
353 | |
354 | void writeText(DB::WriteBuffer & wb) const |
355 | { |
356 | m0.writeText(wb); |
357 | DB::writeChar(',', wb); |
358 | m1.writeText(wb); |
359 | DB::writeChar(',', wb); |
360 | m2.writeText(wb); |
361 | DB::writeChar(',', wb); |
362 | m3.writeText(wb); |
363 | DB::writeChar(',', wb); |
364 | ms.writeText(wb); |
365 | } |
366 | |
367 | void read(DB::ReadBuffer & rb) |
368 | { |
369 | m0.read(rb); |
370 | m1.read(rb); |
371 | m2.read(rb); |
372 | m3.read(rb); |
373 | ms.read(rb); |
374 | } |
375 | |
376 | void readText(DB::ReadBuffer & rb) |
377 | { |
378 | m0.readText(rb); |
379 | DB::assertChar(',', rb); |
380 | m1.readText(rb); |
381 | DB::assertChar(',', rb); |
382 | m2.readText(rb); |
383 | DB::assertChar(',', rb); |
384 | m3.readText(rb); |
385 | DB::assertChar(',', rb); |
386 | ms.readText(rb); |
387 | } |
388 | |
389 | size_t size() const { return m0.size() + m1.size() + m2.size() + m3.size() + ms.size(); } |
390 | |
391 | bool empty() const { return m0.empty() && m1.empty() && m2.empty() && m3.empty() && ms.empty(); } |
392 | |
393 | size_t getBufferSizeInBytes() const |
394 | { |
395 | return m0.getBufferSizeInBytes() + m1.getBufferSizeInBytes() + m2.getBufferSizeInBytes() + m3.getBufferSizeInBytes() |
396 | + ms.getBufferSizeInBytes(); |
397 | } |
398 | |
399 | void clearAndShrink() |
400 | { |
401 | m1.clearHasZero(); |
402 | m1.clearAndShrink(); |
403 | m2.clearAndShrink(); |
404 | m3.clearAndShrink(); |
405 | ms.clearAndShrink(); |
406 | } |
407 | }; |
408 | |