1#pragma once
2
3#include <Common/HashTable/HashMap.h>
4#include <Common/HashTable/HashTable.h>
5
6#include <variant>
7
8using StringKey8 = UInt64;
9using StringKey16 = DB::UInt128;
10struct StringKey24
11{
12 UInt64 a;
13 UInt64 b;
14 UInt64 c;
15
16 bool operator==(const StringKey24 rhs) const { return a == rhs.a && b == rhs.b && c == rhs.c; }
17};
18
19inline StringRef ALWAYS_INLINE toStringRef(const StringKey8 & n)
20{
21 return {reinterpret_cast<const char *>(&n), 8ul - (__builtin_clzll(n) >> 3)};
22}
23inline StringRef ALWAYS_INLINE toStringRef(const StringKey16 & n)
24{
25 return {reinterpret_cast<const char *>(&n), 16ul - (__builtin_clzll(n.high) >> 3)};
26}
27inline StringRef ALWAYS_INLINE toStringRef(const StringKey24 & n)
28{
29 return {reinterpret_cast<const char *>(&n), 24ul - (__builtin_clzll(n.c) >> 3)};
30}
31
32struct StringHashTableHash
33{
34#if defined(__SSE4_2__)
35 size_t ALWAYS_INLINE operator()(StringKey8 key) const
36 {
37 size_t res = -1ULL;
38 res = _mm_crc32_u64(res, key);
39 return res;
40 }
41 size_t ALWAYS_INLINE operator()(StringKey16 key) const
42 {
43 size_t res = -1ULL;
44 res = _mm_crc32_u64(res, key.low);
45 res = _mm_crc32_u64(res, key.high);
46 return res;
47 }
48 size_t ALWAYS_INLINE operator()(StringKey24 key) const
49 {
50 size_t res = -1ULL;
51 res = _mm_crc32_u64(res, key.a);
52 res = _mm_crc32_u64(res, key.b);
53 res = _mm_crc32_u64(res, key.c);
54 return res;
55 }
56#else
57 size_t ALWAYS_INLINE operator()(StringKey8 key) const
58 {
59 return CityHash_v1_0_2::CityHash64(reinterpret_cast<const char *>(&key), 8);
60 }
61 size_t ALWAYS_INLINE operator()(StringKey16 key) const
62 {
63 return CityHash_v1_0_2::CityHash64(reinterpret_cast<const char *>(&key), 16);
64 }
65 size_t ALWAYS_INLINE operator()(StringKey24 key) const
66 {
67 return CityHash_v1_0_2::CityHash64(reinterpret_cast<const char *>(&key), 24);
68 }
69#endif
70 size_t ALWAYS_INLINE operator()(StringRef key) const
71 {
72 return StringRefHash()(key);
73 }
74};
75
76template <typename Cell>
77struct StringHashTableEmpty
78{
79 using Self = StringHashTableEmpty;
80
81 bool has_zero = false;
82 std::aligned_storage_t<sizeof(Cell), alignof(Cell)> zero_value_storage; /// Storage of element with zero key.
83
84public:
85 bool hasZero() const { return has_zero; }
86
87 void setHasZero()
88 {
89 has_zero = true;
90 new (zeroValue()) Cell();
91 }
92
93 void setHasZero(const Cell & other)
94 {
95 has_zero = true;
96 new (zeroValue()) Cell(other);
97 }
98
99 void clearHasZero()
100 {
101 has_zero = false;
102 if (!std::is_trivially_destructible_v<Cell>)
103 zeroValue()->~Cell();
104 }
105
106 Cell * zeroValue() { return reinterpret_cast<Cell *>(&zero_value_storage); }
107 const Cell * zeroValue() const { return reinterpret_cast<const Cell *>(&zero_value_storage); }
108
109 using LookupResult = Cell *;
110 using ConstLookupResult = const Cell *;
111
112 template <typename KeyHolder>
113 void ALWAYS_INLINE emplace(KeyHolder &&, LookupResult & it, bool & inserted, size_t = 0)
114 {
115 if (!hasZero())
116 {
117 setHasZero();
118 inserted = true;
119 }
120 else
121 inserted = false;
122 it = zeroValue();
123 }
124
125 template <typename Key>
126 LookupResult ALWAYS_INLINE find(const Key &, size_t = 0)
127 {
128 return hasZero() ? zeroValue() : nullptr;
129 }
130
131 template <typename Key>
132 ConstLookupResult ALWAYS_INLINE find(const Key &, size_t = 0) const
133 {
134 return hasZero() ? zeroValue() : nullptr;
135 }
136
137 void write(DB::WriteBuffer & wb) const { zeroValue()->write(wb); }
138 void writeText(DB::WriteBuffer & wb) const { zeroValue()->writeText(wb); }
139 void read(DB::ReadBuffer & rb) { zeroValue()->read(rb); }
140 void readText(DB::ReadBuffer & rb) { zeroValue()->readText(rb); }
141 size_t size() const { return hasZero() ? 1 : 0; }
142 bool empty() const { return !hasZero(); }
143 size_t getBufferSizeInBytes() const { return sizeof(Cell); }
144 size_t getCollisions() const { return 0; }
145};
146
147template <size_t initial_size_degree = 8>
148struct StringHashTableGrower : public HashTableGrower<initial_size_degree>
149{
150 // Smooth growing for string maps
151 void increaseSize() { this->size_degree += 1; }
152};
153
154template <typename Mapped>
155struct StringHashTableLookupResult
156{
157 Mapped * mapped_ptr;
158 StringHashTableLookupResult() {}
159 StringHashTableLookupResult(Mapped * mapped_ptr_) : mapped_ptr(mapped_ptr_) {}
160 StringHashTableLookupResult(std::nullptr_t) {}
161 const VoidKey getKey() const { return {}; }
162 auto & getMapped() { return *mapped_ptr; }
163 auto & operator*() { return *this; }
164 auto & operator*() const { return *this; }
165 auto * operator->() { return this; }
166 auto * operator->() const { return this; }
167 operator bool() const { return mapped_ptr; }
168 friend bool operator==(const StringHashTableLookupResult & a, const std::nullptr_t &) { return !a.mapped_ptr; }
169 friend bool operator==(const std::nullptr_t &, const StringHashTableLookupResult & b) { return !b.mapped_ptr; }
170 friend bool operator!=(const StringHashTableLookupResult & a, const std::nullptr_t &) { return a.mapped_ptr; }
171 friend bool operator!=(const std::nullptr_t &, const StringHashTableLookupResult & b) { return b.mapped_ptr; }
172};
173
174template <typename SubMaps>
175class StringHashTable : private boost::noncopyable
176{
177protected:
178 static constexpr size_t NUM_MAPS = 5;
179 // Map for storing empty string
180 using T0 = typename SubMaps::T0;
181
182 // Short strings are stored as numbers
183 using T1 = typename SubMaps::T1;
184 using T2 = typename SubMaps::T2;
185 using T3 = typename SubMaps::T3;
186
187 // Long strings are stored as StringRef along with saved hash
188 using Ts = typename SubMaps::Ts;
189 using Self = StringHashTable;
190
191 template <typename, typename, size_t>
192 friend class TwoLevelStringHashTable;
193
194 T0 m0;
195 T1 m1;
196 T2 m2;
197 T3 m3;
198 Ts ms;
199
200public:
201 using Key = StringRef;
202 using key_type = Key;
203 using mapped_type = typename Ts::mapped_type;
204 using value_type = typename Ts::value_type;
205 using cell_type = typename Ts::cell_type;
206
207 using LookupResult = StringHashTableLookupResult<typename cell_type::mapped_type>;
208 using ConstLookupResult = StringHashTableLookupResult<const typename cell_type::mapped_type>;
209
210 StringHashTable() {}
211
212 StringHashTable(size_t reserve_for_num_elements)
213 : m1{reserve_for_num_elements / 4}
214 , m2{reserve_for_num_elements / 4}
215 , m3{reserve_for_num_elements / 4}
216 , ms{reserve_for_num_elements / 4}
217 {
218 }
219
220 StringHashTable(StringHashTable && rhs) { *this = std::move(rhs); }
221 ~StringHashTable() {}
222
223public:
224 // Dispatch is written in a way that maximizes the performance:
225 // 1. Always memcpy 8 times bytes
226 // 2. Use switch case extension to generate fast dispatching table
227 // 3. Funcs are named callables that can be force_inlined
228 // NOTE: It relies on Little Endianness
229 template <typename Self, typename KeyHolder, typename Func>
230 static auto ALWAYS_INLINE dispatch(Self & self, KeyHolder && key_holder, Func && func)
231 {
232 const StringRef & x = keyHolderGetKey(key_holder);
233 const size_t sz = x.size;
234 if (sz == 0)
235 {
236 keyHolderDiscardKey(key_holder);
237 return func(self.m0, VoidKey{}, 0);
238 }
239
240 const char * p = x.data;
241 // pending bits that needs to be shifted out
242 const char s = (-sz & 7) * 8;
243 union
244 {
245 StringKey8 k8;
246 StringKey16 k16;
247 StringKey24 k24;
248 UInt64 n[3];
249 };
250 StringHashTableHash hash;
251 switch ((sz - 1) >> 3)
252 {
253 case 0: // 1..8 bytes
254 {
255 // first half page
256 if ((reinterpret_cast<uintptr_t>(p) & 2048) == 0)
257 {
258 memcpy(&n[0], p, 8);
259 n[0] &= -1ul >> s;
260 }
261 else
262 {
263 const char * lp = x.data + x.size - 8;
264 memcpy(&n[0], lp, 8);
265 n[0] >>= s;
266 }
267 keyHolderDiscardKey(key_holder);
268 return func(self.m1, k8, hash(k8));
269 }
270 case 1: // 9..16 bytes
271 {
272 memcpy(&n[0], p, 8);
273 const char * lp = x.data + x.size - 8;
274 memcpy(&n[1], lp, 8);
275 n[1] >>= s;
276 keyHolderDiscardKey(key_holder);
277 return func(self.m2, k16, hash(k16));
278 }
279 case 2: // 17..24 bytes
280 {
281 memcpy(&n[0], p, 16);
282 const char * lp = x.data + x.size - 8;
283 memcpy(&n[2], lp, 8);
284 n[2] >>= s;
285 keyHolderDiscardKey(key_holder);
286 return func(self.m3, k24, hash(k24));
287 }
288 default: // >= 25 bytes
289 {
290 return func(self.ms, std::forward<KeyHolder>(key_holder), hash(x));
291 }
292 }
293 }
294
295 struct EmplaceCallable
296 {
297 LookupResult & mapped;
298 bool & inserted;
299
300 EmplaceCallable(LookupResult & mapped_, bool & inserted_)
301 : mapped(mapped_), inserted(inserted_) {}
302
303 template <typename Map, typename KeyHolder>
304 void ALWAYS_INLINE operator()(Map & map, KeyHolder && key_holder, size_t hash)
305 {
306 typename Map::LookupResult result;
307 map.emplace(key_holder, result, inserted, hash);
308 mapped = &result->getMapped();
309 }
310 };
311
312 template <typename KeyHolder>
313 void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it, bool & inserted)
314 {
315 this->dispatch(*this, key_holder, EmplaceCallable(it, inserted));
316 }
317
318 struct FindCallable
319 {
320 // find() doesn't need any key memory management, so we don't work with
321 // any key holders here, only with normal keys. The key type is still
322 // different for every subtable, this is why it is a template parameter.
323 template <typename Submap, typename SubmapKey>
324 auto ALWAYS_INLINE operator()(Submap & map, const SubmapKey & key, size_t hash)
325 {
326 return &map.find(key, hash)->getMapped();
327 }
328 };
329
330 LookupResult ALWAYS_INLINE find(const Key & x)
331 {
332 return dispatch(*this, x, FindCallable{});
333 }
334
335 ConstLookupResult ALWAYS_INLINE find(const Key & x) const
336 {
337 return dispatch(*this, x, FindCallable{});
338 }
339
340 bool ALWAYS_INLINE has(const Key & x, size_t = 0) const
341 {
342 return dispatch(*this, x, FindCallable{}) != nullptr;
343 }
344
345 void write(DB::WriteBuffer & wb) const
346 {
347 m0.write(wb);
348 m1.write(wb);
349 m2.write(wb);
350 m3.write(wb);
351 ms.write(wb);
352 }
353
354 void writeText(DB::WriteBuffer & wb) const
355 {
356 m0.writeText(wb);
357 DB::writeChar(',', wb);
358 m1.writeText(wb);
359 DB::writeChar(',', wb);
360 m2.writeText(wb);
361 DB::writeChar(',', wb);
362 m3.writeText(wb);
363 DB::writeChar(',', wb);
364 ms.writeText(wb);
365 }
366
367 void read(DB::ReadBuffer & rb)
368 {
369 m0.read(rb);
370 m1.read(rb);
371 m2.read(rb);
372 m3.read(rb);
373 ms.read(rb);
374 }
375
376 void readText(DB::ReadBuffer & rb)
377 {
378 m0.readText(rb);
379 DB::assertChar(',', rb);
380 m1.readText(rb);
381 DB::assertChar(',', rb);
382 m2.readText(rb);
383 DB::assertChar(',', rb);
384 m3.readText(rb);
385 DB::assertChar(',', rb);
386 ms.readText(rb);
387 }
388
389 size_t size() const { return m0.size() + m1.size() + m2.size() + m3.size() + ms.size(); }
390
391 bool empty() const { return m0.empty() && m1.empty() && m2.empty() && m3.empty() && ms.empty(); }
392
393 size_t getBufferSizeInBytes() const
394 {
395 return m0.getBufferSizeInBytes() + m1.getBufferSizeInBytes() + m2.getBufferSizeInBytes() + m3.getBufferSizeInBytes()
396 + ms.getBufferSizeInBytes();
397 }
398
399 void clearAndShrink()
400 {
401 m1.clearHasZero();
402 m1.clearAndShrink();
403 m2.clearAndShrink();
404 m3.clearAndShrink();
405 ms.clearAndShrink();
406 }
407};
408