#pragma once

#include <Common/HashTable/StringHashTable.h>

template <typename SubMaps, typename ImplTable = StringHashTable<SubMaps>, size_t BITS_FOR_BUCKET = 8>
class TwoLevelStringHashTable : private boost::noncopyable
{
protected:
    using HashValue = size_t;
    using Self = TwoLevelStringHashTable;

public:
    using Key = StringRef;
    using Impl = ImplTable;

    static constexpr size_t NUM_BUCKETS = 1ULL << BITS_FOR_BUCKET;
    static constexpr size_t MAX_BUCKET = NUM_BUCKETS - 1;

    // TODO: currently hashing contains redundant computations when doing distributed or external aggregations
    size_t hash(const Key & x) const
    {
        return const_cast<Self &>(*this).dispatch(*this, x, [&](const auto &, const auto &, size_t hash) { return hash; });
    }

    size_t operator()(const Key & x) const { return hash(x); }

    /// NOTE Bad for hash tables with more than 2^32 cells.
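    /// With the default BITS_FOR_BUCKET = 8 this uses bits 24..31 of the hash as the bucket
    /// index (256 buckets), leaving the lower bits for cell placement inside each bucket's table.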
    static size_t getBucketFromHash(size_t hash_value) { return (hash_value >> (32 - BITS_FOR_BUCKET)) & MAX_BUCKET; }

public:
    using key_type = typename Impl::key_type;
    using mapped_type = typename Impl::mapped_type;
    using value_type = typename Impl::value_type;
    using cell_type = typename Impl::cell_type;

    using LookupResult = typename Impl::LookupResult;
    using ConstLookupResult = typename Impl::ConstLookupResult;

    Impl impls[NUM_BUCKETS];

    TwoLevelStringHashTable() = default;

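    /// Converts a single-level StringHashTable into a two-level one: the zero-key state is
    /// copied from src.m0, and every non-zero cell of the fixed-size and generic submaps is
    /// reinserted into the bucket chosen from its saved hash.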
    template <typename Source>
    TwoLevelStringHashTable(const Source & src)
    {
        if (src.m0.hasZero())
            impls[0].m0.setHasZero(*src.m0.zeroValue());

        for (auto & v : src.m1)
        {
            size_t hash_value = v.getHash(src.m1);
            size_t buck = getBucketFromHash(hash_value);
            impls[buck].m1.insertUniqueNonZero(&v, hash_value);
        }
        for (auto & v : src.m2)
        {
            size_t hash_value = v.getHash(src.m2);
            size_t buck = getBucketFromHash(hash_value);
            impls[buck].m2.insertUniqueNonZero(&v, hash_value);
        }
        for (auto & v : src.m3)
        {
            size_t hash_value = v.getHash(src.m3);
            size_t buck = getBucketFromHash(hash_value);
            impls[buck].m3.insertUniqueNonZero(&v, hash_value);
        }
        for (auto & v : src.ms)
        {
            size_t hash_value = v.getHash(src.ms);
            size_t buck = getBucketFromHash(hash_value);
            impls[buck].ms.insertUniqueNonZero(&v, hash_value);
        }
    }

    // This function is mostly the same as StringHashTable::dispatch, but with
    // added bucket computation. See the comments there.
    template <typename Self, typename Func, typename KeyHolder>
    static auto ALWAYS_INLINE dispatch(Self & self, KeyHolder && key_holder, Func && func)
    {
        const StringRef & x = keyHolderGetKey(key_holder);
        const size_t sz = x.size;
        if (sz == 0)
        {
            keyHolderDiscardKey(key_holder);
            return func(self.impls[0].m0, VoidKey{}, 0);
        }

        const char * p = x.data;
        // Number of padding bits that need to be shifted or masked out:
        // (-sz & 7) is how many bytes of the last 8-byte chunk lie past the end of the key.
        const char s = (-sz & 7) * 8;
        union
        {
            StringKey8 k8;
            StringKey16 k16;
            StringKey24 k24;
            UInt64 n[3];
        };
        StringHashTableHash hash;
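        // Pick the submap by key length: 1..8 bytes -> m1 (StringKey8), 9..16 -> m2 (StringKey16),
        // 17..24 -> m3 (StringKey24), anything longer -> the generic map ms. In every case the
        // bucket index is derived from the key's hash.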
        switch ((sz - 1) >> 3)
        {
            case 0:
            {
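                // Reading 8 bytes from p may overrun the key. That is safe as long as the read
                // stays within the current 4096-byte page: if p lies in the first half of a page,
                // read forward and mask off the excess high bytes; otherwise read the 8 bytes
                // ending at the key's last byte and shift the excess low bytes out
                // (the masking/shifting assumes a little-endian layout).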
                // first half page
                if ((reinterpret_cast<uintptr_t>(p) & 2048) == 0)
                {
                    memcpy(&n[0], p, 8);
                    n[0] &= -1ULL >> s;
                }
                else
                {
                    const char * lp = x.data + x.size - 8;
                    memcpy(&n[0], lp, 8);
                    n[0] >>= s;
                }
                auto res = hash(k8);
                auto buck = getBucketFromHash(res);
                keyHolderDiscardKey(key_holder);
                return func(self.impls[buck].m1, k8, res);
            }
            case 1:
            {
                memcpy(&n[0], p, 8);
                const char * lp = x.data + x.size - 8;
                memcpy(&n[1], lp, 8);
                n[1] >>= s;
                auto res = hash(k16);
                auto buck = getBucketFromHash(res);
                keyHolderDiscardKey(key_holder);
                return func(self.impls[buck].m2, k16, res);
            }
            case 2:
            {
                memcpy(&n[0], p, 16);
                const char * lp = x.data + x.size - 8;
                memcpy(&n[2], lp, 8);
                n[2] >>= s;
                auto res = hash(k24);
                auto buck = getBucketFromHash(res);
                keyHolderDiscardKey(key_holder);
                return func(self.impls[buck].m3, k24, res);
            }
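            // Keys longer than 24 bytes go to the generic table. The key holder is forwarded
            // rather than discarded, because the generic table may need to persist the key on insert.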
            default:
            {
                auto res = hash(x);
                auto buck = getBucketFromHash(res);
                return func(self.impls[buck].ms, std::forward<KeyHolder>(key_holder), res);
            }
        }
    }

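    /// emplace() and find() route the key through dispatch() above, so every lookup lands in the
    /// correct submap of the correct bucket, using the same callables as StringHashTable.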
    template <typename KeyHolder>
    void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it, bool & inserted)
    {
        dispatch(*this, key_holder, typename Impl::EmplaceCallable{it, inserted});
    }

    LookupResult ALWAYS_INLINE find(const Key x)
    {
        return dispatch(*this, x, typename Impl::FindCallable{});
    }

    ConstLookupResult ALWAYS_INLINE find(const Key x) const
    {
        return dispatch(*this, x, typename Impl::FindCallable{});
    }

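    /// Serialization writes the buckets in index order; read()/readText() consume them in the same order.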
    void write(DB::WriteBuffer & wb) const
    {
        for (size_t i = 0; i < NUM_BUCKETS; ++i)
            impls[i].write(wb);
    }

    void writeText(DB::WriteBuffer & wb) const
    {
        for (size_t i = 0; i < NUM_BUCKETS; ++i)
        {
            if (i != 0)
                DB::writeChar(',', wb);
            impls[i].writeText(wb);
        }
    }

    void read(DB::ReadBuffer & rb)
    {
        for (size_t i = 0; i < NUM_BUCKETS; ++i)
            impls[i].read(rb);
    }

    void readText(DB::ReadBuffer & rb)
    {
        for (size_t i = 0; i < NUM_BUCKETS; ++i)
        {
            if (i != 0)
                DB::assertChar(',', rb);
            impls[i].readText(rb);
        }
    }

    size_t size() const
    {
        size_t res = 0;
        for (size_t i = 0; i < NUM_BUCKETS; ++i)
            res += impls[i].size();

        return res;
    }

    bool empty() const
    {
        for (size_t i = 0; i < NUM_BUCKETS; ++i)
            if (!impls[i].empty())
                return false;

        return true;
    }

    size_t getBufferSizeInBytes() const
    {
        size_t res = 0;
        for (size_t i = 0; i < NUM_BUCKETS; ++i)
            res += impls[i].getBufferSizeInBytes();

        return res;
    }
};