1 | #pragma once |
2 | |
3 | #include <Interpreters/AggregationCommon.h> |
4 | #include <Common/ColumnsHashing.h> |
5 | #include <Common/assert_cast.h> |
6 | #include <Common/Arena.h> |
7 | #include <Common/HashTable/HashSet.h> |
8 | #include <Common/HashTable/ClearableHashSet.h> |
9 | #include <Common/HashTable/FixedClearableHashSet.h> |
10 | #include <Common/HashTable/FixedHashSet.h> |
11 | #include <Common/UInt128.h> |
12 | |
13 | |
14 | namespace DB |
15 | { |
16 | |
17 | /** Methods for different implementations of sets (used in right hand side of IN or for DISTINCT). |
18 | * To use as template parameter. |
19 | */ |
20 | |
21 | |
22 | /// For the case where there is one numeric key. |
23 | template <typename FieldType, typename TData, bool use_cache = true> /// UInt8/16/32/64 for any types with corresponding bit width. |
24 | struct SetMethodOneNumber |
25 | { |
26 | using Data = TData; |
27 | using Key = typename Data::key_type; |
28 | |
29 | Data data; |
30 | |
31 | using State = ColumnsHashing::HashMethodOneNumber<typename Data::value_type, |
32 | void, FieldType, use_cache>; |
33 | }; |
34 | |
35 | /// For the case where there is one string key. |
36 | template <typename TData> |
37 | struct SetMethodString |
38 | { |
39 | using Data = TData; |
40 | using Key = typename Data::key_type; |
41 | |
42 | Data data; |
43 | |
44 | using State = ColumnsHashing::HashMethodString<typename Data::value_type, void, true, false>; |
45 | }; |
46 | |
47 | /// For the case when there is one fixed-length string key. |
48 | template <typename TData> |
49 | struct SetMethodFixedString |
50 | { |
51 | using Data = TData; |
52 | using Key = typename Data::key_type; |
53 | |
54 | Data data; |
55 | |
56 | using State = ColumnsHashing::HashMethodFixedString<typename Data::value_type, void, true, false>; |
57 | }; |
58 | |
59 | namespace set_impl |
60 | { |
61 | |
62 | /// This class is designed to provide the functionality that is required for |
63 | /// supporting nullable keys in SetMethodKeysFixed. If there are |
64 | /// no nullable keys, this class is merely implemented as an empty shell. |
65 | template <typename Key, bool has_nullable_keys> |
66 | class BaseStateKeysFixed; |
67 | |
68 | /// Case where nullable keys are supported. |
69 | template <typename Key> |
70 | class BaseStateKeysFixed<Key, true> |
71 | { |
72 | protected: |
73 | void init(const ColumnRawPtrs & key_columns) |
74 | { |
75 | null_maps.reserve(key_columns.size()); |
76 | actual_columns.reserve(key_columns.size()); |
77 | |
78 | for (const auto & col : key_columns) |
79 | { |
80 | if (auto * nullable = checkAndGetColumn<ColumnNullable>(*col)) |
81 | { |
82 | actual_columns.push_back(&nullable->getNestedColumn()); |
83 | null_maps.push_back(&nullable->getNullMapColumn()); |
84 | } |
85 | else |
86 | { |
87 | actual_columns.push_back(col); |
88 | null_maps.push_back(nullptr); |
89 | } |
90 | } |
91 | } |
92 | |
93 | /// Return the columns which actually contain the values of the keys. |
94 | /// For a given key column, if it is nullable, we return its nested |
95 | /// column. Otherwise we return the key column itself. |
96 | inline const ColumnRawPtrs & getActualColumns() const |
97 | { |
98 | return actual_columns; |
99 | } |
100 | |
101 | /// Create a bitmap that indicates whether, for a particular row, |
102 | /// a key column bears a null value or not. |
103 | KeysNullMap<Key> createBitmap(size_t row) const |
104 | { |
105 | KeysNullMap<Key> bitmap{}; |
106 | |
107 | for (size_t k = 0; k < null_maps.size(); ++k) |
108 | { |
109 | if (null_maps[k] != nullptr) |
110 | { |
111 | const auto & null_map = assert_cast<const ColumnUInt8 &>(*null_maps[k]).getData(); |
112 | if (null_map[row] == 1) |
113 | { |
114 | size_t bucket = k / 8; |
115 | size_t offset = k % 8; |
116 | bitmap[bucket] |= UInt8(1) << offset; |
117 | } |
118 | } |
119 | } |
120 | |
121 | return bitmap; |
122 | } |
123 | |
124 | private: |
125 | ColumnRawPtrs actual_columns; |
126 | ColumnRawPtrs null_maps; |
127 | }; |
128 | |
129 | /// Case where nullable keys are not supported. |
130 | template <typename Key> |
131 | class BaseStateKeysFixed<Key, false> |
132 | { |
133 | protected: |
134 | void init(const ColumnRawPtrs &) |
135 | { |
136 | throw Exception{"Internal error: calling init() for non-nullable" |
137 | " keys is forbidden" , ErrorCodes::LOGICAL_ERROR}; |
138 | } |
139 | |
140 | const ColumnRawPtrs & getActualColumns() const |
141 | { |
142 | throw Exception{"Internal error: calling getActualColumns() for non-nullable" |
143 | " keys is forbidden" , ErrorCodes::LOGICAL_ERROR}; |
144 | } |
145 | |
146 | KeysNullMap<Key> createBitmap(size_t) const |
147 | { |
148 | throw Exception{"Internal error: calling createBitmap() for non-nullable keys" |
149 | " is forbidden" , ErrorCodes::LOGICAL_ERROR}; |
150 | } |
151 | }; |
152 | |
153 | } |
154 | |
155 | /// For the case when all keys are of fixed length, and they fit in N (for example, 128) bits. |
156 | template <typename TData, bool has_nullable_keys_ = false> |
157 | struct SetMethodKeysFixed |
158 | { |
159 | using Data = TData; |
160 | using Key = typename Data::key_type; |
161 | static constexpr bool has_nullable_keys = has_nullable_keys_; |
162 | |
163 | Data data; |
164 | |
165 | using State = ColumnsHashing::HashMethodKeysFixed<typename Data::value_type, Key, void, has_nullable_keys, false>; |
166 | }; |
167 | |
168 | /// For other cases. 128 bit hash from the key. |
169 | template <typename TData> |
170 | struct SetMethodHashed |
171 | { |
172 | using Data = TData; |
173 | using Key = typename Data::key_type; |
174 | |
175 | Data data; |
176 | |
177 | using State = ColumnsHashing::HashMethodHashed<typename Data::value_type, void>; |
178 | }; |
179 | |
180 | |
181 | /** Different implementations of the set. |
182 | */ |
183 | struct NonClearableSet |
184 | { |
185 | /* |
186 | * As in Aggregator, using consecutive keys cache doesn't improve performance |
187 | * for FixedHashTables. |
188 | */ |
189 | std::unique_ptr<SetMethodOneNumber<UInt8, FixedHashSet<UInt8>, false /* use_cache */>> key8; |
190 | std::unique_ptr<SetMethodOneNumber<UInt16, FixedHashSet<UInt16>, false /* use_cache */>> key16; |
191 | |
192 | /** Also for the experiment was tested the ability to use SmallSet, |
193 | * as long as the number of elements in the set is small (and, if necessary, converted to a full-fledged HashSet). |
194 | * But this experiment showed that there is an advantage only in rare cases. |
195 | */ |
196 | std::unique_ptr<SetMethodOneNumber<UInt32, HashSet<UInt32, HashCRC32<UInt32>>>> key32; |
197 | std::unique_ptr<SetMethodOneNumber<UInt64, HashSet<UInt64, HashCRC32<UInt64>>>> key64; |
198 | std::unique_ptr<SetMethodString<HashSetWithSavedHash<StringRef>>> key_string; |
199 | std::unique_ptr<SetMethodFixedString<HashSetWithSavedHash<StringRef>>> key_fixed_string; |
200 | std::unique_ptr<SetMethodKeysFixed<HashSet<UInt128, UInt128HashCRC32>>> keys128; |
201 | std::unique_ptr<SetMethodKeysFixed<HashSet<UInt256, UInt256HashCRC32>>> keys256; |
202 | std::unique_ptr<SetMethodHashed<HashSet<UInt128, UInt128TrivialHash>>> hashed; |
203 | |
204 | /// Support for nullable keys (for DISTINCT implementation). |
205 | std::unique_ptr<SetMethodKeysFixed<HashSet<UInt128, UInt128HashCRC32>, true>> nullable_keys128; |
206 | std::unique_ptr<SetMethodKeysFixed<HashSet<UInt256, UInt256HashCRC32>, true>> nullable_keys256; |
207 | /** Unlike Aggregator, `concat` method is not used here. |
208 | * This is done because `hashed` method, although slower, but in this case, uses less RAM. |
209 | * since when you use it, the key values themselves are not stored. |
210 | */ |
211 | }; |
212 | |
213 | struct ClearableSet |
214 | { |
215 | std::unique_ptr<SetMethodOneNumber<UInt8, FixedClearableHashSet<UInt8>, false /* use_cache */>> key8; |
216 | std::unique_ptr<SetMethodOneNumber<UInt16, FixedClearableHashSet<UInt16>, false /*use_cache */>> key16; |
217 | |
218 | std::unique_ptr<SetMethodOneNumber<UInt32, ClearableHashSet<UInt32, HashCRC32<UInt32>>>> key32; |
219 | std::unique_ptr<SetMethodOneNumber<UInt64, ClearableHashSet<UInt64, HashCRC32<UInt64>>>> key64; |
220 | std::unique_ptr<SetMethodString<ClearableHashSetWithSavedHash<StringRef>>> key_string; |
221 | std::unique_ptr<SetMethodFixedString<ClearableHashSetWithSavedHash<StringRef>>> key_fixed_string; |
222 | std::unique_ptr<SetMethodKeysFixed<ClearableHashSet<UInt128, UInt128HashCRC32>>> keys128; |
223 | std::unique_ptr<SetMethodKeysFixed<ClearableHashSet<UInt256, UInt256HashCRC32>>> keys256; |
224 | std::unique_ptr<SetMethodHashed<ClearableHashSet<UInt128, UInt128TrivialHash>>> hashed; |
225 | |
226 | /// Support for nullable keys (for DISTINCT implementation). |
227 | std::unique_ptr<SetMethodKeysFixed<ClearableHashSet<UInt128, UInt128HashCRC32>, true>> nullable_keys128; |
228 | std::unique_ptr<SetMethodKeysFixed<ClearableHashSet<UInt256, UInt256HashCRC32>, true>> nullable_keys256; |
229 | /** Unlike Aggregator, `concat` method is not used here. |
230 | * This is done because `hashed` method, although slower, but in this case, uses less RAM. |
231 | * since when you use it, the key values themselves are not stored. |
232 | */ |
233 | }; |
234 | |
235 | template <typename Variant> |
236 | struct SetVariantsTemplate: public Variant |
237 | { |
238 | Arena string_pool; |
239 | |
240 | #define APPLY_FOR_SET_VARIANTS(M) \ |
241 | M(key8) \ |
242 | M(key16) \ |
243 | M(key32) \ |
244 | M(key64) \ |
245 | M(key_string) \ |
246 | M(key_fixed_string) \ |
247 | M(keys128) \ |
248 | M(keys256) \ |
249 | M(nullable_keys128) \ |
250 | M(nullable_keys256) \ |
251 | M(hashed) |
252 | |
253 | #define M(NAME) using Variant::NAME; |
254 | APPLY_FOR_SET_VARIANTS(M) |
255 | #undef M |
256 | |
257 | enum class Type |
258 | { |
259 | EMPTY, |
260 | |
261 | #define M(NAME) NAME, |
262 | APPLY_FOR_SET_VARIANTS(M) |
263 | #undef M |
264 | }; |
265 | |
266 | Type type = Type::EMPTY; |
267 | |
268 | bool empty() const { return type == Type::EMPTY; } |
269 | |
270 | static Type chooseMethod(const ColumnRawPtrs & key_columns, Sizes & key_sizes); |
271 | |
272 | void init(Type type_); |
273 | |
274 | size_t getTotalRowCount() const; |
275 | /// Counts the size in bytes of the Set buffer and the size of the `string_pool` |
276 | size_t getTotalByteCount() const; |
277 | }; |
278 | |
279 | using SetVariants = SetVariantsTemplate<NonClearableSet>; |
280 | using ClearableSetVariants = SetVariantsTemplate<ClearableSet>; |
281 | |
282 | } |
283 | |