1 | #include <Functions/IFunctionImpl.h> |
2 | #include <Functions/FunctionHelpers.h> |
3 | #include <DataTypes/DataTypeArray.h> |
4 | #include <DataTypes/DataTypesNumber.h> |
5 | #include <Columns/ColumnArray.h> |
6 | #include <Columns/ColumnNullable.h> |
7 | #include <Columns/ColumnsNumber.h> |
8 | #include <Columns/ColumnString.h> |
9 | #include <Interpreters/AggregationCommon.h> |
10 | #include <Common/HashTable/ClearableHashMap.h> |
11 | #include <Common/ColumnsHashing.h> |
12 | |
13 | |
14 | namespace DB |
15 | { |
16 | |
17 | namespace ErrorCodes |
18 | { |
19 | extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; |
20 | extern const int ILLEGAL_COLUMN; |
21 | extern const int ILLEGAL_TYPE_OF_ARGUMENT; |
22 | extern const int SIZES_OF_ARRAYS_DOESNT_MATCH; |
23 | } |
24 | |
25 | class FunctionArrayEnumerateUniq; |
26 | class FunctionArrayEnumerateDense; |
27 | |
28 | template <typename Derived> |
29 | class FunctionArrayEnumerateExtended : public IFunction |
30 | { |
31 | public: |
32 | static FunctionPtr create(const Context &) { return std::make_shared<Derived>(); } |
33 | |
34 | String getName() const override { return Derived::name; } |
35 | |
36 | bool isVariadic() const override { return true; } |
37 | size_t getNumberOfArguments() const override { return 0; } |
38 | bool useDefaultImplementationForConstants() const override { return true; } |
39 | |
40 | DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override |
41 | { |
42 | if (arguments.size() == 0) |
43 | throw Exception("Number of arguments for function " + getName() + " doesn't match: passed " |
44 | + toString(arguments.size()) + ", should be at least 1." , |
45 | ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); |
46 | |
47 | for (size_t i = 0; i < arguments.size(); ++i) |
48 | { |
49 | const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[i].get()); |
50 | if (!array_type) |
51 | throw Exception("All arguments for function " + getName() + " must be arrays but argument " + |
52 | toString(i + 1) + " has type " + arguments[i]->getName() + "." , ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
53 | } |
54 | |
55 | return std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt32>()); |
56 | } |
57 | |
58 | void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override; |
59 | |
60 | private: |
61 | /// Initially allocate a piece of memory for 512 elements. NOTE: This is just a guess. |
62 | static constexpr size_t INITIAL_SIZE_DEGREE = 9; |
63 | |
64 | template <typename T> |
65 | struct MethodOneNumber |
66 | { |
67 | using Set = ClearableHashMap<T, UInt32, DefaultHash<T>, HashTableGrower<INITIAL_SIZE_DEGREE>, |
68 | HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(T)>>; |
69 | using Method = ColumnsHashing::HashMethodOneNumber<typename Set::value_type, UInt32, T, false>; |
70 | }; |
71 | |
72 | struct MethodString |
73 | { |
74 | using Set = ClearableHashMap<StringRef, UInt32, StringRefHash, HashTableGrower<INITIAL_SIZE_DEGREE>, |
75 | HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(StringRef)>>; |
76 | using Method = ColumnsHashing::HashMethodString<typename Set::value_type, UInt32, false, false>; |
77 | }; |
78 | |
79 | struct MethodFixedString |
80 | { |
81 | using Set = ClearableHashMap<StringRef, UInt32, StringRefHash, HashTableGrower<INITIAL_SIZE_DEGREE>, |
82 | HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(StringRef)>>; |
83 | using Method = ColumnsHashing::HashMethodFixedString<typename Set::value_type, UInt32, false, false>; |
84 | }; |
85 | |
86 | struct MethodFixed |
87 | { |
88 | using Set = ClearableHashMap<UInt128, UInt32, UInt128HashCRC32, HashTableGrower<INITIAL_SIZE_DEGREE>, |
89 | HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(UInt128)>>; |
90 | using Method = ColumnsHashing::HashMethodKeysFixed<typename Set::value_type, UInt128, UInt32, false, false, false>; |
91 | }; |
92 | |
93 | struct MethodHashed |
94 | { |
95 | using Set = ClearableHashMap<UInt128, UInt32, UInt128TrivialHash, HashTableGrower<INITIAL_SIZE_DEGREE>, |
96 | HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(UInt128)>>; |
97 | using Method = ColumnsHashing::HashMethodHashed<typename Set::value_type, UInt32, false>; |
98 | }; |
99 | |
100 | template <typename Method> |
101 | void executeMethod(const ColumnArray::Offsets & offsets, const ColumnRawPtrs & columns, const Sizes & key_sizes, |
102 | const NullMap * null_map, ColumnUInt32::Container & res_values); |
103 | |
104 | template <typename Method, bool has_null_map> |
105 | void executeMethodImpl(const ColumnArray::Offsets & offsets, const ColumnRawPtrs & columns, const Sizes & key_sizes, |
106 | const NullMap * null_map, ColumnUInt32::Container & res_values); |
107 | |
108 | template <typename T> |
109 | bool executeNumber(const ColumnArray::Offsets & offsets, const IColumn & data, const NullMap * null_map, ColumnUInt32::Container & res_values); |
110 | bool executeString(const ColumnArray::Offsets & offsets, const IColumn & data, const NullMap * null_map, ColumnUInt32::Container & res_values); |
111 | bool executeFixedString(const ColumnArray::Offsets & offsets, const IColumn & data, const NullMap * null_map, ColumnUInt32::Container & res_values); |
112 | bool execute128bit(const ColumnArray::Offsets & offsets, const ColumnRawPtrs & columns, ColumnUInt32::Container & res_values); |
113 | void executeHashed(const ColumnArray::Offsets & offsets, const ColumnRawPtrs & columns, ColumnUInt32::Container & res_values); |
114 | }; |
115 | |
116 | |
117 | template <typename Derived> |
118 | void FunctionArrayEnumerateExtended<Derived>::executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) |
119 | { |
120 | const ColumnArray::Offsets * offsets = nullptr; |
121 | size_t num_arguments = arguments.size(); |
122 | ColumnRawPtrs data_columns(num_arguments); |
123 | |
124 | Columns array_holders; |
125 | ColumnPtr offsets_column; |
126 | for (size_t i = 0; i < num_arguments; ++i) |
127 | { |
128 | const ColumnPtr & array_ptr = block.getByPosition(arguments[i]).column; |
129 | const ColumnArray * array = checkAndGetColumn<ColumnArray>(array_ptr.get()); |
130 | if (!array) |
131 | { |
132 | const ColumnConst * const_array = checkAndGetColumnConst<ColumnArray>( |
133 | block.getByPosition(arguments[i]).column.get()); |
134 | if (!const_array) |
135 | throw Exception("Illegal column " + block.getByPosition(arguments[i]).column->getName() |
136 | + " of " + toString(i + 1) + "-th argument of function " + getName(), |
137 | ErrorCodes::ILLEGAL_COLUMN); |
138 | array_holders.emplace_back(const_array->convertToFullColumn()); |
139 | array = checkAndGetColumn<ColumnArray>(array_holders.back().get()); |
140 | } |
141 | |
142 | const ColumnArray::Offsets & offsets_i = array->getOffsets(); |
143 | if (i == 0) |
144 | { |
145 | offsets = &offsets_i; |
146 | offsets_column = array->getOffsetsPtr(); |
147 | } |
148 | else if (offsets_i != *offsets) |
149 | throw Exception("Lengths of all arrays passed to " + getName() + " must be equal." , |
150 | ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH); |
151 | |
152 | auto * array_data = &array->getData(); |
153 | data_columns[i] = array_data; |
154 | } |
155 | |
156 | const NullMap * null_map = nullptr; |
157 | |
158 | for (size_t i = 0; i < num_arguments; ++i) |
159 | { |
160 | if (auto * nullable_col = checkAndGetColumn<ColumnNullable>(*data_columns[i])) |
161 | { |
162 | if (num_arguments == 1) |
163 | data_columns[i] = &nullable_col->getNestedColumn(); |
164 | |
165 | null_map = &nullable_col->getNullMapData(); |
166 | break; |
167 | } |
168 | } |
169 | |
170 | auto res_nested = ColumnUInt32::create(); |
171 | |
172 | ColumnUInt32::Container & res_values = res_nested->getData(); |
173 | if (!offsets->empty()) |
174 | res_values.resize(offsets->back()); |
175 | |
176 | if (num_arguments == 1) |
177 | { |
178 | if (!(executeNumber<UInt8>(*offsets, *data_columns[0], null_map, res_values) |
179 | || executeNumber<UInt16>(*offsets, *data_columns[0], null_map, res_values) |
180 | || executeNumber<UInt32>(*offsets, *data_columns[0], null_map, res_values) |
181 | || executeNumber<UInt64>(*offsets, *data_columns[0], null_map, res_values) |
182 | || executeNumber<Int8>(*offsets, *data_columns[0], null_map, res_values) |
183 | || executeNumber<Int16>(*offsets, *data_columns[0], null_map, res_values) |
184 | || executeNumber<Int32>(*offsets, *data_columns[0], null_map, res_values) |
185 | || executeNumber<Int64>(*offsets, *data_columns[0], null_map, res_values) |
186 | || executeNumber<Float32>(*offsets, *data_columns[0], null_map, res_values) |
187 | || executeNumber<Float64>(*offsets, *data_columns[0], null_map, res_values) |
188 | || executeString(*offsets, *data_columns[0], null_map, res_values) |
189 | || executeFixedString(*offsets, *data_columns[0], null_map, res_values))) |
190 | executeHashed(*offsets, data_columns, res_values); |
191 | } |
192 | else |
193 | { |
194 | if (!execute128bit(*offsets, data_columns, res_values)) |
195 | executeHashed(*offsets, data_columns, res_values); |
196 | } |
197 | |
198 | block.getByPosition(result).column = ColumnArray::create(std::move(res_nested), offsets_column); |
199 | } |
200 | |
201 | template <typename Derived> |
202 | template <typename Method, bool has_null_map> |
203 | void FunctionArrayEnumerateExtended<Derived>::executeMethodImpl( |
204 | const ColumnArray::Offsets & offsets, |
205 | const ColumnRawPtrs & columns, |
206 | const Sizes & key_sizes, |
207 | [[maybe_unused]] const NullMap * null_map, |
208 | ColumnUInt32::Container & res_values) |
209 | { |
210 | typename Method::Set indices; |
211 | typename Method::Method method(columns, key_sizes, nullptr); |
212 | Arena pool; /// Won't use it; |
213 | |
214 | ColumnArray::Offset prev_off = 0; |
215 | |
216 | if constexpr (std::is_same_v<Derived, FunctionArrayEnumerateUniq>) |
217 | { |
218 | // Unique |
219 | for (size_t off : offsets) |
220 | { |
221 | indices.clear(); |
222 | UInt32 null_count = 0; |
223 | for (size_t j = prev_off; j < off; ++j) |
224 | { |
225 | if constexpr (has_null_map) |
226 | { |
227 | if ((*null_map)[j]) |
228 | { |
229 | res_values[j] = ++null_count; |
230 | continue; |
231 | } |
232 | } |
233 | |
234 | auto emplace_result = method.emplaceKey(indices, j, pool); |
235 | auto idx = emplace_result.getMapped() + 1; |
236 | emplace_result.setMapped(idx); |
237 | |
238 | res_values[j] = idx; |
239 | } |
240 | prev_off = off; |
241 | } |
242 | } |
243 | else |
244 | { |
245 | // Dense |
246 | for (size_t off : offsets) |
247 | { |
248 | indices.clear(); |
249 | UInt32 rank = 0; |
250 | [[maybe_unused]] UInt32 null_index = 0; |
251 | for (size_t j = prev_off; j < off; ++j) |
252 | { |
253 | if constexpr (has_null_map) |
254 | { |
255 | if ((*null_map)[j]) |
256 | { |
257 | if (!null_index) |
258 | null_index = ++rank; |
259 | |
260 | res_values[j] = null_index; |
261 | continue; |
262 | } |
263 | } |
264 | |
265 | auto emplace_result = method.emplaceKey(indices, j, pool); |
266 | auto idx = emplace_result.getMapped(); |
267 | |
268 | if (!idx) |
269 | { |
270 | idx = ++rank; |
271 | emplace_result.setMapped(idx); |
272 | } |
273 | |
274 | res_values[j] = idx; |
275 | } |
276 | prev_off = off; |
277 | } |
278 | } |
279 | } |
280 | |
281 | template <typename Derived> |
282 | template <typename Method> |
283 | void FunctionArrayEnumerateExtended<Derived>::executeMethod( |
284 | const ColumnArray::Offsets & offsets, |
285 | const ColumnRawPtrs & columns, |
286 | const Sizes & key_sizes, |
287 | const NullMap * null_map, |
288 | ColumnUInt32::Container & res_values) |
289 | { |
290 | if (null_map) |
291 | executeMethodImpl<Method, true>(offsets, columns, key_sizes, null_map, res_values); |
292 | else |
293 | executeMethodImpl<Method, false>(offsets, columns, key_sizes, null_map, res_values); |
294 | |
295 | } |
296 | |
297 | template <typename Derived> |
298 | template <typename T> |
299 | bool FunctionArrayEnumerateExtended<Derived>::executeNumber( |
300 | const ColumnArray::Offsets & offsets, const IColumn & data, const NullMap * null_map, ColumnUInt32::Container & res_values) |
301 | { |
302 | const auto * nested = checkAndGetColumn<ColumnVector<T>>(&data); |
303 | if (!nested) |
304 | return false; |
305 | |
306 | executeMethod<MethodOneNumber<T>>(offsets, {nested}, {}, null_map, res_values); |
307 | return true; |
308 | } |
309 | |
310 | template <typename Derived> |
311 | bool FunctionArrayEnumerateExtended<Derived>::executeString( |
312 | const ColumnArray::Offsets & offsets, const IColumn & data, const NullMap * null_map, ColumnUInt32::Container & res_values) |
313 | { |
314 | const auto * nested = checkAndGetColumn<ColumnString>(&data); |
315 | if (nested) |
316 | executeMethod<MethodString>(offsets, {nested}, {}, null_map, res_values); |
317 | |
318 | return nested; |
319 | } |
320 | |
321 | template <typename Derived> |
322 | bool FunctionArrayEnumerateExtended<Derived>::executeFixedString( |
323 | const ColumnArray::Offsets & offsets, const IColumn & data, const NullMap * null_map, ColumnUInt32::Container & res_values) |
324 | { |
325 | const auto * nested = checkAndGetColumn<ColumnString>(&data); |
326 | if (nested) |
327 | executeMethod<MethodFixedString>(offsets, {nested}, {}, null_map, res_values); |
328 | |
329 | return nested; |
330 | } |
331 | |
332 | template <typename Derived> |
333 | bool FunctionArrayEnumerateExtended<Derived>::execute128bit( |
334 | const ColumnArray::Offsets & offsets, |
335 | const ColumnRawPtrs & columns, |
336 | ColumnUInt32::Container & res_values) |
337 | { |
338 | size_t count = columns.size(); |
339 | size_t keys_bytes = 0; |
340 | Sizes key_sizes(count); |
341 | |
342 | for (size_t j = 0; j < count; ++j) |
343 | { |
344 | if (!columns[j]->isFixedAndContiguous()) |
345 | return false; |
346 | key_sizes[j] = columns[j]->sizeOfValueIfFixed(); |
347 | keys_bytes += key_sizes[j]; |
348 | } |
349 | |
350 | executeMethod<MethodFixed>(offsets, columns, key_sizes, nullptr, res_values); |
351 | return true; |
352 | } |
353 | |
354 | template <typename Derived> |
355 | void FunctionArrayEnumerateExtended<Derived>::executeHashed( |
356 | const ColumnArray::Offsets & offsets, |
357 | const ColumnRawPtrs & columns, |
358 | ColumnUInt32::Container & res_values) |
359 | { |
360 | executeMethod<MethodHashed>(offsets, columns, {}, nullptr, res_values); |
361 | } |
362 | |
363 | } |
364 | |