1 | #include <Columns/ColumnArray.h> |
2 | #include <Columns/ColumnNullable.h> |
3 | #include <Columns/ColumnString.h> |
4 | #include <Columns/ColumnsNumber.h> |
5 | #include <DataTypes/DataTypeArray.h> |
6 | #include <DataTypes/DataTypesNumber.h> |
7 | #include <DataTypes/getLeastSupertype.h> |
8 | #include <Functions/FunctionHelpers.h> |
9 | #include <Functions/IFunctionImpl.h> |
10 | #include <Interpreters/AggregationCommon.h> |
11 | #include <Common/ColumnsHashing.h> |
12 | #include <Common/HashTable/ClearableHashMap.h> |
13 | |
14 | // for better debug: #include <Core/iostream_debug_helpers.h> |
15 | |
16 | /** The function will enumerate distinct values of the passed multidimensional arrays looking inside at the specified depths. |
17 | * This is very unusual function made as a special order for Yandex.Metrica. |
18 | * |
19 | * arrayEnumerateUniqRanked(['hello', 'world', 'hello']) = [1, 1, 2] |
20 | * - it returns similar structured array containing number of occurence of the corresponding value. |
21 | * |
22 | * arrayEnumerateUniqRanked([['hello', 'world'], ['hello'], ['hello']], 1) = [1, 1, 2] |
23 | * - look at the depth 1 by default. Elements are ['hello', 'world'], ['hello'], ['hello']. |
24 | * |
25 | * arrayEnumerateUniqRanked([['hello', 'world'], ['hello'], ['hello']]) = [[1,1],[2],[3]] |
26 | * - look at the depth 2. Return similar structured array. |
27 | * arrayEnumerateUniqRanked([['hello', 'world'], ['hello'], ['hello']], 2) = [[1,1],[2],[3]] |
28 | * - look at the maximum depth by default. |
29 | * |
30 | * We may pass multiple array arguments. Their elements will be processed as zipped to tuple. |
31 | * |
32 | * arrayEnumerateUniqRanked(['hello', 'hello', 'world', 'world'], ['a', 'b', 'b', 'b']) = [1, 1, 1, 2] |
33 | * |
34 | * We may provide arrays of different depths to look at different arguments. |
35 | * |
36 | * arrayEnumerateUniqRanked([['hello', 'world'], ['hello'], ['world'], ['world']], ['a', 'b', 'b', 'b']) = [[1,1],[1],[1],[2]] |
37 | * arrayEnumerateUniqRanked([['hello', 'world'], ['hello'], ['world'], ['world']], 1, ['a', 'b', 'b', 'b'], 1) = [1, 1, 1, 2] |
38 | * |
39 | * When depths are different, we process less deep arrays as promoted to deeper arrays of similar structure by duplicating elements. |
40 | * |
41 | * arrayEnumerateUniqRanked( |
42 | * [['hello', 'world'], ['hello'], ['world'], ['world']], |
43 | * ['a', 'b', 'b', 'b']) |
44 | * = arrayEnumerateUniqRanked( |
45 | * [['hello', 'world'], ['hello'], ['world'], ['world']], |
46 | * [['a', 'a'], ['b'], ['b'], ['b']]) |
47 | * |
48 | * Finally, we can provide extra first argument named "clear_depth" (it can be considered as 1 by default). |
49 | * Array elements at the clear_depth will be enumerated as separate elements (enumeration counter is reset for each new element). |
50 | * |
51 | * SELECT arrayEnumerateUniqRanked(1, [['hello', 'world'], ['hello'], ['world'], ['world']]) = [[1,1],[2],[2],[3]] |
52 | * SELECT arrayEnumerateUniqRanked(2, [['hello', 'world'], ['hello'], ['world'], ['world']]) = [[1,1],[1],[1],[1]] |
53 | * SELECT arrayEnumerateUniqRanked(1, [['hello', 'world', 'hello'], ['hello'], ['world'], ['world']]) = [[1,1,2],[3],[2],[3]] |
54 | * SELECT arrayEnumerateUniqRanked(2, [['hello', 'world', 'hello'], ['hello'], ['world'], ['world']]) = [[1,1,2],[1],[1],[1]] |
55 | */ |
56 | |
57 | namespace DB |
58 | { |
59 | namespace ErrorCodes |
60 | { |
61 | extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; |
62 | extern const int ILLEGAL_COLUMN; |
63 | extern const int ILLEGAL_TYPE_OF_ARGUMENT; |
64 | extern const int SIZES_OF_ARRAYS_DOESNT_MATCH; |
65 | } |
66 | |
67 | class FunctionArrayEnumerateUniqRanked; |
68 | class FunctionArrayEnumerateDenseRanked; |
69 | |
70 | using DepthType = uint32_t; |
71 | using DepthTypes = std::vector<DepthType>; |
72 | |
73 | struct ArraysDepths |
74 | { |
75 | /// Enumerate elements at the specified level separately. |
76 | DepthType clear_depth; |
77 | |
78 | /// Effective depth is the array depth by default or lower value, specified as a constant argument following the array. |
79 | /// f([[1, 2], [3]]) - effective depth is 2. |
80 | /// f([[1, 2], [3]], 1) - effective depth is 1. |
81 | DepthTypes depths; |
82 | |
83 | /// Maximum effective depth. |
84 | DepthType max_array_depth; |
85 | }; |
86 | |
87 | /// Return depth info about passed arrays |
88 | ArraysDepths getArraysDepths(const ColumnsWithTypeAndName & arguments); |
89 | |
90 | template <typename Derived> |
91 | class FunctionArrayEnumerateRankedExtended : public IFunction |
92 | { |
93 | public: |
94 | static FunctionPtr create(const Context & /* context */) { return std::make_shared<Derived>(); } |
95 | |
96 | String getName() const override { return Derived::name; } |
97 | |
98 | bool isVariadic() const override { return true; } |
99 | size_t getNumberOfArguments() const override { return 0; } |
100 | |
101 | DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override |
102 | { |
103 | if (arguments.size() == 0) |
104 | throw Exception( |
105 | "Number of arguments for function " + getName() + " doesn't match: passed " + std::to_string(arguments.size()) |
106 | + ", should be at least 1." , |
107 | ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); |
108 | |
109 | const ArraysDepths arrays_depths = getArraysDepths(arguments); |
110 | |
111 | /// Return type is the array of the depth as the maximum effective depth of arguments, containing UInt32. |
112 | |
113 | DataTypePtr type = std::make_shared<DataTypeUInt32>(); |
114 | for (DepthType i = 0; i < arrays_depths.max_array_depth; ++i) |
115 | type = std::make_shared<DataTypeArray>(type); |
116 | |
117 | return type; |
118 | } |
119 | |
120 | void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override; |
121 | |
122 | private: |
123 | /// Initially allocate a piece of memory for 512 elements. NOTE: This is just a guess. |
124 | static constexpr size_t INITIAL_SIZE_DEGREE = 9; |
125 | |
126 | void executeMethodImpl( |
127 | const std::vector<const ColumnArray::Offsets *> & offsets_by_depth, |
128 | const ColumnRawPtrs & columns, |
129 | const ArraysDepths & arrays_depths, |
130 | ColumnUInt32::Container & res_values); |
131 | }; |
132 | |
133 | |
134 | /// Hash a set of keys into a UInt128 value. |
135 | static inline UInt128 ALWAYS_INLINE hash128depths(const std::vector<size_t> & indices, const ColumnRawPtrs & key_columns) |
136 | { |
137 | UInt128 key; |
138 | SipHash hash; |
139 | |
140 | for (size_t j = 0, keys_size = key_columns.size(); j < keys_size; ++j) |
141 | { |
142 | // Debug: const auto & field = (*key_columns[j])[indices[j]]; DUMP(j, indices[j], field); |
143 | key_columns[j]->updateHashWithValue(indices[j], hash); |
144 | } |
145 | |
146 | hash.get128(key.low, key.high); |
147 | |
148 | return key; |
149 | } |
150 | |
151 | |
152 | template <typename Derived> |
153 | void FunctionArrayEnumerateRankedExtended<Derived>::executeImpl( |
154 | Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) |
155 | { |
156 | size_t num_arguments = arguments.size(); |
157 | ColumnRawPtrs data_columns; |
158 | |
159 | Columns array_holders; |
160 | ColumnPtr offsets_column; |
161 | |
162 | ColumnsWithTypeAndName args; |
163 | |
164 | for (size_t i = 0; i < arguments.size(); ++i) |
165 | args.emplace_back(block.getByPosition(arguments[i])); |
166 | |
167 | const ArraysDepths arrays_depths = getArraysDepths(args); |
168 | |
169 | /// If the column is Array - return it. If the const Array - materialize it, keep ownership and return. |
170 | auto get_array_column = [&](const auto & column) -> const DB::ColumnArray * |
171 | { |
172 | const ColumnArray * array = checkAndGetColumn<ColumnArray>(column); |
173 | if (!array) |
174 | { |
175 | const ColumnConst * const_array = checkAndGetColumnConst<ColumnArray>(column); |
176 | if (!const_array) |
177 | return nullptr; |
178 | array_holders.emplace_back(const_array->convertToFullColumn()); |
179 | array = checkAndGetColumn<ColumnArray>(array_holders.back().get()); |
180 | } |
181 | return array; |
182 | }; |
183 | |
184 | std::vector<const ColumnArray::Offsets *> offsets_by_depth; |
185 | std::vector<ColumnPtr> offsetsptr_by_depth; |
186 | |
187 | size_t array_num = 0; |
188 | for (size_t i = 0; i < num_arguments; ++i) |
189 | { |
190 | const auto * array = get_array_column(block.getByPosition(arguments[i]).column.get()); |
191 | if (!array) |
192 | continue; |
193 | |
194 | if (array_num == 0) // TODO check with prev |
195 | { |
196 | offsets_by_depth.emplace_back(&array->getOffsets()); |
197 | offsetsptr_by_depth.emplace_back(array->getOffsetsPtr()); |
198 | } |
199 | else |
200 | { |
201 | if (*offsets_by_depth[0] != array->getOffsets()) |
202 | { |
203 | throw Exception( |
204 | "Lengths and effective depths of all arrays passed to " + getName() + " must be equal." , |
205 | ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH); |
206 | } |
207 | } |
208 | |
209 | DepthType col_depth = 1; |
210 | for (; col_depth < arrays_depths.depths[array_num]; ++col_depth) |
211 | { |
212 | auto sub_array = get_array_column(&array->getData()); |
213 | if (sub_array) |
214 | array = sub_array; |
215 | if (!sub_array) |
216 | break; |
217 | |
218 | if (offsets_by_depth.size() <= col_depth) |
219 | { |
220 | offsets_by_depth.emplace_back(&array->getOffsets()); |
221 | offsetsptr_by_depth.emplace_back(array->getOffsetsPtr()); |
222 | } |
223 | else |
224 | { |
225 | if (*offsets_by_depth[col_depth] != array->getOffsets()) |
226 | { |
227 | throw Exception( |
228 | "Lengths and effective depths of all arrays passed to " + getName() + " must be equal." , |
229 | ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH); |
230 | } |
231 | } |
232 | } |
233 | |
234 | if (col_depth < arrays_depths.depths[array_num]) |
235 | { |
236 | throw Exception( |
237 | getName() + ": Passed array number " + std::to_string(array_num) + " depth (" |
238 | + std::to_string(arrays_depths.depths[array_num]) + ") is more than the actual array depth (" |
239 | + std::to_string(col_depth) + ")." , |
240 | ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH); |
241 | } |
242 | |
243 | auto * array_data = &array->getData(); |
244 | data_columns.emplace_back(array_data); |
245 | ++array_num; |
246 | } |
247 | |
248 | if (offsets_by_depth.empty()) |
249 | throw Exception("No arrays passed to function " + getName(), ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); |
250 | |
251 | auto res_nested = ColumnUInt32::create(); |
252 | |
253 | ColumnUInt32::Container & res_values = res_nested->getData(); |
254 | res_values.resize(offsets_by_depth[arrays_depths.max_array_depth - 1]->back()); |
255 | |
256 | executeMethodImpl(offsets_by_depth, data_columns, arrays_depths, res_values); |
257 | |
258 | ColumnPtr result_nested_array = std::move(res_nested); |
259 | for (ssize_t depth = arrays_depths.max_array_depth - 1; depth >= 0; --depth) |
260 | result_nested_array = ColumnArray::create(std::move(result_nested_array), offsetsptr_by_depth[depth]); |
261 | |
262 | block.getByPosition(result).column = result_nested_array; |
263 | } |
264 | |
265 | /* |
266 | |
267 | (2, [[1,2,3],[2,2,1],[3]], 2, [4,5,6], 1) |
268 | ; 1 2 3; 2 2 1; 3 4 5 6 |
269 | ; 4 4 4; 5 5 5; 6 <- |
270 | |
271 | (1, [[1,2,3],[2,2,1],[3]], 1, [4,5,6], 1) |
272 | ;[1,2,3] [2,2,1] [3] 4 5 6 |
273 | ;4 5 6 <- |
274 | |
275 | (1, [[1,2,3],[2,2,1],[3]], 1, [4,5,6], 0) |
276 | ;[1,2,3] [2,2,1] [3] 4 5 6 |
277 | ;[4,5,6] [4,5,6] [4,5,6] <- |
278 | |
279 | . - get data |
280 | ; - clean index |
281 | |
282 | (1, [[[1,2,3],[1,2,3],[1,2,3]],[[1,2,3],[1,2,3],[1,2,3]],[[1,2]]], 1) |
283 | ;. . . |
284 | |
285 | (1, [[[1,2,3],[1,2,3],[1,2,3]],[[1,2,3],[1,2,3],[1,2,3]],[[1,2]]], 2) |
286 | ; . . . . . . . |
287 | |
288 | (2, [[[1,2,3],[1,2,3],[1,2,3]],[[1,2,3],[1,2,3],[1,2,3]],[[1,2]]], 2) |
289 | ; . . . ; . . . ; . |
290 | |
291 | (1, [[[1,2,3],[1,2,3],[1,2,3]],[[1,2,3],[1,2,3],[1,2,3]],[[1,2]]], 3) |
292 | ; . . . . . . . . . . . . . . . . . . . . |
293 | |
294 | (2, [[[1,2,3],[1,2,3],[1,2,3]],[[1,2,3],[1,2,3],[1,2,3]],[[1,2]]], 3) |
295 | ; . . . . . . . . . ; . . . . . . . . . ; . . |
296 | |
297 | (3, [[[1,2,3],[1,2,3],[1,2,3]],[[1,2,3],[1,2,3],[1,2,3]],[[1,2]]], 3) |
298 | ; . . . ; . . . ; . . . ; . . . ; . . . ; . . . ; . . |
299 | |
300 | */ |
301 | |
302 | template <typename Derived> |
303 | void FunctionArrayEnumerateRankedExtended<Derived>::executeMethodImpl( |
304 | const std::vector<const ColumnArray::Offsets *> & offsets_by_depth, |
305 | const ColumnRawPtrs & columns, |
306 | const ArraysDepths & arrays_depths, |
307 | ColumnUInt32::Container & res_values) |
308 | { |
309 | /// Offsets at the depth we want to look. |
310 | const size_t depth_to_look = arrays_depths.max_array_depth; |
311 | const auto & offsets = *offsets_by_depth[depth_to_look - 1]; |
312 | |
313 | using Map = ClearableHashMap< |
314 | UInt128, |
315 | UInt32, |
316 | UInt128TrivialHash, |
317 | HashTableGrower<INITIAL_SIZE_DEGREE>, |
318 | HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(UInt128)>>; |
319 | Map indices; |
320 | |
321 | std::vector<size_t> indices_by_depth(depth_to_look); |
322 | std::vector<size_t> current_offset_n_by_depth(depth_to_look); |
323 | std::vector<size_t> last_offset_by_depth(depth_to_look, 0); // For skipping empty arrays |
324 | |
325 | /// For arrayEnumerateDense variant: to calculate every distinct value. |
326 | UInt32 rank = 0; |
327 | |
328 | std::vector<size_t> columns_indices(columns.size()); |
329 | |
330 | /// For each array at the depth we want to look. |
331 | ColumnArray::Offset prev_off = 0; |
332 | for (size_t off : offsets) |
333 | { |
334 | bool want_clear = false; |
335 | |
336 | /// Skipping offsets if no data in this array |
337 | if (prev_off == off) |
338 | { |
339 | if (depth_to_look >= 2) |
340 | { |
341 | /// Advance to the next element of the parent array. |
342 | for (ssize_t depth = depth_to_look - 2; depth >= 0; --depth) |
343 | { |
344 | /// Skipping offsets for empty arrays |
345 | while (last_offset_by_depth[depth] == (*offsets_by_depth[depth])[current_offset_n_by_depth[depth]]) |
346 | { |
347 | ++current_offset_n_by_depth[depth]; |
348 | } |
349 | |
350 | ++indices_by_depth[depth]; |
351 | |
352 | if (indices_by_depth[depth] == (*offsets_by_depth[depth])[current_offset_n_by_depth[depth]]) |
353 | { |
354 | last_offset_by_depth[depth] = (*offsets_by_depth[depth])[current_offset_n_by_depth[depth]]; |
355 | ++current_offset_n_by_depth[depth]; |
356 | want_clear = true; |
357 | } |
358 | else |
359 | { |
360 | break; |
361 | } |
362 | } |
363 | } |
364 | } |
365 | |
366 | /// For each element at the depth we want to look. |
367 | for (size_t j = prev_off; j < off; ++j) |
368 | { |
369 | for (size_t col_n = 0; col_n < columns.size(); ++col_n) |
370 | columns_indices[col_n] = indices_by_depth[arrays_depths.depths[col_n] - 1]; |
371 | |
372 | auto hash = hash128depths(columns_indices, columns); |
373 | |
374 | if constexpr (std::is_same_v<Derived, FunctionArrayEnumerateUniqRanked>) |
375 | { |
376 | auto idx = ++indices[hash]; |
377 | res_values[j] = idx; |
378 | } |
379 | else // FunctionArrayEnumerateDenseRanked |
380 | { |
381 | auto idx = indices[hash]; |
382 | if (!idx) |
383 | { |
384 | idx = ++rank; |
385 | indices[hash] = idx; |
386 | } |
387 | res_values[j] = idx; |
388 | } |
389 | |
390 | // Debug: DUMP(off, prev_off, j, columns_indices, res_values[j], columns); |
391 | |
392 | for (ssize_t depth = depth_to_look - 1; depth >= 0; --depth) |
393 | { |
394 | /// Skipping offsets for empty arrays |
395 | while (last_offset_by_depth[depth] == (*offsets_by_depth[depth])[current_offset_n_by_depth[depth]]) |
396 | { |
397 | ++current_offset_n_by_depth[depth]; |
398 | } |
399 | |
400 | ++indices_by_depth[depth]; |
401 | |
402 | if (indices_by_depth[depth] == (*offsets_by_depth[depth])[current_offset_n_by_depth[depth]]) |
403 | { |
404 | if (static_cast<int>(arrays_depths.clear_depth) == depth + 1) |
405 | want_clear = true; |
406 | last_offset_by_depth[depth] = (*offsets_by_depth[depth])[current_offset_n_by_depth[depth]]; |
407 | ++current_offset_n_by_depth[depth]; |
408 | } |
409 | else |
410 | { |
411 | break; |
412 | } |
413 | } |
414 | } |
415 | |
416 | if (want_clear) |
417 | { |
418 | want_clear = false; |
419 | indices.clear(); |
420 | rank = 0; |
421 | } |
422 | |
423 | prev_off = off; |
424 | } |
425 | } |
426 | |
427 | } |
428 | |