1#include <Columns/ColumnArray.h>
2#include <Columns/ColumnNullable.h>
3#include <Columns/ColumnString.h>
4#include <Columns/ColumnsNumber.h>
5#include <DataTypes/DataTypeArray.h>
6#include <DataTypes/DataTypesNumber.h>
7#include <DataTypes/getLeastSupertype.h>
8#include <Functions/FunctionHelpers.h>
9#include <Functions/IFunctionImpl.h>
10#include <Interpreters/AggregationCommon.h>
11#include <Common/ColumnsHashing.h>
12#include <Common/HashTable/ClearableHashMap.h>
13
14// for better debug: #include <Core/iostream_debug_helpers.h>
15
16/** The function will enumerate distinct values of the passed multidimensional arrays looking inside at the specified depths.
17 * This is very unusual function made as a special order for Yandex.Metrica.
18 *
19 * arrayEnumerateUniqRanked(['hello', 'world', 'hello']) = [1, 1, 2]
20 * - it returns similar structured array containing number of occurence of the corresponding value.
21 *
22 * arrayEnumerateUniqRanked([['hello', 'world'], ['hello'], ['hello']], 1) = [1, 1, 2]
23 * - look at the depth 1 by default. Elements are ['hello', 'world'], ['hello'], ['hello'].
24 *
25 * arrayEnumerateUniqRanked([['hello', 'world'], ['hello'], ['hello']]) = [[1,1],[2],[3]]
26 * - look at the depth 2. Return similar structured array.
27 * arrayEnumerateUniqRanked([['hello', 'world'], ['hello'], ['hello']], 2) = [[1,1],[2],[3]]
28 * - look at the maximum depth by default.
29 *
30 * We may pass multiple array arguments. Their elements will be processed as zipped to tuple.
31 *
32 * arrayEnumerateUniqRanked(['hello', 'hello', 'world', 'world'], ['a', 'b', 'b', 'b']) = [1, 1, 1, 2]
33 *
34 * We may provide arrays of different depths to look at different arguments.
35 *
36 * arrayEnumerateUniqRanked([['hello', 'world'], ['hello'], ['world'], ['world']], ['a', 'b', 'b', 'b']) = [[1,1],[1],[1],[2]]
37 * arrayEnumerateUniqRanked([['hello', 'world'], ['hello'], ['world'], ['world']], 1, ['a', 'b', 'b', 'b'], 1) = [1, 1, 1, 2]
38 *
39 * When depths are different, we process less deep arrays as promoted to deeper arrays of similar structure by duplicating elements.
40 *
41 * arrayEnumerateUniqRanked(
42 * [['hello', 'world'], ['hello'], ['world'], ['world']],
43 * ['a', 'b', 'b', 'b'])
44 * = arrayEnumerateUniqRanked(
45 * [['hello', 'world'], ['hello'], ['world'], ['world']],
46 * [['a', 'a'], ['b'], ['b'], ['b']])
47 *
48 * Finally, we can provide extra first argument named "clear_depth" (it can be considered as 1 by default).
49 * Array elements at the clear_depth will be enumerated as separate elements (enumeration counter is reset for each new element).
50 *
51 * SELECT arrayEnumerateUniqRanked(1, [['hello', 'world'], ['hello'], ['world'], ['world']]) = [[1,1],[2],[2],[3]]
52 * SELECT arrayEnumerateUniqRanked(2, [['hello', 'world'], ['hello'], ['world'], ['world']]) = [[1,1],[1],[1],[1]]
53 * SELECT arrayEnumerateUniqRanked(1, [['hello', 'world', 'hello'], ['hello'], ['world'], ['world']]) = [[1,1,2],[3],[2],[3]]
54 * SELECT arrayEnumerateUniqRanked(2, [['hello', 'world', 'hello'], ['hello'], ['world'], ['world']]) = [[1,1,2],[1],[1],[1]]
55 */
56
57namespace DB
58{
59namespace ErrorCodes
60{
61 extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
62 extern const int ILLEGAL_COLUMN;
63 extern const int ILLEGAL_TYPE_OF_ARGUMENT;
64 extern const int SIZES_OF_ARRAYS_DOESNT_MATCH;
65}
66
67class FunctionArrayEnumerateUniqRanked;
68class FunctionArrayEnumerateDenseRanked;
69
70using DepthType = uint32_t;
71using DepthTypes = std::vector<DepthType>;
72
73struct ArraysDepths
74{
75 /// Enumerate elements at the specified level separately.
76 DepthType clear_depth;
77
78 /// Effective depth is the array depth by default or lower value, specified as a constant argument following the array.
79 /// f([[1, 2], [3]]) - effective depth is 2.
80 /// f([[1, 2], [3]], 1) - effective depth is 1.
81 DepthTypes depths;
82
83 /// Maximum effective depth.
84 DepthType max_array_depth;
85};
86
87/// Return depth info about passed arrays
88ArraysDepths getArraysDepths(const ColumnsWithTypeAndName & arguments);
89
90template <typename Derived>
91class FunctionArrayEnumerateRankedExtended : public IFunction
92{
93public:
94 static FunctionPtr create(const Context & /* context */) { return std::make_shared<Derived>(); }
95
96 String getName() const override { return Derived::name; }
97
98 bool isVariadic() const override { return true; }
99 size_t getNumberOfArguments() const override { return 0; }
100
101 DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
102 {
103 if (arguments.size() == 0)
104 throw Exception(
105 "Number of arguments for function " + getName() + " doesn't match: passed " + std::to_string(arguments.size())
106 + ", should be at least 1.",
107 ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
108
109 const ArraysDepths arrays_depths = getArraysDepths(arguments);
110
111 /// Return type is the array of the depth as the maximum effective depth of arguments, containing UInt32.
112
113 DataTypePtr type = std::make_shared<DataTypeUInt32>();
114 for (DepthType i = 0; i < arrays_depths.max_array_depth; ++i)
115 type = std::make_shared<DataTypeArray>(type);
116
117 return type;
118 }
119
120 void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override;
121
122private:
123 /// Initially allocate a piece of memory for 512 elements. NOTE: This is just a guess.
124 static constexpr size_t INITIAL_SIZE_DEGREE = 9;
125
126 void executeMethodImpl(
127 const std::vector<const ColumnArray::Offsets *> & offsets_by_depth,
128 const ColumnRawPtrs & columns,
129 const ArraysDepths & arrays_depths,
130 ColumnUInt32::Container & res_values);
131};
132
133
134/// Hash a set of keys into a UInt128 value.
135static inline UInt128 ALWAYS_INLINE hash128depths(const std::vector<size_t> & indices, const ColumnRawPtrs & key_columns)
136{
137 UInt128 key;
138 SipHash hash;
139
140 for (size_t j = 0, keys_size = key_columns.size(); j < keys_size; ++j)
141 {
142 // Debug: const auto & field = (*key_columns[j])[indices[j]]; DUMP(j, indices[j], field);
143 key_columns[j]->updateHashWithValue(indices[j], hash);
144 }
145
146 hash.get128(key.low, key.high);
147
148 return key;
149}
150
151
152template <typename Derived>
153void FunctionArrayEnumerateRankedExtended<Derived>::executeImpl(
154 Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/)
155{
156 size_t num_arguments = arguments.size();
157 ColumnRawPtrs data_columns;
158
159 Columns array_holders;
160 ColumnPtr offsets_column;
161
162 ColumnsWithTypeAndName args;
163
164 for (size_t i = 0; i < arguments.size(); ++i)
165 args.emplace_back(block.getByPosition(arguments[i]));
166
167 const ArraysDepths arrays_depths = getArraysDepths(args);
168
169 /// If the column is Array - return it. If the const Array - materialize it, keep ownership and return.
170 auto get_array_column = [&](const auto & column) -> const DB::ColumnArray *
171 {
172 const ColumnArray * array = checkAndGetColumn<ColumnArray>(column);
173 if (!array)
174 {
175 const ColumnConst * const_array = checkAndGetColumnConst<ColumnArray>(column);
176 if (!const_array)
177 return nullptr;
178 array_holders.emplace_back(const_array->convertToFullColumn());
179 array = checkAndGetColumn<ColumnArray>(array_holders.back().get());
180 }
181 return array;
182 };
183
184 std::vector<const ColumnArray::Offsets *> offsets_by_depth;
185 std::vector<ColumnPtr> offsetsptr_by_depth;
186
187 size_t array_num = 0;
188 for (size_t i = 0; i < num_arguments; ++i)
189 {
190 const auto * array = get_array_column(block.getByPosition(arguments[i]).column.get());
191 if (!array)
192 continue;
193
194 if (array_num == 0) // TODO check with prev
195 {
196 offsets_by_depth.emplace_back(&array->getOffsets());
197 offsetsptr_by_depth.emplace_back(array->getOffsetsPtr());
198 }
199 else
200 {
201 if (*offsets_by_depth[0] != array->getOffsets())
202 {
203 throw Exception(
204 "Lengths and effective depths of all arrays passed to " + getName() + " must be equal.",
205 ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH);
206 }
207 }
208
209 DepthType col_depth = 1;
210 for (; col_depth < arrays_depths.depths[array_num]; ++col_depth)
211 {
212 auto sub_array = get_array_column(&array->getData());
213 if (sub_array)
214 array = sub_array;
215 if (!sub_array)
216 break;
217
218 if (offsets_by_depth.size() <= col_depth)
219 {
220 offsets_by_depth.emplace_back(&array->getOffsets());
221 offsetsptr_by_depth.emplace_back(array->getOffsetsPtr());
222 }
223 else
224 {
225 if (*offsets_by_depth[col_depth] != array->getOffsets())
226 {
227 throw Exception(
228 "Lengths and effective depths of all arrays passed to " + getName() + " must be equal.",
229 ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH);
230 }
231 }
232 }
233
234 if (col_depth < arrays_depths.depths[array_num])
235 {
236 throw Exception(
237 getName() + ": Passed array number " + std::to_string(array_num) + " depth ("
238 + std::to_string(arrays_depths.depths[array_num]) + ") is more than the actual array depth ("
239 + std::to_string(col_depth) + ").",
240 ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH);
241 }
242
243 auto * array_data = &array->getData();
244 data_columns.emplace_back(array_data);
245 ++array_num;
246 }
247
248 if (offsets_by_depth.empty())
249 throw Exception("No arrays passed to function " + getName(), ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
250
251 auto res_nested = ColumnUInt32::create();
252
253 ColumnUInt32::Container & res_values = res_nested->getData();
254 res_values.resize(offsets_by_depth[arrays_depths.max_array_depth - 1]->back());
255
256 executeMethodImpl(offsets_by_depth, data_columns, arrays_depths, res_values);
257
258 ColumnPtr result_nested_array = std::move(res_nested);
259 for (ssize_t depth = arrays_depths.max_array_depth - 1; depth >= 0; --depth)
260 result_nested_array = ColumnArray::create(std::move(result_nested_array), offsetsptr_by_depth[depth]);
261
262 block.getByPosition(result).column = result_nested_array;
263}
264
265/*
266
267(2, [[1,2,3],[2,2,1],[3]], 2, [4,5,6], 1)
268 ; 1 2 3; 2 2 1; 3 4 5 6
269 ; 4 4 4; 5 5 5; 6 <-
270
271(1, [[1,2,3],[2,2,1],[3]], 1, [4,5,6], 1)
272 ;[1,2,3] [2,2,1] [3] 4 5 6
273 ;4 5 6 <-
274
275(1, [[1,2,3],[2,2,1],[3]], 1, [4,5,6], 0)
276 ;[1,2,3] [2,2,1] [3] 4 5 6
277 ;[4,5,6] [4,5,6] [4,5,6] <-
278
279. - get data
280; - clean index
281
282(1, [[[1,2,3],[1,2,3],[1,2,3]],[[1,2,3],[1,2,3],[1,2,3]],[[1,2]]], 1)
283 ;. . .
284
285(1, [[[1,2,3],[1,2,3],[1,2,3]],[[1,2,3],[1,2,3],[1,2,3]],[[1,2]]], 2)
286 ; . . . . . . .
287
288(2, [[[1,2,3],[1,2,3],[1,2,3]],[[1,2,3],[1,2,3],[1,2,3]],[[1,2]]], 2)
289 ; . . . ; . . . ; .
290
291(1, [[[1,2,3],[1,2,3],[1,2,3]],[[1,2,3],[1,2,3],[1,2,3]],[[1,2]]], 3)
292 ; . . . . . . . . . . . . . . . . . . . .
293
294(2, [[[1,2,3],[1,2,3],[1,2,3]],[[1,2,3],[1,2,3],[1,2,3]],[[1,2]]], 3)
295 ; . . . . . . . . . ; . . . . . . . . . ; . .
296
297(3, [[[1,2,3],[1,2,3],[1,2,3]],[[1,2,3],[1,2,3],[1,2,3]],[[1,2]]], 3)
298 ; . . . ; . . . ; . . . ; . . . ; . . . ; . . . ; . .
299
300*/
301
302template <typename Derived>
303void FunctionArrayEnumerateRankedExtended<Derived>::executeMethodImpl(
304 const std::vector<const ColumnArray::Offsets *> & offsets_by_depth,
305 const ColumnRawPtrs & columns,
306 const ArraysDepths & arrays_depths,
307 ColumnUInt32::Container & res_values)
308{
309 /// Offsets at the depth we want to look.
310 const size_t depth_to_look = arrays_depths.max_array_depth;
311 const auto & offsets = *offsets_by_depth[depth_to_look - 1];
312
313 using Map = ClearableHashMap<
314 UInt128,
315 UInt32,
316 UInt128TrivialHash,
317 HashTableGrower<INITIAL_SIZE_DEGREE>,
318 HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(UInt128)>>;
319 Map indices;
320
321 std::vector<size_t> indices_by_depth(depth_to_look);
322 std::vector<size_t> current_offset_n_by_depth(depth_to_look);
323 std::vector<size_t> last_offset_by_depth(depth_to_look, 0); // For skipping empty arrays
324
325 /// For arrayEnumerateDense variant: to calculate every distinct value.
326 UInt32 rank = 0;
327
328 std::vector<size_t> columns_indices(columns.size());
329
330 /// For each array at the depth we want to look.
331 ColumnArray::Offset prev_off = 0;
332 for (size_t off : offsets)
333 {
334 bool want_clear = false;
335
336 /// Skipping offsets if no data in this array
337 if (prev_off == off)
338 {
339 if (depth_to_look >= 2)
340 {
341 /// Advance to the next element of the parent array.
342 for (ssize_t depth = depth_to_look - 2; depth >= 0; --depth)
343 {
344 /// Skipping offsets for empty arrays
345 while (last_offset_by_depth[depth] == (*offsets_by_depth[depth])[current_offset_n_by_depth[depth]])
346 {
347 ++current_offset_n_by_depth[depth];
348 }
349
350 ++indices_by_depth[depth];
351
352 if (indices_by_depth[depth] == (*offsets_by_depth[depth])[current_offset_n_by_depth[depth]])
353 {
354 last_offset_by_depth[depth] = (*offsets_by_depth[depth])[current_offset_n_by_depth[depth]];
355 ++current_offset_n_by_depth[depth];
356 want_clear = true;
357 }
358 else
359 {
360 break;
361 }
362 }
363 }
364 }
365
366 /// For each element at the depth we want to look.
367 for (size_t j = prev_off; j < off; ++j)
368 {
369 for (size_t col_n = 0; col_n < columns.size(); ++col_n)
370 columns_indices[col_n] = indices_by_depth[arrays_depths.depths[col_n] - 1];
371
372 auto hash = hash128depths(columns_indices, columns);
373
374 if constexpr (std::is_same_v<Derived, FunctionArrayEnumerateUniqRanked>)
375 {
376 auto idx = ++indices[hash];
377 res_values[j] = idx;
378 }
379 else // FunctionArrayEnumerateDenseRanked
380 {
381 auto idx = indices[hash];
382 if (!idx)
383 {
384 idx = ++rank;
385 indices[hash] = idx;
386 }
387 res_values[j] = idx;
388 }
389
390 // Debug: DUMP(off, prev_off, j, columns_indices, res_values[j], columns);
391
392 for (ssize_t depth = depth_to_look - 1; depth >= 0; --depth)
393 {
394 /// Skipping offsets for empty arrays
395 while (last_offset_by_depth[depth] == (*offsets_by_depth[depth])[current_offset_n_by_depth[depth]])
396 {
397 ++current_offset_n_by_depth[depth];
398 }
399
400 ++indices_by_depth[depth];
401
402 if (indices_by_depth[depth] == (*offsets_by_depth[depth])[current_offset_n_by_depth[depth]])
403 {
404 if (static_cast<int>(arrays_depths.clear_depth) == depth + 1)
405 want_clear = true;
406 last_offset_by_depth[depth] = (*offsets_by_depth[depth])[current_offset_n_by_depth[depth]];
407 ++current_offset_n_by_depth[depth];
408 }
409 else
410 {
411 break;
412 }
413 }
414 }
415
416 if (want_clear)
417 {
418 want_clear = false;
419 indices.clear();
420 rank = 0;
421 }
422
423 prev_off = off;
424 }
425}
426
427}
428