| 1 | #include <Columns/ColumnArray.h> |
| 2 | #include <Columns/ColumnNullable.h> |
| 3 | #include <Columns/ColumnString.h> |
| 4 | #include <Columns/ColumnsNumber.h> |
| 5 | #include <DataTypes/DataTypeArray.h> |
| 6 | #include <DataTypes/DataTypesNumber.h> |
| 7 | #include <DataTypes/getLeastSupertype.h> |
| 8 | #include <Functions/FunctionHelpers.h> |
| 9 | #include <Functions/IFunctionImpl.h> |
| 10 | #include <Interpreters/AggregationCommon.h> |
| 11 | #include <Common/ColumnsHashing.h> |
| 12 | #include <Common/HashTable/ClearableHashMap.h> |
| 13 | |
| 14 | // for better debug: #include <Core/iostream_debug_helpers.h> |
| 15 | |
| 16 | /** The function will enumerate distinct values of the passed multidimensional arrays looking inside at the specified depths. |
| 17 | * This is very unusual function made as a special order for Yandex.Metrica. |
| 18 | * |
| 19 | * arrayEnumerateUniqRanked(['hello', 'world', 'hello']) = [1, 1, 2] |
| 20 | * - it returns similar structured array containing number of occurence of the corresponding value. |
| 21 | * |
| 22 | * arrayEnumerateUniqRanked([['hello', 'world'], ['hello'], ['hello']], 1) = [1, 1, 2] |
| 23 | * - look at the depth 1 by default. Elements are ['hello', 'world'], ['hello'], ['hello']. |
| 24 | * |
| 25 | * arrayEnumerateUniqRanked([['hello', 'world'], ['hello'], ['hello']]) = [[1,1],[2],[3]] |
| 26 | * - look at the depth 2. Return similar structured array. |
| 27 | * arrayEnumerateUniqRanked([['hello', 'world'], ['hello'], ['hello']], 2) = [[1,1],[2],[3]] |
| 28 | * - look at the maximum depth by default. |
| 29 | * |
| 30 | * We may pass multiple array arguments. Their elements will be processed as zipped to tuple. |
| 31 | * |
| 32 | * arrayEnumerateUniqRanked(['hello', 'hello', 'world', 'world'], ['a', 'b', 'b', 'b']) = [1, 1, 1, 2] |
| 33 | * |
| 34 | * We may provide arrays of different depths to look at different arguments. |
| 35 | * |
| 36 | * arrayEnumerateUniqRanked([['hello', 'world'], ['hello'], ['world'], ['world']], ['a', 'b', 'b', 'b']) = [[1,1],[1],[1],[2]] |
| 37 | * arrayEnumerateUniqRanked([['hello', 'world'], ['hello'], ['world'], ['world']], 1, ['a', 'b', 'b', 'b'], 1) = [1, 1, 1, 2] |
| 38 | * |
| 39 | * When depths are different, we process less deep arrays as promoted to deeper arrays of similar structure by duplicating elements. |
| 40 | * |
| 41 | * arrayEnumerateUniqRanked( |
| 42 | * [['hello', 'world'], ['hello'], ['world'], ['world']], |
| 43 | * ['a', 'b', 'b', 'b']) |
| 44 | * = arrayEnumerateUniqRanked( |
| 45 | * [['hello', 'world'], ['hello'], ['world'], ['world']], |
| 46 | * [['a', 'a'], ['b'], ['b'], ['b']]) |
| 47 | * |
| 48 | * Finally, we can provide extra first argument named "clear_depth" (it can be considered as 1 by default). |
| 49 | * Array elements at the clear_depth will be enumerated as separate elements (enumeration counter is reset for each new element). |
| 50 | * |
| 51 | * SELECT arrayEnumerateUniqRanked(1, [['hello', 'world'], ['hello'], ['world'], ['world']]) = [[1,1],[2],[2],[3]] |
| 52 | * SELECT arrayEnumerateUniqRanked(2, [['hello', 'world'], ['hello'], ['world'], ['world']]) = [[1,1],[1],[1],[1]] |
| 53 | * SELECT arrayEnumerateUniqRanked(1, [['hello', 'world', 'hello'], ['hello'], ['world'], ['world']]) = [[1,1,2],[3],[2],[3]] |
| 54 | * SELECT arrayEnumerateUniqRanked(2, [['hello', 'world', 'hello'], ['hello'], ['world'], ['world']]) = [[1,1,2],[1],[1],[1]] |
| 55 | */ |
| 56 | |
| 57 | namespace DB |
| 58 | { |
| 59 | namespace ErrorCodes |
| 60 | { |
| 61 | extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; |
| 62 | extern const int ILLEGAL_COLUMN; |
| 63 | extern const int ILLEGAL_TYPE_OF_ARGUMENT; |
| 64 | extern const int SIZES_OF_ARRAYS_DOESNT_MATCH; |
| 65 | } |
| 66 | |
| 67 | class FunctionArrayEnumerateUniqRanked; |
| 68 | class FunctionArrayEnumerateDenseRanked; |
| 69 | |
| 70 | using DepthType = uint32_t; |
| 71 | using DepthTypes = std::vector<DepthType>; |
| 72 | |
| 73 | struct ArraysDepths |
| 74 | { |
| 75 | /// Enumerate elements at the specified level separately. |
| 76 | DepthType clear_depth; |
| 77 | |
| 78 | /// Effective depth is the array depth by default or lower value, specified as a constant argument following the array. |
| 79 | /// f([[1, 2], [3]]) - effective depth is 2. |
| 80 | /// f([[1, 2], [3]], 1) - effective depth is 1. |
| 81 | DepthTypes depths; |
| 82 | |
| 83 | /// Maximum effective depth. |
| 84 | DepthType max_array_depth; |
| 85 | }; |
| 86 | |
| 87 | /// Return depth info about passed arrays |
| 88 | ArraysDepths getArraysDepths(const ColumnsWithTypeAndName & arguments); |
| 89 | |
| 90 | template <typename Derived> |
| 91 | class FunctionArrayEnumerateRankedExtended : public IFunction |
| 92 | { |
| 93 | public: |
| 94 | static FunctionPtr create(const Context & /* context */) { return std::make_shared<Derived>(); } |
| 95 | |
| 96 | String getName() const override { return Derived::name; } |
| 97 | |
| 98 | bool isVariadic() const override { return true; } |
| 99 | size_t getNumberOfArguments() const override { return 0; } |
| 100 | |
| 101 | DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override |
| 102 | { |
| 103 | if (arguments.size() == 0) |
| 104 | throw Exception( |
| 105 | "Number of arguments for function " + getName() + " doesn't match: passed " + std::to_string(arguments.size()) |
| 106 | + ", should be at least 1." , |
| 107 | ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); |
| 108 | |
| 109 | const ArraysDepths arrays_depths = getArraysDepths(arguments); |
| 110 | |
| 111 | /// Return type is the array of the depth as the maximum effective depth of arguments, containing UInt32. |
| 112 | |
| 113 | DataTypePtr type = std::make_shared<DataTypeUInt32>(); |
| 114 | for (DepthType i = 0; i < arrays_depths.max_array_depth; ++i) |
| 115 | type = std::make_shared<DataTypeArray>(type); |
| 116 | |
| 117 | return type; |
| 118 | } |
| 119 | |
| 120 | void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override; |
| 121 | |
| 122 | private: |
| 123 | /// Initially allocate a piece of memory for 512 elements. NOTE: This is just a guess. |
| 124 | static constexpr size_t INITIAL_SIZE_DEGREE = 9; |
| 125 | |
| 126 | void executeMethodImpl( |
| 127 | const std::vector<const ColumnArray::Offsets *> & offsets_by_depth, |
| 128 | const ColumnRawPtrs & columns, |
| 129 | const ArraysDepths & arrays_depths, |
| 130 | ColumnUInt32::Container & res_values); |
| 131 | }; |
| 132 | |
| 133 | |
| 134 | /// Hash a set of keys into a UInt128 value. |
| 135 | static inline UInt128 ALWAYS_INLINE hash128depths(const std::vector<size_t> & indices, const ColumnRawPtrs & key_columns) |
| 136 | { |
| 137 | UInt128 key; |
| 138 | SipHash hash; |
| 139 | |
| 140 | for (size_t j = 0, keys_size = key_columns.size(); j < keys_size; ++j) |
| 141 | { |
| 142 | // Debug: const auto & field = (*key_columns[j])[indices[j]]; DUMP(j, indices[j], field); |
| 143 | key_columns[j]->updateHashWithValue(indices[j], hash); |
| 144 | } |
| 145 | |
| 146 | hash.get128(key.low, key.high); |
| 147 | |
| 148 | return key; |
| 149 | } |
| 150 | |
| 151 | |
| 152 | template <typename Derived> |
| 153 | void FunctionArrayEnumerateRankedExtended<Derived>::executeImpl( |
| 154 | Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) |
| 155 | { |
| 156 | size_t num_arguments = arguments.size(); |
| 157 | ColumnRawPtrs data_columns; |
| 158 | |
| 159 | Columns array_holders; |
| 160 | ColumnPtr offsets_column; |
| 161 | |
| 162 | ColumnsWithTypeAndName args; |
| 163 | |
| 164 | for (size_t i = 0; i < arguments.size(); ++i) |
| 165 | args.emplace_back(block.getByPosition(arguments[i])); |
| 166 | |
| 167 | const ArraysDepths arrays_depths = getArraysDepths(args); |
| 168 | |
| 169 | /// If the column is Array - return it. If the const Array - materialize it, keep ownership and return. |
| 170 | auto get_array_column = [&](const auto & column) -> const DB::ColumnArray * |
| 171 | { |
| 172 | const ColumnArray * array = checkAndGetColumn<ColumnArray>(column); |
| 173 | if (!array) |
| 174 | { |
| 175 | const ColumnConst * const_array = checkAndGetColumnConst<ColumnArray>(column); |
| 176 | if (!const_array) |
| 177 | return nullptr; |
| 178 | array_holders.emplace_back(const_array->convertToFullColumn()); |
| 179 | array = checkAndGetColumn<ColumnArray>(array_holders.back().get()); |
| 180 | } |
| 181 | return array; |
| 182 | }; |
| 183 | |
| 184 | std::vector<const ColumnArray::Offsets *> offsets_by_depth; |
| 185 | std::vector<ColumnPtr> offsetsptr_by_depth; |
| 186 | |
| 187 | size_t array_num = 0; |
| 188 | for (size_t i = 0; i < num_arguments; ++i) |
| 189 | { |
| 190 | const auto * array = get_array_column(block.getByPosition(arguments[i]).column.get()); |
| 191 | if (!array) |
| 192 | continue; |
| 193 | |
| 194 | if (array_num == 0) // TODO check with prev |
| 195 | { |
| 196 | offsets_by_depth.emplace_back(&array->getOffsets()); |
| 197 | offsetsptr_by_depth.emplace_back(array->getOffsetsPtr()); |
| 198 | } |
| 199 | else |
| 200 | { |
| 201 | if (*offsets_by_depth[0] != array->getOffsets()) |
| 202 | { |
| 203 | throw Exception( |
| 204 | "Lengths and effective depths of all arrays passed to " + getName() + " must be equal." , |
| 205 | ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH); |
| 206 | } |
| 207 | } |
| 208 | |
| 209 | DepthType col_depth = 1; |
| 210 | for (; col_depth < arrays_depths.depths[array_num]; ++col_depth) |
| 211 | { |
| 212 | auto sub_array = get_array_column(&array->getData()); |
| 213 | if (sub_array) |
| 214 | array = sub_array; |
| 215 | if (!sub_array) |
| 216 | break; |
| 217 | |
| 218 | if (offsets_by_depth.size() <= col_depth) |
| 219 | { |
| 220 | offsets_by_depth.emplace_back(&array->getOffsets()); |
| 221 | offsetsptr_by_depth.emplace_back(array->getOffsetsPtr()); |
| 222 | } |
| 223 | else |
| 224 | { |
| 225 | if (*offsets_by_depth[col_depth] != array->getOffsets()) |
| 226 | { |
| 227 | throw Exception( |
| 228 | "Lengths and effective depths of all arrays passed to " + getName() + " must be equal." , |
| 229 | ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH); |
| 230 | } |
| 231 | } |
| 232 | } |
| 233 | |
| 234 | if (col_depth < arrays_depths.depths[array_num]) |
| 235 | { |
| 236 | throw Exception( |
| 237 | getName() + ": Passed array number " + std::to_string(array_num) + " depth (" |
| 238 | + std::to_string(arrays_depths.depths[array_num]) + ") is more than the actual array depth (" |
| 239 | + std::to_string(col_depth) + ")." , |
| 240 | ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH); |
| 241 | } |
| 242 | |
| 243 | auto * array_data = &array->getData(); |
| 244 | data_columns.emplace_back(array_data); |
| 245 | ++array_num; |
| 246 | } |
| 247 | |
| 248 | if (offsets_by_depth.empty()) |
| 249 | throw Exception("No arrays passed to function " + getName(), ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); |
| 250 | |
| 251 | auto res_nested = ColumnUInt32::create(); |
| 252 | |
| 253 | ColumnUInt32::Container & res_values = res_nested->getData(); |
| 254 | res_values.resize(offsets_by_depth[arrays_depths.max_array_depth - 1]->back()); |
| 255 | |
| 256 | executeMethodImpl(offsets_by_depth, data_columns, arrays_depths, res_values); |
| 257 | |
| 258 | ColumnPtr result_nested_array = std::move(res_nested); |
| 259 | for (ssize_t depth = arrays_depths.max_array_depth - 1; depth >= 0; --depth) |
| 260 | result_nested_array = ColumnArray::create(std::move(result_nested_array), offsetsptr_by_depth[depth]); |
| 261 | |
| 262 | block.getByPosition(result).column = result_nested_array; |
| 263 | } |
| 264 | |
| 265 | /* |
| 266 | |
| 267 | (2, [[1,2,3],[2,2,1],[3]], 2, [4,5,6], 1) |
| 268 | ; 1 2 3; 2 2 1; 3 4 5 6 |
| 269 | ; 4 4 4; 5 5 5; 6 <- |
| 270 | |
| 271 | (1, [[1,2,3],[2,2,1],[3]], 1, [4,5,6], 1) |
| 272 | ;[1,2,3] [2,2,1] [3] 4 5 6 |
| 273 | ;4 5 6 <- |
| 274 | |
| 275 | (1, [[1,2,3],[2,2,1],[3]], 1, [4,5,6], 0) |
| 276 | ;[1,2,3] [2,2,1] [3] 4 5 6 |
| 277 | ;[4,5,6] [4,5,6] [4,5,6] <- |
| 278 | |
| 279 | . - get data |
| 280 | ; - clean index |
| 281 | |
| 282 | (1, [[[1,2,3],[1,2,3],[1,2,3]],[[1,2,3],[1,2,3],[1,2,3]],[[1,2]]], 1) |
| 283 | ;. . . |
| 284 | |
| 285 | (1, [[[1,2,3],[1,2,3],[1,2,3]],[[1,2,3],[1,2,3],[1,2,3]],[[1,2]]], 2) |
| 286 | ; . . . . . . . |
| 287 | |
| 288 | (2, [[[1,2,3],[1,2,3],[1,2,3]],[[1,2,3],[1,2,3],[1,2,3]],[[1,2]]], 2) |
| 289 | ; . . . ; . . . ; . |
| 290 | |
| 291 | (1, [[[1,2,3],[1,2,3],[1,2,3]],[[1,2,3],[1,2,3],[1,2,3]],[[1,2]]], 3) |
| 292 | ; . . . . . . . . . . . . . . . . . . . . |
| 293 | |
| 294 | (2, [[[1,2,3],[1,2,3],[1,2,3]],[[1,2,3],[1,2,3],[1,2,3]],[[1,2]]], 3) |
| 295 | ; . . . . . . . . . ; . . . . . . . . . ; . . |
| 296 | |
| 297 | (3, [[[1,2,3],[1,2,3],[1,2,3]],[[1,2,3],[1,2,3],[1,2,3]],[[1,2]]], 3) |
| 298 | ; . . . ; . . . ; . . . ; . . . ; . . . ; . . . ; . . |
| 299 | |
| 300 | */ |
| 301 | |
| 302 | template <typename Derived> |
| 303 | void FunctionArrayEnumerateRankedExtended<Derived>::executeMethodImpl( |
| 304 | const std::vector<const ColumnArray::Offsets *> & offsets_by_depth, |
| 305 | const ColumnRawPtrs & columns, |
| 306 | const ArraysDepths & arrays_depths, |
| 307 | ColumnUInt32::Container & res_values) |
| 308 | { |
| 309 | /// Offsets at the depth we want to look. |
| 310 | const size_t depth_to_look = arrays_depths.max_array_depth; |
| 311 | const auto & offsets = *offsets_by_depth[depth_to_look - 1]; |
| 312 | |
| 313 | using Map = ClearableHashMap< |
| 314 | UInt128, |
| 315 | UInt32, |
| 316 | UInt128TrivialHash, |
| 317 | HashTableGrower<INITIAL_SIZE_DEGREE>, |
| 318 | HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(UInt128)>>; |
| 319 | Map indices; |
| 320 | |
| 321 | std::vector<size_t> indices_by_depth(depth_to_look); |
| 322 | std::vector<size_t> current_offset_n_by_depth(depth_to_look); |
| 323 | std::vector<size_t> last_offset_by_depth(depth_to_look, 0); // For skipping empty arrays |
| 324 | |
| 325 | /// For arrayEnumerateDense variant: to calculate every distinct value. |
| 326 | UInt32 rank = 0; |
| 327 | |
| 328 | std::vector<size_t> columns_indices(columns.size()); |
| 329 | |
| 330 | /// For each array at the depth we want to look. |
| 331 | ColumnArray::Offset prev_off = 0; |
| 332 | for (size_t off : offsets) |
| 333 | { |
| 334 | bool want_clear = false; |
| 335 | |
| 336 | /// Skipping offsets if no data in this array |
| 337 | if (prev_off == off) |
| 338 | { |
| 339 | if (depth_to_look >= 2) |
| 340 | { |
| 341 | /// Advance to the next element of the parent array. |
| 342 | for (ssize_t depth = depth_to_look - 2; depth >= 0; --depth) |
| 343 | { |
| 344 | /// Skipping offsets for empty arrays |
| 345 | while (last_offset_by_depth[depth] == (*offsets_by_depth[depth])[current_offset_n_by_depth[depth]]) |
| 346 | { |
| 347 | ++current_offset_n_by_depth[depth]; |
| 348 | } |
| 349 | |
| 350 | ++indices_by_depth[depth]; |
| 351 | |
| 352 | if (indices_by_depth[depth] == (*offsets_by_depth[depth])[current_offset_n_by_depth[depth]]) |
| 353 | { |
| 354 | last_offset_by_depth[depth] = (*offsets_by_depth[depth])[current_offset_n_by_depth[depth]]; |
| 355 | ++current_offset_n_by_depth[depth]; |
| 356 | want_clear = true; |
| 357 | } |
| 358 | else |
| 359 | { |
| 360 | break; |
| 361 | } |
| 362 | } |
| 363 | } |
| 364 | } |
| 365 | |
| 366 | /// For each element at the depth we want to look. |
| 367 | for (size_t j = prev_off; j < off; ++j) |
| 368 | { |
| 369 | for (size_t col_n = 0; col_n < columns.size(); ++col_n) |
| 370 | columns_indices[col_n] = indices_by_depth[arrays_depths.depths[col_n] - 1]; |
| 371 | |
| 372 | auto hash = hash128depths(columns_indices, columns); |
| 373 | |
| 374 | if constexpr (std::is_same_v<Derived, FunctionArrayEnumerateUniqRanked>) |
| 375 | { |
| 376 | auto idx = ++indices[hash]; |
| 377 | res_values[j] = idx; |
| 378 | } |
| 379 | else // FunctionArrayEnumerateDenseRanked |
| 380 | { |
| 381 | auto idx = indices[hash]; |
| 382 | if (!idx) |
| 383 | { |
| 384 | idx = ++rank; |
| 385 | indices[hash] = idx; |
| 386 | } |
| 387 | res_values[j] = idx; |
| 388 | } |
| 389 | |
| 390 | // Debug: DUMP(off, prev_off, j, columns_indices, res_values[j], columns); |
| 391 | |
| 392 | for (ssize_t depth = depth_to_look - 1; depth >= 0; --depth) |
| 393 | { |
| 394 | /// Skipping offsets for empty arrays |
| 395 | while (last_offset_by_depth[depth] == (*offsets_by_depth[depth])[current_offset_n_by_depth[depth]]) |
| 396 | { |
| 397 | ++current_offset_n_by_depth[depth]; |
| 398 | } |
| 399 | |
| 400 | ++indices_by_depth[depth]; |
| 401 | |
| 402 | if (indices_by_depth[depth] == (*offsets_by_depth[depth])[current_offset_n_by_depth[depth]]) |
| 403 | { |
| 404 | if (static_cast<int>(arrays_depths.clear_depth) == depth + 1) |
| 405 | want_clear = true; |
| 406 | last_offset_by_depth[depth] = (*offsets_by_depth[depth])[current_offset_n_by_depth[depth]]; |
| 407 | ++current_offset_n_by_depth[depth]; |
| 408 | } |
| 409 | else |
| 410 | { |
| 411 | break; |
| 412 | } |
| 413 | } |
| 414 | } |
| 415 | |
| 416 | if (want_clear) |
| 417 | { |
| 418 | want_clear = false; |
| 419 | indices.clear(); |
| 420 | rank = 0; |
| 421 | } |
| 422 | |
| 423 | prev_off = off; |
| 424 | } |
| 425 | } |
| 426 | |
| 427 | } |
| 428 | |