| 1 | #include <string.h> // memcpy |
| 2 | |
| 3 | #include <Columns/ColumnArray.h> |
| 4 | #include <Columns/ColumnsNumber.h> |
| 5 | #include <Columns/ColumnString.h> |
| 6 | #include <Columns/ColumnTuple.h> |
| 7 | #include <Columns/ColumnNullable.h> |
| 8 | #include <Columns/ColumnConst.h> |
| 9 | #include <Columns/ColumnsCommon.h> |
| 10 | |
| 11 | #include <common/unaligned.h> |
| 12 | |
| 13 | #include <DataStreams/ColumnGathererStream.h> |
| 14 | |
| 15 | #include <Common/Exception.h> |
| 16 | #include <Common/Arena.h> |
| 17 | #include <Common/SipHash.h> |
| 18 | #include <Common/typeid_cast.h> |
| 19 | #include <Common/assert_cast.h> |
| 20 | |
| 21 | |
| 22 | namespace DB |
| 23 | { |
| 24 | |
/// Error codes used by the exceptions thrown in this file.
/// They are only declared here (extern); their definitions live outside this file.
namespace ErrorCodes
{
    extern const int ILLEGAL_COLUMN;
    extern const int NOT_IMPLEMENTED;
    extern const int BAD_ARGUMENTS;
    extern const int PARAMETER_OUT_OF_BOUND;
    extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
    extern const int LOGICAL_ERROR;
}
| 34 | |
| 35 | |
| 36 | ColumnArray::ColumnArray(MutableColumnPtr && nested_column, MutableColumnPtr && offsets_column) |
| 37 | : data(std::move(nested_column)), offsets(std::move(offsets_column)) |
| 38 | { |
| 39 | if (!typeid_cast<const ColumnOffsets *>(offsets.get())) |
| 40 | throw Exception("offsets_column must be a ColumnUInt64" , ErrorCodes::ILLEGAL_COLUMN); |
| 41 | |
| 42 | /** NOTE |
| 43 | * Arrays with constant value are possible and used in implementation of higher order functions (see FunctionReplicate). |
| 44 | * But in most cases, arrays with constant value are unexpected and code will work wrong. Use with caution. |
| 45 | */ |
| 46 | } |
| 47 | |
| 48 | ColumnArray::ColumnArray(MutableColumnPtr && nested_column) |
| 49 | : data(std::move(nested_column)) |
| 50 | { |
| 51 | if (!data->empty()) |
| 52 | throw Exception("Not empty data passed to ColumnArray, but no offsets passed" , ErrorCodes::ILLEGAL_COLUMN); |
| 53 | |
| 54 | offsets = ColumnOffsets::create(); |
| 55 | } |
| 56 | |
| 57 | |
| 58 | std::string ColumnArray::getName() const { return "Array(" + getData().getName() + ")" ; } |
| 59 | |
| 60 | |
| 61 | MutableColumnPtr ColumnArray::cloneResized(size_t to_size) const |
| 62 | { |
| 63 | auto res = ColumnArray::create(getData().cloneEmpty()); |
| 64 | |
| 65 | if (to_size == 0) |
| 66 | return res; |
| 67 | |
| 68 | size_t from_size = size(); |
| 69 | |
| 70 | if (to_size <= from_size) |
| 71 | { |
| 72 | /// Just cut column. |
| 73 | |
| 74 | res->getOffsets().assign(getOffsets().begin(), getOffsets().begin() + to_size); |
| 75 | res->getData().insertRangeFrom(getData(), 0, getOffsets()[to_size - 1]); |
| 76 | } |
| 77 | else |
| 78 | { |
| 79 | /// Copy column and append empty arrays for extra elements. |
| 80 | |
| 81 | Offset offset = 0; |
| 82 | if (from_size > 0) |
| 83 | { |
| 84 | res->getOffsets().assign(getOffsets().begin(), getOffsets().end()); |
| 85 | res->getData().insertRangeFrom(getData(), 0, getData().size()); |
| 86 | offset = getOffsets().back(); |
| 87 | } |
| 88 | |
| 89 | res->getOffsets().resize(to_size); |
| 90 | for (size_t i = from_size; i < to_size; ++i) |
| 91 | res->getOffsets()[i] = offset; |
| 92 | } |
| 93 | |
| 94 | return res; |
| 95 | } |
| 96 | |
| 97 | |
| 98 | size_t ColumnArray::size() const |
| 99 | { |
| 100 | return getOffsets().size(); |
| 101 | } |
| 102 | |
| 103 | |
| 104 | Field ColumnArray::operator[](size_t n) const |
| 105 | { |
| 106 | size_t offset = offsetAt(n); |
| 107 | size_t size = sizeAt(n); |
| 108 | Array res(size); |
| 109 | |
| 110 | for (size_t i = 0; i < size; ++i) |
| 111 | res[i] = getData()[offset + i]; |
| 112 | |
| 113 | return res; |
| 114 | } |
| 115 | |
| 116 | |
| 117 | void ColumnArray::get(size_t n, Field & res) const |
| 118 | { |
| 119 | size_t offset = offsetAt(n); |
| 120 | size_t size = sizeAt(n); |
| 121 | res = Array(size); |
| 122 | Array & res_arr = DB::get<Array &>(res); |
| 123 | |
| 124 | for (size_t i = 0; i < size; ++i) |
| 125 | getData().get(offset + i, res_arr[i]); |
| 126 | } |
| 127 | |
| 128 | |
| 129 | StringRef ColumnArray::getDataAt(size_t n) const |
| 130 | { |
| 131 | /** Returns the range of memory that covers all elements of the array. |
| 132 | * Works for arrays of fixed length values. |
| 133 | * For arrays of strings and arrays of arrays, the resulting chunk of memory may not be one-to-one correspondence with the elements, |
| 134 | * since it contains only the data laid in succession, but not the offsets. |
| 135 | */ |
| 136 | |
| 137 | size_t offset_of_first_elem = offsetAt(n); |
| 138 | StringRef first = getData().getDataAtWithTerminatingZero(offset_of_first_elem); |
| 139 | |
| 140 | size_t array_size = sizeAt(n); |
| 141 | if (array_size == 0) |
| 142 | return StringRef(first.data, 0); |
| 143 | |
| 144 | size_t offset_of_last_elem = getOffsets()[n] - 1; |
| 145 | StringRef last = getData().getDataAtWithTerminatingZero(offset_of_last_elem); |
| 146 | |
| 147 | return StringRef(first.data, last.data + last.size - first.data); |
| 148 | } |
| 149 | |
| 150 | |
| 151 | void ColumnArray::insertData(const char * pos, size_t length) |
| 152 | { |
| 153 | /** Similarly - only for arrays of fixed length values. |
| 154 | */ |
| 155 | IColumn * data_ = &getData(); |
| 156 | if (!data_->isFixedAndContiguous()) |
| 157 | throw Exception("Method insertData is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); |
| 158 | |
| 159 | size_t field_size = data_->sizeOfValueIfFixed(); |
| 160 | |
| 161 | const char * end = pos + length; |
| 162 | size_t elems = 0; |
| 163 | for (; pos + field_size <= end; pos += field_size, ++elems) |
| 164 | data_->insertData(pos, field_size); |
| 165 | |
| 166 | if (pos != end) |
| 167 | throw Exception("Incorrect length argument for method ColumnArray::insertData" , ErrorCodes::BAD_ARGUMENTS); |
| 168 | |
| 169 | getOffsets().push_back(getOffsets().back() + elems); |
| 170 | } |
| 171 | |
| 172 | |
| 173 | StringRef ColumnArray::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const |
| 174 | { |
| 175 | size_t array_size = sizeAt(n); |
| 176 | size_t offset = offsetAt(n); |
| 177 | |
| 178 | char * pos = arena.allocContinue(sizeof(array_size), begin); |
| 179 | memcpy(pos, &array_size, sizeof(array_size)); |
| 180 | |
| 181 | StringRef res(pos, sizeof(array_size)); |
| 182 | |
| 183 | for (size_t i = 0; i < array_size; ++i) |
| 184 | { |
| 185 | auto value_ref = getData().serializeValueIntoArena(offset + i, arena, begin); |
| 186 | res.data = value_ref.data - res.size; |
| 187 | res.size += value_ref.size; |
| 188 | } |
| 189 | |
| 190 | return res; |
| 191 | } |
| 192 | |
| 193 | |
| 194 | const char * ColumnArray::deserializeAndInsertFromArena(const char * pos) |
| 195 | { |
| 196 | size_t array_size = unalignedLoad<size_t>(pos); |
| 197 | pos += sizeof(array_size); |
| 198 | |
| 199 | for (size_t i = 0; i < array_size; ++i) |
| 200 | pos = getData().deserializeAndInsertFromArena(pos); |
| 201 | |
| 202 | getOffsets().push_back(getOffsets().back() + array_size); |
| 203 | return pos; |
| 204 | } |
| 205 | |
| 206 | |
| 207 | void ColumnArray::updateHashWithValue(size_t n, SipHash & hash) const |
| 208 | { |
| 209 | size_t array_size = sizeAt(n); |
| 210 | size_t offset = offsetAt(n); |
| 211 | |
| 212 | hash.update(array_size); |
| 213 | for (size_t i = 0; i < array_size; ++i) |
| 214 | getData().updateHashWithValue(offset + i, hash); |
| 215 | } |
| 216 | |
| 217 | |
| 218 | void ColumnArray::insert(const Field & x) |
| 219 | { |
| 220 | const Array & array = DB::get<const Array &>(x); |
| 221 | size_t size = array.size(); |
| 222 | for (size_t i = 0; i < size; ++i) |
| 223 | getData().insert(array[i]); |
| 224 | getOffsets().push_back(getOffsets().back() + size); |
| 225 | } |
| 226 | |
| 227 | |
| 228 | void ColumnArray::insertFrom(const IColumn & src_, size_t n) |
| 229 | { |
| 230 | const ColumnArray & src = assert_cast<const ColumnArray &>(src_); |
| 231 | size_t size = src.sizeAt(n); |
| 232 | size_t offset = src.offsetAt(n); |
| 233 | |
| 234 | getData().insertRangeFrom(src.getData(), offset, size); |
| 235 | getOffsets().push_back(getOffsets().back() + size); |
| 236 | } |
| 237 | |
| 238 | |
| 239 | void ColumnArray::insertDefault() |
| 240 | { |
| 241 | /// NOTE 1: We can use back() even if the array is empty (due to zero -1th element in PODArray). |
| 242 | /// NOTE 2: We cannot use reference in push_back, because reference get invalidated if array is reallocated. |
| 243 | auto last_offset = getOffsets().back(); |
| 244 | getOffsets().push_back(last_offset); |
| 245 | } |
| 246 | |
| 247 | |
| 248 | void ColumnArray::popBack(size_t n) |
| 249 | { |
| 250 | auto & offsets_data = getOffsets(); |
| 251 | size_t nested_n = offsets_data.back() - offsetAt(offsets_data.size() - n); |
| 252 | if (nested_n) |
| 253 | getData().popBack(nested_n); |
| 254 | offsets_data.resize_assume_reserved(offsets_data.size() - n); |
| 255 | } |
| 256 | |
| 257 | |
| 258 | int ColumnArray::compareAt(size_t n, size_t m, const IColumn & rhs_, int nan_direction_hint) const |
| 259 | { |
| 260 | const ColumnArray & rhs = assert_cast<const ColumnArray &>(rhs_); |
| 261 | |
| 262 | /// Suboptimal |
| 263 | size_t lhs_size = sizeAt(n); |
| 264 | size_t rhs_size = rhs.sizeAt(m); |
| 265 | size_t min_size = std::min(lhs_size, rhs_size); |
| 266 | for (size_t i = 0; i < min_size; ++i) |
| 267 | if (int res = getData().compareAt(offsetAt(n) + i, rhs.offsetAt(m) + i, *rhs.data.get(), nan_direction_hint)) |
| 268 | return res; |
| 269 | |
| 270 | return lhs_size < rhs_size |
| 271 | ? -1 |
| 272 | : (lhs_size == rhs_size |
| 273 | ? 0 |
| 274 | : 1); |
| 275 | } |
| 276 | |
| 277 | |
| 278 | namespace |
| 279 | { |
| 280 | template <bool positive> |
| 281 | struct less |
| 282 | { |
| 283 | const ColumnArray & parent; |
| 284 | int nan_direction_hint; |
| 285 | |
| 286 | less(const ColumnArray & parent_, int nan_direction_hint_) |
| 287 | : parent(parent_), nan_direction_hint(nan_direction_hint_) {} |
| 288 | |
| 289 | bool operator()(size_t lhs, size_t rhs) const |
| 290 | { |
| 291 | if (positive) |
| 292 | return parent.compareAt(lhs, rhs, parent, nan_direction_hint) < 0; |
| 293 | else |
| 294 | return parent.compareAt(lhs, rhs, parent, nan_direction_hint) > 0; |
| 295 | } |
| 296 | }; |
| 297 | } |
| 298 | |
| 299 | |
| 300 | void ColumnArray::reserve(size_t n) |
| 301 | { |
| 302 | getOffsets().reserve(n); |
| 303 | getData().reserve(n); /// The average size of arrays is not taken into account here. Or it is considered to be no more than 1. |
| 304 | } |
| 305 | |
| 306 | |
| 307 | size_t ColumnArray::byteSize() const |
| 308 | { |
| 309 | return getData().byteSize() + getOffsets().size() * sizeof(getOffsets()[0]); |
| 310 | } |
| 311 | |
| 312 | |
| 313 | size_t ColumnArray::allocatedBytes() const |
| 314 | { |
| 315 | return getData().allocatedBytes() + getOffsets().allocated_bytes(); |
| 316 | } |
| 317 | |
| 318 | |
| 319 | void ColumnArray::protect() |
| 320 | { |
| 321 | getData().protect(); |
| 322 | getOffsets().protect(); |
| 323 | } |
| 324 | |
| 325 | |
| 326 | bool ColumnArray::hasEqualOffsets(const ColumnArray & other) const |
| 327 | { |
| 328 | if (offsets == other.offsets) |
| 329 | return true; |
| 330 | |
| 331 | const Offsets & offsets1 = getOffsets(); |
| 332 | const Offsets & offsets2 = other.getOffsets(); |
| 333 | return offsets1.size() == offsets2.size() |
| 334 | && (offsets1.size() == 0 || 0 == memcmp(offsets1.data(), offsets2.data(), sizeof(offsets1[0]) * offsets1.size())); |
| 335 | } |
| 336 | |
| 337 | |
| 338 | ColumnPtr ColumnArray::convertToFullColumnIfConst() const |
| 339 | { |
| 340 | /// It is possible to have an array with constant data and non-constant offsets. |
| 341 | /// Example is the result of expression: replicate('hello', [1]) |
| 342 | return ColumnArray::create(data->convertToFullColumnIfConst(), offsets); |
| 343 | } |
| 344 | |
| 345 | void ColumnArray::getExtremes(Field & min, Field & max) const |
| 346 | { |
| 347 | min = Array(); |
| 348 | max = Array(); |
| 349 | |
| 350 | size_t col_size = size(); |
| 351 | |
| 352 | if (col_size == 0) |
| 353 | return; |
| 354 | |
| 355 | size_t min_idx = 0; |
| 356 | size_t max_idx = 0; |
| 357 | |
| 358 | for (size_t i = 1; i < col_size; ++i) |
| 359 | { |
| 360 | if (compareAt(i, min_idx, *this, /* nan_direction_hint = */ 1) < 0) |
| 361 | min_idx = i; |
| 362 | else if (compareAt(i, max_idx, *this, /* nan_direction_hint = */ -1) > 0) |
| 363 | max_idx = i; |
| 364 | } |
| 365 | |
| 366 | get(min_idx, min); |
| 367 | get(max_idx, max); |
| 368 | } |
| 369 | |
| 370 | |
| 371 | void ColumnArray::insertRangeFrom(const IColumn & src, size_t start, size_t length) |
| 372 | { |
| 373 | if (length == 0) |
| 374 | return; |
| 375 | |
| 376 | const ColumnArray & src_concrete = assert_cast<const ColumnArray &>(src); |
| 377 | |
| 378 | if (start + length > src_concrete.getOffsets().size()) |
| 379 | throw Exception("Parameter out of bound in ColumnArray::insertRangeFrom method. [start(" + std::to_string(start) + ") + length(" + std::to_string(length) + ") > offsets.size(" + std::to_string(src_concrete.getOffsets().size()) + ")]" , |
| 380 | ErrorCodes::PARAMETER_OUT_OF_BOUND); |
| 381 | |
| 382 | size_t nested_offset = src_concrete.offsetAt(start); |
| 383 | size_t nested_length = src_concrete.getOffsets()[start + length - 1] - nested_offset; |
| 384 | |
| 385 | getData().insertRangeFrom(src_concrete.getData(), nested_offset, nested_length); |
| 386 | |
| 387 | Offsets & cur_offsets = getOffsets(); |
| 388 | const Offsets & src_offsets = src_concrete.getOffsets(); |
| 389 | |
| 390 | if (start == 0 && cur_offsets.empty()) |
| 391 | { |
| 392 | cur_offsets.assign(src_offsets.begin(), src_offsets.begin() + length); |
| 393 | } |
| 394 | else |
| 395 | { |
| 396 | size_t old_size = cur_offsets.size(); |
| 397 | size_t prev_max_offset = old_size ? cur_offsets.back() : 0; |
| 398 | cur_offsets.resize(old_size + length); |
| 399 | |
| 400 | for (size_t i = 0; i < length; ++i) |
| 401 | cur_offsets[old_size + i] = src_offsets[start + i] - nested_offset + prev_max_offset; |
| 402 | } |
| 403 | } |
| 404 | |
| 405 | |
| 406 | ColumnPtr ColumnArray::filter(const Filter & filt, ssize_t result_size_hint) const |
| 407 | { |
| 408 | if (typeid_cast<const ColumnUInt8 *>(data.get())) return filterNumber<UInt8>(filt, result_size_hint); |
| 409 | if (typeid_cast<const ColumnUInt16 *>(data.get())) return filterNumber<UInt16>(filt, result_size_hint); |
| 410 | if (typeid_cast<const ColumnUInt32 *>(data.get())) return filterNumber<UInt32>(filt, result_size_hint); |
| 411 | if (typeid_cast<const ColumnUInt64 *>(data.get())) return filterNumber<UInt64>(filt, result_size_hint); |
| 412 | if (typeid_cast<const ColumnInt8 *>(data.get())) return filterNumber<Int8>(filt, result_size_hint); |
| 413 | if (typeid_cast<const ColumnInt16 *>(data.get())) return filterNumber<Int16>(filt, result_size_hint); |
| 414 | if (typeid_cast<const ColumnInt32 *>(data.get())) return filterNumber<Int32>(filt, result_size_hint); |
| 415 | if (typeid_cast<const ColumnInt64 *>(data.get())) return filterNumber<Int64>(filt, result_size_hint); |
| 416 | if (typeid_cast<const ColumnFloat32 *>(data.get())) return filterNumber<Float32>(filt, result_size_hint); |
| 417 | if (typeid_cast<const ColumnFloat64 *>(data.get())) return filterNumber<Float64>(filt, result_size_hint); |
| 418 | if (typeid_cast<const ColumnString *>(data.get())) return filterString(filt, result_size_hint); |
| 419 | if (typeid_cast<const ColumnTuple *>(data.get())) return filterTuple(filt, result_size_hint); |
| 420 | if (typeid_cast<const ColumnNullable *>(data.get())) return filterNullable(filt, result_size_hint); |
| 421 | return filterGeneric(filt, result_size_hint); |
| 422 | } |
| 423 | |
| 424 | template <typename T> |
| 425 | ColumnPtr ColumnArray::filterNumber(const Filter & filt, ssize_t result_size_hint) const |
| 426 | { |
| 427 | if (getOffsets().size() == 0) |
| 428 | return ColumnArray::create(data); |
| 429 | |
| 430 | auto res = ColumnArray::create(data->cloneEmpty()); |
| 431 | |
| 432 | auto & res_elems = assert_cast<ColumnVector<T> &>(res->getData()).getData(); |
| 433 | Offsets & res_offsets = res->getOffsets(); |
| 434 | |
| 435 | filterArraysImpl<T>(assert_cast<const ColumnVector<T> &>(*data).getData(), getOffsets(), res_elems, res_offsets, filt, result_size_hint); |
| 436 | return res; |
| 437 | } |
| 438 | |
/** Filter implementation for arrays of strings (nested column is ColumnString).
  * Three levels are processed in one pass: array offsets, string offsets inside the nested column, and string characters.
  * For every kept array its characters are copied with a single memcpy and its string offsets are rebased
  *  to the current end of the result.
  * Throws SIZES_OF_COLUMNS_DOESNT_MATCH if the filter size differs from the number of rows.
  */
ColumnPtr ColumnArray::filterString(const Filter & filt, ssize_t result_size_hint) const
{
    size_t col_size = getOffsets().size();
    if (col_size != filt.size())
        throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);

    if (0 == col_size)
        return ColumnArray::create(data);

    auto res = ColumnArray::create(data->cloneEmpty());

    const ColumnString & src_string = typeid_cast<const ColumnString &>(*data);
    const ColumnString::Chars & src_chars = src_string.getChars();
    const Offsets & src_string_offsets = src_string.getOffsets();
    const Offsets & src_offsets = getOffsets();

    ColumnString::Chars & res_chars = typeid_cast<ColumnString &>(res->getData()).getChars();
    Offsets & res_string_offsets = typeid_cast<ColumnString &>(res->getData()).getOffsets();
    Offsets & res_offsets = res->getOffsets();

    if (result_size_hint < 0)    /// Other cases are not considered.
    {
        /// A negative hint: reserve as much as the source occupies.
        res_chars.reserve(src_chars.size());
        res_string_offsets.reserve(src_string_offsets.size());
        res_offsets.reserve(col_size);
    }

    /// End of the previous source array: in elements (strings) and in characters.
    Offset prev_src_offset = 0;
    Offset prev_src_string_offset = 0;

    /// End of the last array written to the result: in elements (strings) and in characters.
    Offset prev_res_offset = 0;
    Offset prev_res_string_offset = 0;

    for (size_t i = 0; i < col_size; ++i)
    {
        /// Number of rows in the array.
        size_t array_size = src_offsets[i] - prev_src_offset;

        if (filt[i])
        {
            /// If the array is not empty - copy content.
            if (array_size)
            {
                /// All characters of the array lie contiguously between the end of the previous array
                /// and the end of the last string of this array.
                size_t chars_to_copy = src_string_offsets[array_size + prev_src_offset - 1] - prev_src_string_offset;
                size_t res_chars_prev_size = res_chars.size();
                res_chars.resize(res_chars_prev_size + chars_to_copy);
                memcpy(&res_chars[res_chars_prev_size], &src_chars[prev_src_string_offset], chars_to_copy);

                /// Shift the string offsets from source coordinates to result coordinates.
                for (size_t j = 0; j < array_size; ++j)
                    res_string_offsets.push_back(src_string_offsets[j + prev_src_offset] + prev_res_string_offset - prev_src_string_offset);

                prev_res_string_offset = res_string_offsets.back();
            }

            /// An empty array is kept too: it only repeats the previous result offset.
            prev_res_offset += array_size;
            res_offsets.push_back(prev_res_offset);
        }

        /// Advance the source positions regardless of whether the array was kept.
        if (array_size)
        {
            prev_src_offset += array_size;
            prev_src_string_offset = src_string_offsets[prev_src_offset - 1];
        }
    }

    return res;
}
| 506 | |
| 507 | ColumnPtr ColumnArray::filterGeneric(const Filter & filt, ssize_t result_size_hint) const |
| 508 | { |
| 509 | size_t size = getOffsets().size(); |
| 510 | if (size != filt.size()) |
| 511 | throw Exception("Size of filter doesn't match size of column." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
| 512 | |
| 513 | if (size == 0) |
| 514 | return ColumnArray::create(data); |
| 515 | |
| 516 | Filter nested_filt(getOffsets().back()); |
| 517 | for (size_t i = 0; i < size; ++i) |
| 518 | { |
| 519 | if (filt[i]) |
| 520 | memset(&nested_filt[offsetAt(i)], 1, sizeAt(i)); |
| 521 | else |
| 522 | memset(&nested_filt[offsetAt(i)], 0, sizeAt(i)); |
| 523 | } |
| 524 | |
| 525 | auto res = ColumnArray::create(data->cloneEmpty()); |
| 526 | |
| 527 | ssize_t nested_result_size_hint = 0; |
| 528 | if (result_size_hint < 0) |
| 529 | nested_result_size_hint = result_size_hint; |
| 530 | else if (result_size_hint && result_size_hint < 1000000000 && data->size() < 1000000000) /// Avoid overflow. |
| 531 | nested_result_size_hint = result_size_hint * data->size() / size; |
| 532 | |
| 533 | res->data = data->filter(nested_filt, nested_result_size_hint); |
| 534 | |
| 535 | Offsets & res_offsets = res->getOffsets(); |
| 536 | if (result_size_hint) |
| 537 | res_offsets.reserve(result_size_hint > 0 ? result_size_hint : size); |
| 538 | |
| 539 | size_t current_offset = 0; |
| 540 | for (size_t i = 0; i < size; ++i) |
| 541 | { |
| 542 | if (filt[i]) |
| 543 | { |
| 544 | current_offset += sizeAt(i); |
| 545 | res_offsets.push_back(current_offset); |
| 546 | } |
| 547 | } |
| 548 | |
| 549 | return res; |
| 550 | } |
| 551 | |
| 552 | ColumnPtr ColumnArray::filterNullable(const Filter & filt, ssize_t result_size_hint) const |
| 553 | { |
| 554 | if (getOffsets().size() == 0) |
| 555 | return ColumnArray::create(data); |
| 556 | |
| 557 | const ColumnNullable & nullable_elems = assert_cast<const ColumnNullable &>(*data); |
| 558 | |
| 559 | auto array_of_nested = ColumnArray::create(nullable_elems.getNestedColumnPtr(), offsets); |
| 560 | auto filtered_array_of_nested_owner = array_of_nested->filter(filt, result_size_hint); |
| 561 | auto & filtered_array_of_nested = assert_cast<const ColumnArray &>(*filtered_array_of_nested_owner); |
| 562 | auto & filtered_offsets = filtered_array_of_nested.getOffsetsPtr(); |
| 563 | |
| 564 | auto res_null_map = ColumnUInt8::create(); |
| 565 | |
| 566 | filterArraysImplOnlyData(nullable_elems.getNullMapData(), getOffsets(), res_null_map->getData(), filt, result_size_hint); |
| 567 | |
| 568 | return ColumnArray::create( |
| 569 | ColumnNullable::create( |
| 570 | filtered_array_of_nested.getDataPtr(), |
| 571 | std::move(res_null_map)), |
| 572 | filtered_offsets); |
| 573 | } |
| 574 | |
| 575 | ColumnPtr ColumnArray::filterTuple(const Filter & filt, ssize_t result_size_hint) const |
| 576 | { |
| 577 | if (getOffsets().size() == 0) |
| 578 | return ColumnArray::create(data); |
| 579 | |
| 580 | const ColumnTuple & tuple = assert_cast<const ColumnTuple &>(*data); |
| 581 | |
| 582 | /// Make temporary arrays for each components of Tuple, then filter and collect back. |
| 583 | |
| 584 | size_t tuple_size = tuple.tupleSize(); |
| 585 | |
| 586 | if (tuple_size == 0) |
| 587 | throw Exception("Logical error: empty tuple" , ErrorCodes::LOGICAL_ERROR); |
| 588 | |
| 589 | Columns temporary_arrays(tuple_size); |
| 590 | for (size_t i = 0; i < tuple_size; ++i) |
| 591 | temporary_arrays[i] = ColumnArray(tuple.getColumns()[i]->assumeMutable(), getOffsetsPtr()->assumeMutable()) |
| 592 | .filter(filt, result_size_hint); |
| 593 | |
| 594 | Columns tuple_columns(tuple_size); |
| 595 | for (size_t i = 0; i < tuple_size; ++i) |
| 596 | tuple_columns[i] = assert_cast<const ColumnArray &>(*temporary_arrays[i]).getDataPtr(); |
| 597 | |
| 598 | return ColumnArray::create( |
| 599 | ColumnTuple::create(tuple_columns), |
| 600 | assert_cast<const ColumnArray &>(*temporary_arrays.front()).getOffsetsPtr()); |
| 601 | } |
| 602 | |
| 603 | |
| 604 | ColumnPtr ColumnArray::permute(const Permutation & perm, size_t limit) const |
| 605 | { |
| 606 | size_t size = getOffsets().size(); |
| 607 | |
| 608 | if (limit == 0) |
| 609 | limit = size; |
| 610 | else |
| 611 | limit = std::min(size, limit); |
| 612 | |
| 613 | if (perm.size() < limit) |
| 614 | throw Exception("Size of permutation is less than required." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
| 615 | |
| 616 | if (limit == 0) |
| 617 | return ColumnArray::create(data); |
| 618 | |
| 619 | Permutation nested_perm(getOffsets().back()); |
| 620 | |
| 621 | auto res = ColumnArray::create(data->cloneEmpty()); |
| 622 | |
| 623 | Offsets & res_offsets = res->getOffsets(); |
| 624 | res_offsets.resize(limit); |
| 625 | size_t current_offset = 0; |
| 626 | |
| 627 | for (size_t i = 0; i < limit; ++i) |
| 628 | { |
| 629 | for (size_t j = 0; j < sizeAt(perm[i]); ++j) |
| 630 | nested_perm[current_offset + j] = offsetAt(perm[i]) + j; |
| 631 | current_offset += sizeAt(perm[i]); |
| 632 | res_offsets[i] = current_offset; |
| 633 | } |
| 634 | |
| 635 | if (current_offset != 0) |
| 636 | res->data = data->permute(nested_perm, current_offset); |
| 637 | |
| 638 | return res; |
| 639 | } |
| 640 | |
| 641 | ColumnPtr ColumnArray::index(const IColumn & indexes, size_t limit) const |
| 642 | { |
| 643 | return selectIndexImpl(*this, indexes, limit); |
| 644 | } |
| 645 | |
| 646 | template <typename T> |
| 647 | ColumnPtr ColumnArray::indexImpl(const PaddedPODArray<T> & indexes, size_t limit) const |
| 648 | { |
| 649 | if (limit == 0) |
| 650 | return ColumnArray::create(data); |
| 651 | |
| 652 | /// Convert indexes to UInt64 in case of overflow. |
| 653 | auto nested_indexes_column = ColumnUInt64::create(); |
| 654 | PaddedPODArray<UInt64> & nested_indexes = nested_indexes_column->getData(); |
| 655 | nested_indexes.reserve(getOffsets().back()); |
| 656 | |
| 657 | auto res = ColumnArray::create(data->cloneEmpty()); |
| 658 | |
| 659 | Offsets & res_offsets = res->getOffsets(); |
| 660 | res_offsets.resize(limit); |
| 661 | size_t current_offset = 0; |
| 662 | |
| 663 | for (size_t i = 0; i < limit; ++i) |
| 664 | { |
| 665 | for (size_t j = 0; j < sizeAt(indexes[i]); ++j) |
| 666 | nested_indexes.push_back(offsetAt(indexes[i]) + j); |
| 667 | current_offset += sizeAt(indexes[i]); |
| 668 | res_offsets[i] = current_offset; |
| 669 | } |
| 670 | |
| 671 | if (current_offset != 0) |
| 672 | res->data = data->index(*nested_indexes_column, current_offset); |
| 673 | |
| 674 | return res; |
| 675 | } |
| 676 | |
/// NOTE(review): the macro is defined outside this file; presumably it explicitly instantiates
/// ColumnArray::indexImpl for the supported index types - confirm against its definition.
INSTANTIATE_INDEX_IMPL(ColumnArray)
| 678 | |
| 679 | void ColumnArray::getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const |
| 680 | { |
| 681 | size_t s = size(); |
| 682 | if (limit >= s) |
| 683 | limit = 0; |
| 684 | |
| 685 | res.resize(s); |
| 686 | for (size_t i = 0; i < s; ++i) |
| 687 | res[i] = i; |
| 688 | |
| 689 | if (limit) |
| 690 | { |
| 691 | if (reverse) |
| 692 | std::partial_sort(res.begin(), res.begin() + limit, res.end(), less<false>(*this, nan_direction_hint)); |
| 693 | else |
| 694 | std::partial_sort(res.begin(), res.begin() + limit, res.end(), less<true>(*this, nan_direction_hint)); |
| 695 | } |
| 696 | else |
| 697 | { |
| 698 | if (reverse) |
| 699 | std::sort(res.begin(), res.end(), less<false>(*this, nan_direction_hint)); |
| 700 | else |
| 701 | std::sort(res.begin(), res.end(), less<true>(*this, nan_direction_hint)); |
| 702 | } |
| 703 | } |
| 704 | |
| 705 | |
| 706 | ColumnPtr ColumnArray::replicate(const Offsets & replicate_offsets) const |
| 707 | { |
| 708 | if (replicate_offsets.empty()) |
| 709 | return cloneEmpty(); |
| 710 | |
| 711 | if (typeid_cast<const ColumnUInt8 *>(data.get())) return replicateNumber<UInt8>(replicate_offsets); |
| 712 | if (typeid_cast<const ColumnUInt16 *>(data.get())) return replicateNumber<UInt16>(replicate_offsets); |
| 713 | if (typeid_cast<const ColumnUInt32 *>(data.get())) return replicateNumber<UInt32>(replicate_offsets); |
| 714 | if (typeid_cast<const ColumnUInt64 *>(data.get())) return replicateNumber<UInt64>(replicate_offsets); |
| 715 | if (typeid_cast<const ColumnInt8 *>(data.get())) return replicateNumber<Int8>(replicate_offsets); |
| 716 | if (typeid_cast<const ColumnInt16 *>(data.get())) return replicateNumber<Int16>(replicate_offsets); |
| 717 | if (typeid_cast<const ColumnInt32 *>(data.get())) return replicateNumber<Int32>(replicate_offsets); |
| 718 | if (typeid_cast<const ColumnInt64 *>(data.get())) return replicateNumber<Int64>(replicate_offsets); |
| 719 | if (typeid_cast<const ColumnFloat32 *>(data.get())) return replicateNumber<Float32>(replicate_offsets); |
| 720 | if (typeid_cast<const ColumnFloat64 *>(data.get())) return replicateNumber<Float64>(replicate_offsets); |
| 721 | if (typeid_cast<const ColumnString *>(data.get())) return replicateString(replicate_offsets); |
| 722 | if (typeid_cast<const ColumnConst *>(data.get())) return replicateConst(replicate_offsets); |
| 723 | if (typeid_cast<const ColumnNullable *>(data.get())) return replicateNullable(replicate_offsets); |
| 724 | if (typeid_cast<const ColumnTuple *>(data.get())) return replicateTuple(replicate_offsets); |
| 725 | return replicateGeneric(replicate_offsets); |
| 726 | } |
| 727 | |
| 728 | |
| 729 | template <typename T> |
| 730 | ColumnPtr ColumnArray::replicateNumber(const Offsets & replicate_offsets) const |
| 731 | { |
| 732 | size_t col_size = size(); |
| 733 | if (col_size != replicate_offsets.size()) |
| 734 | throw Exception("Size of offsets doesn't match size of column." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
| 735 | |
| 736 | MutableColumnPtr res = cloneEmpty(); |
| 737 | |
| 738 | if (0 == col_size) |
| 739 | return res; |
| 740 | |
| 741 | ColumnArray & res_ = typeid_cast<ColumnArray &>(*res); |
| 742 | |
| 743 | const typename ColumnVector<T>::Container & src_data = typeid_cast<const ColumnVector<T> &>(*data).getData(); |
| 744 | const Offsets & src_offsets = getOffsets(); |
| 745 | |
| 746 | typename ColumnVector<T>::Container & res_data = typeid_cast<ColumnVector<T> &>(res_.getData()).getData(); |
| 747 | Offsets & res_offsets = res_.getOffsets(); |
| 748 | |
| 749 | res_data.reserve(data->size() / col_size * replicate_offsets.back()); |
| 750 | res_offsets.reserve(replicate_offsets.back()); |
| 751 | |
| 752 | Offset prev_replicate_offset = 0; |
| 753 | Offset prev_data_offset = 0; |
| 754 | Offset current_new_offset = 0; |
| 755 | |
| 756 | for (size_t i = 0; i < col_size; ++i) |
| 757 | { |
| 758 | size_t size_to_replicate = replicate_offsets[i] - prev_replicate_offset; |
| 759 | size_t value_size = src_offsets[i] - prev_data_offset; |
| 760 | |
| 761 | for (size_t j = 0; j < size_to_replicate; ++j) |
| 762 | { |
| 763 | current_new_offset += value_size; |
| 764 | res_offsets.push_back(current_new_offset); |
| 765 | |
| 766 | if (value_size) |
| 767 | { |
| 768 | res_data.resize(res_data.size() + value_size); |
| 769 | memcpy(&res_data[res_data.size() - value_size], &src_data[prev_data_offset], value_size * sizeof(T)); |
| 770 | } |
| 771 | } |
| 772 | |
| 773 | prev_replicate_offset = replicate_offsets[i]; |
| 774 | prev_data_offset = src_offsets[i]; |
| 775 | } |
| 776 | |
| 777 | return res; |
| 778 | } |
| 779 | |
| 780 | |
/** Replicate implementation for arrays of strings (nested column is ColumnString).
  * Row i is repeated (replicate_offsets[i] - replicate_offsets[i - 1]) times. For every copy the array offset
  *  and the string offsets are appended, and the characters of the whole array are copied in one piece.
  * Throws SIZES_OF_COLUMNS_DOESNT_MATCH if replicate_offsets has a different size than the column.
  */
ColumnPtr ColumnArray::replicateString(const Offsets & replicate_offsets) const
{
    size_t col_size = size();
    if (col_size != replicate_offsets.size())
        throw Exception("Size of offsets doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);

    MutableColumnPtr res = cloneEmpty();

    if (0 == col_size)
        return res;

    ColumnArray & res_ = assert_cast<ColumnArray &>(*res);

    const ColumnString & src_string = typeid_cast<const ColumnString &>(*data);
    const ColumnString::Chars & src_chars = src_string.getChars();
    const Offsets & src_string_offsets = src_string.getOffsets();
    const Offsets & src_offsets = getOffsets();

    ColumnString::Chars & res_chars = typeid_cast<ColumnString &>(res_.getData()).getChars();
    Offsets & res_string_offsets = typeid_cast<ColumnString &>(res_.getData()).getOffsets();
    Offsets & res_offsets = res_.getOffsets();

    /// Estimates of the result sizes: per-row averages multiplied by the total number of resulting rows.
    res_chars.reserve(src_chars.size() / col_size * replicate_offsets.back());
    res_string_offsets.reserve(src_string_offsets.size() / col_size * replicate_offsets.back());
    res_offsets.reserve(replicate_offsets.back());

    Offset prev_replicate_offset = 0;

    /// End of the previous source array: in elements (strings) and in characters.
    Offset prev_src_offset = 0;
    Offset prev_src_string_offset = 0;

    /// Current end of the result: in elements (strings) and in characters.
    Offset current_res_offset = 0;
    Offset current_res_string_offset = 0;

    for (size_t i = 0; i < col_size; ++i)
    {
        /// How many times to replicate the array.
        size_t size_to_replicate = replicate_offsets[i] - prev_replicate_offset;
        /// The number of strings in the array.
        size_t value_size = src_offsets[i] - prev_src_offset;
        /// Number of characters in strings of the array, including zero bytes.
        size_t sum_chars_size = src_string_offsets[prev_src_offset + value_size - 1] - prev_src_string_offset;  /// -1th index is Ok, see PaddedPODArray.

        for (size_t j = 0; j < size_to_replicate; ++j)
        {
            current_res_offset += value_size;
            res_offsets.push_back(current_res_offset);

            /// Walk over the strings of the source array again for every copy, starting from its first character.
            size_t prev_src_string_offset_local = prev_src_string_offset;
            for (size_t k = 0; k < value_size; ++k)
            {
                /// Size of single string.
                size_t chars_size = src_string_offsets[k + prev_src_offset] - prev_src_string_offset_local;

                current_res_string_offset += chars_size;
                res_string_offsets.push_back(current_res_string_offset);

                prev_src_string_offset_local += chars_size;
            }

            if (sum_chars_size)
            {
                /// Copies the characters of the array of strings.
                res_chars.resize(res_chars.size() + sum_chars_size);
                memcpySmallAllowReadWriteOverflow15(
                    &res_chars[res_chars.size() - sum_chars_size], &src_chars[prev_src_string_offset], sum_chars_size);
            }
        }

        /// Move on to the next source array.
        prev_replicate_offset = replicate_offsets[i];
        prev_src_offset = src_offsets[i];
        prev_src_string_offset += sum_chars_size;
    }

    return res;
}
| 857 | |
| 858 | |
| 859 | ColumnPtr ColumnArray::replicateConst(const Offsets & replicate_offsets) const |
| 860 | { |
| 861 | size_t col_size = size(); |
| 862 | if (col_size != replicate_offsets.size()) |
| 863 | throw Exception("Size of offsets doesn't match size of column." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
| 864 | |
| 865 | if (0 == col_size) |
| 866 | return cloneEmpty(); |
| 867 | |
| 868 | const Offsets & src_offsets = getOffsets(); |
| 869 | |
| 870 | auto res_column_offsets = ColumnOffsets::create(); |
| 871 | Offsets & res_offsets = res_column_offsets->getData(); |
| 872 | res_offsets.reserve(replicate_offsets.back()); |
| 873 | |
| 874 | Offset prev_replicate_offset = 0; |
| 875 | Offset prev_data_offset = 0; |
| 876 | Offset current_new_offset = 0; |
| 877 | |
| 878 | for (size_t i = 0; i < col_size; ++i) |
| 879 | { |
| 880 | size_t size_to_replicate = replicate_offsets[i] - prev_replicate_offset; |
| 881 | size_t value_size = src_offsets[i] - prev_data_offset; |
| 882 | |
| 883 | for (size_t j = 0; j < size_to_replicate; ++j) |
| 884 | { |
| 885 | current_new_offset += value_size; |
| 886 | res_offsets.push_back(current_new_offset); |
| 887 | } |
| 888 | |
| 889 | prev_replicate_offset = replicate_offsets[i]; |
| 890 | prev_data_offset = src_offsets[i]; |
| 891 | } |
| 892 | |
| 893 | return ColumnArray::create(getData().cloneResized(current_new_offset), std::move(res_column_offsets)); |
| 894 | } |
| 895 | |
| 896 | |
| 897 | ColumnPtr ColumnArray::replicateGeneric(const Offsets & replicate_offsets) const |
| 898 | { |
| 899 | size_t col_size = size(); |
| 900 | if (col_size != replicate_offsets.size()) |
| 901 | throw Exception("Size of offsets doesn't match size of column." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
| 902 | |
| 903 | MutableColumnPtr res = cloneEmpty(); |
| 904 | ColumnArray & res_concrete = assert_cast<ColumnArray &>(*res); |
| 905 | |
| 906 | if (0 == col_size) |
| 907 | return res; |
| 908 | |
| 909 | IColumn::Offset prev_offset = 0; |
| 910 | for (size_t i = 0; i < col_size; ++i) |
| 911 | { |
| 912 | size_t size_to_replicate = replicate_offsets[i] - prev_offset; |
| 913 | prev_offset = replicate_offsets[i]; |
| 914 | |
| 915 | for (size_t j = 0; j < size_to_replicate; ++j) |
| 916 | res_concrete.insertFrom(*this, i); |
| 917 | } |
| 918 | |
| 919 | return res; |
| 920 | } |
| 921 | |
| 922 | |
| 923 | ColumnPtr ColumnArray::replicateNullable(const Offsets & replicate_offsets) const |
| 924 | { |
| 925 | const ColumnNullable & nullable = assert_cast<const ColumnNullable &>(*data); |
| 926 | |
| 927 | /// Make temporary arrays for each components of Nullable. Then replicate them independently and collect back to result. |
| 928 | /// NOTE Offsets are calculated twice and it is redundant. |
| 929 | |
| 930 | auto array_of_nested = ColumnArray(nullable.getNestedColumnPtr()->assumeMutable(), getOffsetsPtr()->assumeMutable()) |
| 931 | .replicate(replicate_offsets); |
| 932 | auto array_of_null_map = ColumnArray(nullable.getNullMapColumnPtr()->assumeMutable(), getOffsetsPtr()->assumeMutable()) |
| 933 | .replicate(replicate_offsets); |
| 934 | |
| 935 | return ColumnArray::create( |
| 936 | ColumnNullable::create( |
| 937 | assert_cast<const ColumnArray &>(*array_of_nested).getDataPtr(), |
| 938 | assert_cast<const ColumnArray &>(*array_of_null_map).getDataPtr()), |
| 939 | assert_cast<const ColumnArray &>(*array_of_nested).getOffsetsPtr()); |
| 940 | } |
| 941 | |
| 942 | |
| 943 | ColumnPtr ColumnArray::replicateTuple(const Offsets & replicate_offsets) const |
| 944 | { |
| 945 | const ColumnTuple & tuple = assert_cast<const ColumnTuple &>(*data); |
| 946 | |
| 947 | /// Make temporary arrays for each components of Tuple. In the same way as for Nullable. |
| 948 | |
| 949 | size_t tuple_size = tuple.tupleSize(); |
| 950 | |
| 951 | if (tuple_size == 0) |
| 952 | throw Exception("Logical error: empty tuple" , ErrorCodes::LOGICAL_ERROR); |
| 953 | |
| 954 | Columns temporary_arrays(tuple_size); |
| 955 | for (size_t i = 0; i < tuple_size; ++i) |
| 956 | temporary_arrays[i] = ColumnArray(tuple.getColumns()[i]->assumeMutable(), getOffsetsPtr()->assumeMutable()) |
| 957 | .replicate(replicate_offsets); |
| 958 | |
| 959 | Columns tuple_columns(tuple_size); |
| 960 | for (size_t i = 0; i < tuple_size; ++i) |
| 961 | tuple_columns[i] = assert_cast<const ColumnArray &>(*temporary_arrays[i]).getDataPtr(); |
| 962 | |
| 963 | return ColumnArray::create( |
| 964 | ColumnTuple::create(tuple_columns), |
| 965 | assert_cast<const ColumnArray &>(*temporary_arrays.front()).getOffsetsPtr()); |
| 966 | } |
| 967 | |
| 968 | |
| 969 | void ColumnArray::gather(ColumnGathererStream & gatherer) |
| 970 | { |
| 971 | gatherer.gather(*this); |
| 972 | } |
| 973 | |
| 974 | } |
| 975 | |