1 | #include <string.h> // memcpy |
2 | |
3 | #include <Columns/ColumnArray.h> |
4 | #include <Columns/ColumnsNumber.h> |
5 | #include <Columns/ColumnString.h> |
6 | #include <Columns/ColumnTuple.h> |
7 | #include <Columns/ColumnNullable.h> |
8 | #include <Columns/ColumnConst.h> |
9 | #include <Columns/ColumnsCommon.h> |
10 | |
11 | #include <common/unaligned.h> |
12 | |
13 | #include <DataStreams/ColumnGathererStream.h> |
14 | |
15 | #include <Common/Exception.h> |
16 | #include <Common/Arena.h> |
17 | #include <Common/SipHash.h> |
18 | #include <Common/typeid_cast.h> |
19 | #include <Common/assert_cast.h> |
20 | |
21 | |
22 | namespace DB |
23 | { |
24 | |
25 | namespace ErrorCodes |
26 | { |
27 | extern const int ILLEGAL_COLUMN; |
28 | extern const int NOT_IMPLEMENTED; |
29 | extern const int BAD_ARGUMENTS; |
30 | extern const int PARAMETER_OUT_OF_BOUND; |
31 | extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; |
32 | extern const int LOGICAL_ERROR; |
33 | } |
34 | |
35 | |
36 | ColumnArray::ColumnArray(MutableColumnPtr && nested_column, MutableColumnPtr && offsets_column) |
37 | : data(std::move(nested_column)), offsets(std::move(offsets_column)) |
38 | { |
39 | if (!typeid_cast<const ColumnOffsets *>(offsets.get())) |
40 | throw Exception("offsets_column must be a ColumnUInt64" , ErrorCodes::ILLEGAL_COLUMN); |
41 | |
    /** NOTE
      * Arrays with a constant value are possible and are used in the implementation of higher order functions (see FunctionReplicate).
      * But in most cases arrays with a constant value are unexpected, and the code will behave incorrectly for them. Use with caution.
      */
46 | } |
47 | |
48 | ColumnArray::ColumnArray(MutableColumnPtr && nested_column) |
49 | : data(std::move(nested_column)) |
50 | { |
51 | if (!data->empty()) |
52 | throw Exception("Not empty data passed to ColumnArray, but no offsets passed" , ErrorCodes::ILLEGAL_COLUMN); |
53 | |
54 | offsets = ColumnOffsets::create(); |
55 | } |
56 | |
57 | |
std::string ColumnArray::getName() const { return "Array(" + getData().getName() + ")"; }
59 | |
60 | |
61 | MutableColumnPtr ColumnArray::cloneResized(size_t to_size) const |
62 | { |
63 | auto res = ColumnArray::create(getData().cloneEmpty()); |
64 | |
65 | if (to_size == 0) |
66 | return res; |
67 | |
68 | size_t from_size = size(); |
69 | |
70 | if (to_size <= from_size) |
71 | { |
72 | /// Just cut column. |
73 | |
74 | res->getOffsets().assign(getOffsets().begin(), getOffsets().begin() + to_size); |
75 | res->getData().insertRangeFrom(getData(), 0, getOffsets()[to_size - 1]); |
76 | } |
77 | else |
78 | { |
79 | /// Copy column and append empty arrays for extra elements. |
80 | |
81 | Offset offset = 0; |
82 | if (from_size > 0) |
83 | { |
84 | res->getOffsets().assign(getOffsets().begin(), getOffsets().end()); |
85 | res->getData().insertRangeFrom(getData(), 0, getData().size()); |
86 | offset = getOffsets().back(); |
87 | } |
88 | |
89 | res->getOffsets().resize(to_size); |
90 | for (size_t i = from_size; i < to_size; ++i) |
91 | res->getOffsets()[i] = offset; |
92 | } |
93 | |
94 | return res; |
95 | } |
96 | |
97 | |
98 | size_t ColumnArray::size() const |
99 | { |
100 | return getOffsets().size(); |
101 | } |
102 | |
103 | |
104 | Field ColumnArray::operator[](size_t n) const |
105 | { |
106 | size_t offset = offsetAt(n); |
107 | size_t size = sizeAt(n); |
108 | Array res(size); |
109 | |
110 | for (size_t i = 0; i < size; ++i) |
111 | res[i] = getData()[offset + i]; |
112 | |
113 | return res; |
114 | } |
115 | |
116 | |
117 | void ColumnArray::get(size_t n, Field & res) const |
118 | { |
119 | size_t offset = offsetAt(n); |
120 | size_t size = sizeAt(n); |
121 | res = Array(size); |
122 | Array & res_arr = DB::get<Array &>(res); |
123 | |
124 | for (size_t i = 0; i < size; ++i) |
125 | getData().get(offset + i, res_arr[i]); |
126 | } |
127 | |
128 | |
129 | StringRef ColumnArray::getDataAt(size_t n) const |
130 | { |
    /** Returns the range of memory that covers all elements of the array.
      * Works only for arrays of fixed-length values.
      * For arrays of strings and arrays of arrays, the resulting chunk of memory is not in one-to-one correspondence with the elements,
      * since it contains only the data laid out contiguously, but not the offsets.
      */
136 | |
137 | size_t offset_of_first_elem = offsetAt(n); |
138 | StringRef first = getData().getDataAtWithTerminatingZero(offset_of_first_elem); |
139 | |
140 | size_t array_size = sizeAt(n); |
141 | if (array_size == 0) |
142 | return StringRef(first.data, 0); |
143 | |
144 | size_t offset_of_last_elem = getOffsets()[n] - 1; |
145 | StringRef last = getData().getDataAtWithTerminatingZero(offset_of_last_elem); |
146 | |
147 | return StringRef(first.data, last.data + last.size - first.data); |
148 | } |
149 | |
150 | |
151 | void ColumnArray::insertData(const char * pos, size_t length) |
152 | { |
    /** Similarly, this works only for arrays of fixed-length values.
      */
155 | IColumn * data_ = &getData(); |
156 | if (!data_->isFixedAndContiguous()) |
157 | throw Exception("Method insertData is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); |
158 | |
159 | size_t field_size = data_->sizeOfValueIfFixed(); |
160 | |
161 | const char * end = pos + length; |
162 | size_t elems = 0; |
163 | for (; pos + field_size <= end; pos += field_size, ++elems) |
164 | data_->insertData(pos, field_size); |
165 | |
166 | if (pos != end) |
167 | throw Exception("Incorrect length argument for method ColumnArray::insertData" , ErrorCodes::BAD_ARGUMENTS); |
168 | |
169 | getOffsets().push_back(getOffsets().back() + elems); |
170 | } |
171 | |
172 | |
173 | StringRef ColumnArray::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const |
174 | { |
175 | size_t array_size = sizeAt(n); |
176 | size_t offset = offsetAt(n); |
177 | |
178 | char * pos = arena.allocContinue(sizeof(array_size), begin); |
179 | memcpy(pos, &array_size, sizeof(array_size)); |
180 | |
181 | StringRef res(pos, sizeof(array_size)); |
182 | |
183 | for (size_t i = 0; i < array_size; ++i) |
184 | { |
185 | auto value_ref = getData().serializeValueIntoArena(offset + i, arena, begin); |
186 | res.data = value_ref.data - res.size; |
187 | res.size += value_ref.size; |
188 | } |
189 | |
190 | return res; |
191 | } |
192 | |
193 | |
194 | const char * ColumnArray::deserializeAndInsertFromArena(const char * pos) |
195 | { |
196 | size_t array_size = unalignedLoad<size_t>(pos); |
197 | pos += sizeof(array_size); |
198 | |
199 | for (size_t i = 0; i < array_size; ++i) |
200 | pos = getData().deserializeAndInsertFromArena(pos); |
201 | |
202 | getOffsets().push_back(getOffsets().back() + array_size); |
203 | return pos; |
204 | } |
205 | |
206 | |
207 | void ColumnArray::updateHashWithValue(size_t n, SipHash & hash) const |
208 | { |
209 | size_t array_size = sizeAt(n); |
210 | size_t offset = offsetAt(n); |
211 | |
212 | hash.update(array_size); |
213 | for (size_t i = 0; i < array_size; ++i) |
214 | getData().updateHashWithValue(offset + i, hash); |
215 | } |
216 | |
217 | |
218 | void ColumnArray::insert(const Field & x) |
219 | { |
220 | const Array & array = DB::get<const Array &>(x); |
221 | size_t size = array.size(); |
222 | for (size_t i = 0; i < size; ++i) |
223 | getData().insert(array[i]); |
224 | getOffsets().push_back(getOffsets().back() + size); |
225 | } |
226 | |
227 | |
228 | void ColumnArray::insertFrom(const IColumn & src_, size_t n) |
229 | { |
230 | const ColumnArray & src = assert_cast<const ColumnArray &>(src_); |
231 | size_t size = src.sizeAt(n); |
232 | size_t offset = src.offsetAt(n); |
233 | |
234 | getData().insertRangeFrom(src.getData(), offset, size); |
235 | getOffsets().push_back(getOffsets().back() + size); |
236 | } |
237 | |
238 | |
239 | void ColumnArray::insertDefault() |
240 | { |
    /// NOTE 1: We can use back() even if the array is empty (thanks to the zero -1th element in PODArray).
    /// NOTE 2: We cannot pass a reference into push_back, because the reference would be invalidated if the array is reallocated.
243 | auto last_offset = getOffsets().back(); |
244 | getOffsets().push_back(last_offset); |
245 | } |
246 | |
247 | |
248 | void ColumnArray::popBack(size_t n) |
249 | { |
250 | auto & offsets_data = getOffsets(); |
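    /// Remove the nested elements that belong to the last n arrays, then drop the last n offsets.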
251 | size_t nested_n = offsets_data.back() - offsetAt(offsets_data.size() - n); |
252 | if (nested_n) |
253 | getData().popBack(nested_n); |
254 | offsets_data.resize_assume_reserved(offsets_data.size() - n); |
255 | } |
256 | |
257 | |
258 | int ColumnArray::compareAt(size_t n, size_t m, const IColumn & rhs_, int nan_direction_hint) const |
259 | { |
260 | const ColumnArray & rhs = assert_cast<const ColumnArray &>(rhs_); |
261 | |
262 | /// Suboptimal |
263 | size_t lhs_size = sizeAt(n); |
264 | size_t rhs_size = rhs.sizeAt(m); |
265 | size_t min_size = std::min(lhs_size, rhs_size); |
266 | for (size_t i = 0; i < min_size; ++i) |
267 | if (int res = getData().compareAt(offsetAt(n) + i, rhs.offsetAt(m) + i, *rhs.data.get(), nan_direction_hint)) |
268 | return res; |
269 | |
270 | return lhs_size < rhs_size |
271 | ? -1 |
272 | : (lhs_size == rhs_size |
273 | ? 0 |
274 | : 1); |
275 | } |
276 | |
277 | |
278 | namespace |
279 | { |
280 | template <bool positive> |
281 | struct less |
282 | { |
283 | const ColumnArray & parent; |
284 | int nan_direction_hint; |
285 | |
286 | less(const ColumnArray & parent_, int nan_direction_hint_) |
287 | : parent(parent_), nan_direction_hint(nan_direction_hint_) {} |
288 | |
289 | bool operator()(size_t lhs, size_t rhs) const |
290 | { |
291 | if (positive) |
292 | return parent.compareAt(lhs, rhs, parent, nan_direction_hint) < 0; |
293 | else |
294 | return parent.compareAt(lhs, rhs, parent, nan_direction_hint) > 0; |
295 | } |
296 | }; |
297 | } |
298 | |
299 | |
300 | void ColumnArray::reserve(size_t n) |
301 | { |
302 | getOffsets().reserve(n); |
    getData().reserve(n); /// The average array size is not taken into account here; effectively it is assumed to be no more than 1.
304 | } |
305 | |
306 | |
307 | size_t ColumnArray::byteSize() const |
308 | { |
309 | return getData().byteSize() + getOffsets().size() * sizeof(getOffsets()[0]); |
310 | } |
311 | |
312 | |
313 | size_t ColumnArray::allocatedBytes() const |
314 | { |
315 | return getData().allocatedBytes() + getOffsets().allocated_bytes(); |
316 | } |
317 | |
318 | |
319 | void ColumnArray::protect() |
320 | { |
321 | getData().protect(); |
322 | getOffsets().protect(); |
323 | } |
324 | |
325 | |
326 | bool ColumnArray::hasEqualOffsets(const ColumnArray & other) const |
327 | { |
328 | if (offsets == other.offsets) |
329 | return true; |
330 | |
331 | const Offsets & offsets1 = getOffsets(); |
332 | const Offsets & offsets2 = other.getOffsets(); |
333 | return offsets1.size() == offsets2.size() |
334 | && (offsets1.size() == 0 || 0 == memcmp(offsets1.data(), offsets2.data(), sizeof(offsets1[0]) * offsets1.size())); |
335 | } |
336 | |
337 | |
338 | ColumnPtr ColumnArray::convertToFullColumnIfConst() const |
339 | { |
    /// It is possible to have an array with constant data and non-constant offsets.
    /// For example, the result of the expression: replicate('hello', [1]).
342 | return ColumnArray::create(data->convertToFullColumnIfConst(), offsets); |
343 | } |
344 | |
345 | void ColumnArray::getExtremes(Field & min, Field & max) const |
346 | { |
347 | min = Array(); |
348 | max = Array(); |
349 | |
350 | size_t col_size = size(); |
351 | |
352 | if (col_size == 0) |
353 | return; |
354 | |
355 | size_t min_idx = 0; |
356 | size_t max_idx = 0; |
357 | |
358 | for (size_t i = 1; i < col_size; ++i) |
359 | { |
360 | if (compareAt(i, min_idx, *this, /* nan_direction_hint = */ 1) < 0) |
361 | min_idx = i; |
362 | else if (compareAt(i, max_idx, *this, /* nan_direction_hint = */ -1) > 0) |
363 | max_idx = i; |
364 | } |
365 | |
366 | get(min_idx, min); |
367 | get(max_idx, max); |
368 | } |
369 | |
370 | |
371 | void ColumnArray::insertRangeFrom(const IColumn & src, size_t start, size_t length) |
372 | { |
373 | if (length == 0) |
374 | return; |
375 | |
376 | const ColumnArray & src_concrete = assert_cast<const ColumnArray &>(src); |
377 | |
378 | if (start + length > src_concrete.getOffsets().size()) |
379 | throw Exception("Parameter out of bound in ColumnArray::insertRangeFrom method. [start(" + std::to_string(start) + ") + length(" + std::to_string(length) + ") > offsets.size(" + std::to_string(src_concrete.getOffsets().size()) + ")]" , |
380 | ErrorCodes::PARAMETER_OUT_OF_BOUND); |
381 | |
382 | size_t nested_offset = src_concrete.offsetAt(start); |
383 | size_t nested_length = src_concrete.getOffsets()[start + length - 1] - nested_offset; |
384 | |
385 | getData().insertRangeFrom(src_concrete.getData(), nested_offset, nested_length); |
386 | |
387 | Offsets & cur_offsets = getOffsets(); |
388 | const Offsets & src_offsets = src_concrete.getOffsets(); |
389 | |
390 | if (start == 0 && cur_offsets.empty()) |
391 | { |
392 | cur_offsets.assign(src_offsets.begin(), src_offsets.begin() + length); |
393 | } |
394 | else |
395 | { |
396 | size_t old_size = cur_offsets.size(); |
397 | size_t prev_max_offset = old_size ? cur_offsets.back() : 0; |
398 | cur_offsets.resize(old_size + length); |
399 | |
400 | for (size_t i = 0; i < length; ++i) |
401 | cur_offsets[old_size + i] = src_offsets[start + i] - nested_offset + prev_max_offset; |
402 | } |
403 | } |
404 | |
405 | |
406 | ColumnPtr ColumnArray::filter(const Filter & filt, ssize_t result_size_hint) const |
407 | { |
408 | if (typeid_cast<const ColumnUInt8 *>(data.get())) return filterNumber<UInt8>(filt, result_size_hint); |
409 | if (typeid_cast<const ColumnUInt16 *>(data.get())) return filterNumber<UInt16>(filt, result_size_hint); |
410 | if (typeid_cast<const ColumnUInt32 *>(data.get())) return filterNumber<UInt32>(filt, result_size_hint); |
411 | if (typeid_cast<const ColumnUInt64 *>(data.get())) return filterNumber<UInt64>(filt, result_size_hint); |
412 | if (typeid_cast<const ColumnInt8 *>(data.get())) return filterNumber<Int8>(filt, result_size_hint); |
413 | if (typeid_cast<const ColumnInt16 *>(data.get())) return filterNumber<Int16>(filt, result_size_hint); |
414 | if (typeid_cast<const ColumnInt32 *>(data.get())) return filterNumber<Int32>(filt, result_size_hint); |
415 | if (typeid_cast<const ColumnInt64 *>(data.get())) return filterNumber<Int64>(filt, result_size_hint); |
416 | if (typeid_cast<const ColumnFloat32 *>(data.get())) return filterNumber<Float32>(filt, result_size_hint); |
417 | if (typeid_cast<const ColumnFloat64 *>(data.get())) return filterNumber<Float64>(filt, result_size_hint); |
418 | if (typeid_cast<const ColumnString *>(data.get())) return filterString(filt, result_size_hint); |
419 | if (typeid_cast<const ColumnTuple *>(data.get())) return filterTuple(filt, result_size_hint); |
420 | if (typeid_cast<const ColumnNullable *>(data.get())) return filterNullable(filt, result_size_hint); |
421 | return filterGeneric(filt, result_size_hint); |
422 | } |
423 | |
424 | template <typename T> |
425 | ColumnPtr ColumnArray::filterNumber(const Filter & filt, ssize_t result_size_hint) const |
426 | { |
427 | if (getOffsets().size() == 0) |
428 | return ColumnArray::create(data); |
429 | |
430 | auto res = ColumnArray::create(data->cloneEmpty()); |
431 | |
432 | auto & res_elems = assert_cast<ColumnVector<T> &>(res->getData()).getData(); |
433 | Offsets & res_offsets = res->getOffsets(); |
434 | |
435 | filterArraysImpl<T>(assert_cast<const ColumnVector<T> &>(*data).getData(), getOffsets(), res_elems, res_offsets, filt, result_size_hint); |
436 | return res; |
437 | } |
438 | |
439 | ColumnPtr ColumnArray::filterString(const Filter & filt, ssize_t result_size_hint) const |
440 | { |
441 | size_t col_size = getOffsets().size(); |
442 | if (col_size != filt.size()) |
443 | throw Exception("Size of filter doesn't match size of column." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
444 | |
445 | if (0 == col_size) |
446 | return ColumnArray::create(data); |
447 | |
448 | auto res = ColumnArray::create(data->cloneEmpty()); |
449 | |
450 | const ColumnString & src_string = typeid_cast<const ColumnString &>(*data); |
451 | const ColumnString::Chars & src_chars = src_string.getChars(); |
452 | const Offsets & src_string_offsets = src_string.getOffsets(); |
453 | const Offsets & src_offsets = getOffsets(); |
454 | |
455 | ColumnString::Chars & res_chars = typeid_cast<ColumnString &>(res->getData()).getChars(); |
456 | Offsets & res_string_offsets = typeid_cast<ColumnString &>(res->getData()).getOffsets(); |
457 | Offsets & res_offsets = res->getOffsets(); |
458 | |
459 | if (result_size_hint < 0) /// Other cases are not considered. |
460 | { |
461 | res_chars.reserve(src_chars.size()); |
462 | res_string_offsets.reserve(src_string_offsets.size()); |
463 | res_offsets.reserve(col_size); |
464 | } |
465 | |
466 | Offset prev_src_offset = 0; |
467 | Offset prev_src_string_offset = 0; |
468 | |
469 | Offset prev_res_offset = 0; |
470 | Offset prev_res_string_offset = 0; |
471 | |
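    /// Two levels of offsets are maintained: array offsets (how many strings each array holds) and string
    /// offsets (where each string ends in the character buffer). Source positions advance on every row,
    /// result positions only on rows that pass the filter.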
472 | for (size_t i = 0; i < col_size; ++i) |
473 | { |
        /// The number of elements (strings) in the i-th array.
475 | size_t array_size = src_offsets[i] - prev_src_offset; |
476 | |
477 | if (filt[i]) |
478 | { |
            /// If the array is not empty, copy its content.
480 | if (array_size) |
481 | { |
482 | size_t chars_to_copy = src_string_offsets[array_size + prev_src_offset - 1] - prev_src_string_offset; |
483 | size_t res_chars_prev_size = res_chars.size(); |
484 | res_chars.resize(res_chars_prev_size + chars_to_copy); |
485 | memcpy(&res_chars[res_chars_prev_size], &src_chars[prev_src_string_offset], chars_to_copy); |
486 | |
487 | for (size_t j = 0; j < array_size; ++j) |
488 | res_string_offsets.push_back(src_string_offsets[j + prev_src_offset] + prev_res_string_offset - prev_src_string_offset); |
489 | |
490 | prev_res_string_offset = res_string_offsets.back(); |
491 | } |
492 | |
493 | prev_res_offset += array_size; |
494 | res_offsets.push_back(prev_res_offset); |
495 | } |
496 | |
497 | if (array_size) |
498 | { |
499 | prev_src_offset += array_size; |
500 | prev_src_string_offset = src_string_offsets[prev_src_offset - 1]; |
501 | } |
502 | } |
503 | |
504 | return res; |
505 | } |
506 | |
507 | ColumnPtr ColumnArray::filterGeneric(const Filter & filt, ssize_t result_size_hint) const |
508 | { |
509 | size_t size = getOffsets().size(); |
510 | if (size != filt.size()) |
511 | throw Exception("Size of filter doesn't match size of column." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
512 | |
513 | if (size == 0) |
514 | return ColumnArray::create(data); |
515 | |
516 | Filter nested_filt(getOffsets().back()); |
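    /// Expand the row-level filter into an element-level filter for the nested column: every element of a
    /// selected array is marked with 1, every element of a rejected array with 0.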
517 | for (size_t i = 0; i < size; ++i) |
518 | { |
519 | if (filt[i]) |
520 | memset(&nested_filt[offsetAt(i)], 1, sizeAt(i)); |
521 | else |
522 | memset(&nested_filt[offsetAt(i)], 0, sizeAt(i)); |
523 | } |
524 | |
525 | auto res = ColumnArray::create(data->cloneEmpty()); |
526 | |
527 | ssize_t nested_result_size_hint = 0; |
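    /// Scale the hint for the nested column: on average a selected row contributes data->size() / size elements.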
528 | if (result_size_hint < 0) |
529 | nested_result_size_hint = result_size_hint; |
530 | else if (result_size_hint && result_size_hint < 1000000000 && data->size() < 1000000000) /// Avoid overflow. |
531 | nested_result_size_hint = result_size_hint * data->size() / size; |
532 | |
533 | res->data = data->filter(nested_filt, nested_result_size_hint); |
534 | |
535 | Offsets & res_offsets = res->getOffsets(); |
536 | if (result_size_hint) |
537 | res_offsets.reserve(result_size_hint > 0 ? result_size_hint : size); |
538 | |
539 | size_t current_offset = 0; |
540 | for (size_t i = 0; i < size; ++i) |
541 | { |
542 | if (filt[i]) |
543 | { |
544 | current_offset += sizeAt(i); |
545 | res_offsets.push_back(current_offset); |
546 | } |
547 | } |
548 | |
549 | return res; |
550 | } |
551 | |
552 | ColumnPtr ColumnArray::filterNullable(const Filter & filt, ssize_t result_size_hint) const |
553 | { |
554 | if (getOffsets().size() == 0) |
555 | return ColumnArray::create(data); |
556 | |
557 | const ColumnNullable & nullable_elems = assert_cast<const ColumnNullable &>(*data); |
558 | |
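    /// Filter the nested data by wrapping it into a temporary ColumnArray that shares this column's offsets,
    /// then filter the null map with the same row filter and reassemble the Nullable column from the parts.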
559 | auto array_of_nested = ColumnArray::create(nullable_elems.getNestedColumnPtr(), offsets); |
560 | auto filtered_array_of_nested_owner = array_of_nested->filter(filt, result_size_hint); |
561 | auto & filtered_array_of_nested = assert_cast<const ColumnArray &>(*filtered_array_of_nested_owner); |
562 | auto & filtered_offsets = filtered_array_of_nested.getOffsetsPtr(); |
563 | |
564 | auto res_null_map = ColumnUInt8::create(); |
565 | |
566 | filterArraysImplOnlyData(nullable_elems.getNullMapData(), getOffsets(), res_null_map->getData(), filt, result_size_hint); |
567 | |
568 | return ColumnArray::create( |
569 | ColumnNullable::create( |
570 | filtered_array_of_nested.getDataPtr(), |
571 | std::move(res_null_map)), |
572 | filtered_offsets); |
573 | } |
574 | |
575 | ColumnPtr ColumnArray::filterTuple(const Filter & filt, ssize_t result_size_hint) const |
576 | { |
577 | if (getOffsets().size() == 0) |
578 | return ColumnArray::create(data); |
579 | |
580 | const ColumnTuple & tuple = assert_cast<const ColumnTuple &>(*data); |
581 | |
    /// Make a temporary array for each component of the Tuple, filter each of them, then collect the results back together.
583 | |
584 | size_t tuple_size = tuple.tupleSize(); |
585 | |
586 | if (tuple_size == 0) |
587 | throw Exception("Logical error: empty tuple" , ErrorCodes::LOGICAL_ERROR); |
588 | |
589 | Columns temporary_arrays(tuple_size); |
590 | for (size_t i = 0; i < tuple_size; ++i) |
591 | temporary_arrays[i] = ColumnArray(tuple.getColumns()[i]->assumeMutable(), getOffsetsPtr()->assumeMutable()) |
592 | .filter(filt, result_size_hint); |
593 | |
594 | Columns tuple_columns(tuple_size); |
595 | for (size_t i = 0; i < tuple_size; ++i) |
596 | tuple_columns[i] = assert_cast<const ColumnArray &>(*temporary_arrays[i]).getDataPtr(); |
597 | |
598 | return ColumnArray::create( |
599 | ColumnTuple::create(tuple_columns), |
600 | assert_cast<const ColumnArray &>(*temporary_arrays.front()).getOffsetsPtr()); |
601 | } |
602 | |
603 | |
604 | ColumnPtr ColumnArray::permute(const Permutation & perm, size_t limit) const |
605 | { |
606 | size_t size = getOffsets().size(); |
607 | |
608 | if (limit == 0) |
609 | limit = size; |
610 | else |
611 | limit = std::min(size, limit); |
612 | |
613 | if (perm.size() < limit) |
614 | throw Exception("Size of permutation is less than required." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
615 | |
616 | if (limit == 0) |
617 | return ColumnArray::create(data); |
618 | |
619 | Permutation nested_perm(getOffsets().back()); |
620 | |
621 | auto res = ColumnArray::create(data->cloneEmpty()); |
622 | |
623 | Offsets & res_offsets = res->getOffsets(); |
624 | res_offsets.resize(limit); |
625 | size_t current_offset = 0; |
626 | |
627 | for (size_t i = 0; i < limit; ++i) |
628 | { |
629 | for (size_t j = 0; j < sizeAt(perm[i]); ++j) |
630 | nested_perm[current_offset + j] = offsetAt(perm[i]) + j; |
631 | current_offset += sizeAt(perm[i]); |
632 | res_offsets[i] = current_offset; |
633 | } |
634 | |
635 | if (current_offset != 0) |
636 | res->data = data->permute(nested_perm, current_offset); |
637 | |
638 | return res; |
639 | } |
640 | |
641 | ColumnPtr ColumnArray::index(const IColumn & indexes, size_t limit) const |
642 | { |
643 | return selectIndexImpl(*this, indexes, limit); |
644 | } |
645 | |
646 | template <typename T> |
647 | ColumnPtr ColumnArray::indexImpl(const PaddedPODArray<T> & indexes, size_t limit) const |
648 | { |
649 | if (limit == 0) |
650 | return ColumnArray::create(data); |
651 | |
    /// Collect the positions of nested elements as UInt64, since they may overflow the index type T.
653 | auto nested_indexes_column = ColumnUInt64::create(); |
654 | PaddedPODArray<UInt64> & nested_indexes = nested_indexes_column->getData(); |
655 | nested_indexes.reserve(getOffsets().back()); |
656 | |
657 | auto res = ColumnArray::create(data->cloneEmpty()); |
658 | |
659 | Offsets & res_offsets = res->getOffsets(); |
660 | res_offsets.resize(limit); |
661 | size_t current_offset = 0; |
662 | |
663 | for (size_t i = 0; i < limit; ++i) |
664 | { |
665 | for (size_t j = 0; j < sizeAt(indexes[i]); ++j) |
666 | nested_indexes.push_back(offsetAt(indexes[i]) + j); |
667 | current_offset += sizeAt(indexes[i]); |
668 | res_offsets[i] = current_offset; |
669 | } |
670 | |
671 | if (current_offset != 0) |
672 | res->data = data->index(*nested_indexes_column, current_offset); |
673 | |
674 | return res; |
675 | } |
676 | |
677 | INSTANTIATE_INDEX_IMPL(ColumnArray) |
678 | |
679 | void ColumnArray::getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const |
680 | { |
681 | size_t s = size(); |
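    /// A limit that covers the whole column is equivalent to no limit, i.e. a full sort.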
682 | if (limit >= s) |
683 | limit = 0; |
684 | |
685 | res.resize(s); |
686 | for (size_t i = 0; i < s; ++i) |
687 | res[i] = i; |
688 | |
689 | if (limit) |
690 | { |
691 | if (reverse) |
692 | std::partial_sort(res.begin(), res.begin() + limit, res.end(), less<false>(*this, nan_direction_hint)); |
693 | else |
694 | std::partial_sort(res.begin(), res.begin() + limit, res.end(), less<true>(*this, nan_direction_hint)); |
695 | } |
696 | else |
697 | { |
698 | if (reverse) |
699 | std::sort(res.begin(), res.end(), less<false>(*this, nan_direction_hint)); |
700 | else |
701 | std::sort(res.begin(), res.end(), less<true>(*this, nan_direction_hint)); |
702 | } |
703 | } |
704 | |
705 | |
706 | ColumnPtr ColumnArray::replicate(const Offsets & replicate_offsets) const |
707 | { |
708 | if (replicate_offsets.empty()) |
709 | return cloneEmpty(); |
710 | |
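    /// replicate_offsets are cumulative repetition counts: for example, {2, 3} means the first row is
    /// repeated twice and the second row once. The dispatch below picks a specialization by nested column type.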
711 | if (typeid_cast<const ColumnUInt8 *>(data.get())) return replicateNumber<UInt8>(replicate_offsets); |
712 | if (typeid_cast<const ColumnUInt16 *>(data.get())) return replicateNumber<UInt16>(replicate_offsets); |
713 | if (typeid_cast<const ColumnUInt32 *>(data.get())) return replicateNumber<UInt32>(replicate_offsets); |
714 | if (typeid_cast<const ColumnUInt64 *>(data.get())) return replicateNumber<UInt64>(replicate_offsets); |
715 | if (typeid_cast<const ColumnInt8 *>(data.get())) return replicateNumber<Int8>(replicate_offsets); |
716 | if (typeid_cast<const ColumnInt16 *>(data.get())) return replicateNumber<Int16>(replicate_offsets); |
717 | if (typeid_cast<const ColumnInt32 *>(data.get())) return replicateNumber<Int32>(replicate_offsets); |
718 | if (typeid_cast<const ColumnInt64 *>(data.get())) return replicateNumber<Int64>(replicate_offsets); |
719 | if (typeid_cast<const ColumnFloat32 *>(data.get())) return replicateNumber<Float32>(replicate_offsets); |
720 | if (typeid_cast<const ColumnFloat64 *>(data.get())) return replicateNumber<Float64>(replicate_offsets); |
721 | if (typeid_cast<const ColumnString *>(data.get())) return replicateString(replicate_offsets); |
722 | if (typeid_cast<const ColumnConst *>(data.get())) return replicateConst(replicate_offsets); |
723 | if (typeid_cast<const ColumnNullable *>(data.get())) return replicateNullable(replicate_offsets); |
724 | if (typeid_cast<const ColumnTuple *>(data.get())) return replicateTuple(replicate_offsets); |
725 | return replicateGeneric(replicate_offsets); |
726 | } |
727 | |
728 | |
729 | template <typename T> |
730 | ColumnPtr ColumnArray::replicateNumber(const Offsets & replicate_offsets) const |
731 | { |
732 | size_t col_size = size(); |
733 | if (col_size != replicate_offsets.size()) |
734 | throw Exception("Size of offsets doesn't match size of column." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
735 | |
736 | MutableColumnPtr res = cloneEmpty(); |
737 | |
738 | if (0 == col_size) |
739 | return res; |
740 | |
741 | ColumnArray & res_ = typeid_cast<ColumnArray &>(*res); |
742 | |
743 | const typename ColumnVector<T>::Container & src_data = typeid_cast<const ColumnVector<T> &>(*data).getData(); |
744 | const Offsets & src_offsets = getOffsets(); |
745 | |
746 | typename ColumnVector<T>::Container & res_data = typeid_cast<ColumnVector<T> &>(res_.getData()).getData(); |
747 | Offsets & res_offsets = res_.getOffsets(); |
748 | |
749 | res_data.reserve(data->size() / col_size * replicate_offsets.back()); |
750 | res_offsets.reserve(replicate_offsets.back()); |
751 | |
752 | Offset prev_replicate_offset = 0; |
753 | Offset prev_data_offset = 0; |
754 | Offset current_new_offset = 0; |
755 | |
756 | for (size_t i = 0; i < col_size; ++i) |
757 | { |
758 | size_t size_to_replicate = replicate_offsets[i] - prev_replicate_offset; |
759 | size_t value_size = src_offsets[i] - prev_data_offset; |
760 | |
761 | for (size_t j = 0; j < size_to_replicate; ++j) |
762 | { |
763 | current_new_offset += value_size; |
764 | res_offsets.push_back(current_new_offset); |
765 | |
766 | if (value_size) |
767 | { |
768 | res_data.resize(res_data.size() + value_size); |
769 | memcpy(&res_data[res_data.size() - value_size], &src_data[prev_data_offset], value_size * sizeof(T)); |
770 | } |
771 | } |
772 | |
773 | prev_replicate_offset = replicate_offsets[i]; |
774 | prev_data_offset = src_offsets[i]; |
775 | } |
776 | |
777 | return res; |
778 | } |
779 | |
780 | |
781 | ColumnPtr ColumnArray::replicateString(const Offsets & replicate_offsets) const |
782 | { |
783 | size_t col_size = size(); |
784 | if (col_size != replicate_offsets.size()) |
785 | throw Exception("Size of offsets doesn't match size of column." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
786 | |
787 | MutableColumnPtr res = cloneEmpty(); |
788 | |
789 | if (0 == col_size) |
790 | return res; |
791 | |
792 | ColumnArray & res_ = assert_cast<ColumnArray &>(*res); |
793 | |
794 | const ColumnString & src_string = typeid_cast<const ColumnString &>(*data); |
795 | const ColumnString::Chars & src_chars = src_string.getChars(); |
796 | const Offsets & src_string_offsets = src_string.getOffsets(); |
797 | const Offsets & src_offsets = getOffsets(); |
798 | |
799 | ColumnString::Chars & res_chars = typeid_cast<ColumnString &>(res_.getData()).getChars(); |
800 | Offsets & res_string_offsets = typeid_cast<ColumnString &>(res_.getData()).getOffsets(); |
801 | Offsets & res_offsets = res_.getOffsets(); |
802 | |
803 | res_chars.reserve(src_chars.size() / col_size * replicate_offsets.back()); |
804 | res_string_offsets.reserve(src_string_offsets.size() / col_size * replicate_offsets.back()); |
805 | res_offsets.reserve(replicate_offsets.back()); |
806 | |
807 | Offset prev_replicate_offset = 0; |
808 | |
809 | Offset prev_src_offset = 0; |
810 | Offset prev_src_string_offset = 0; |
811 | |
812 | Offset current_res_offset = 0; |
813 | Offset current_res_string_offset = 0; |
814 | |
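    /// Three levels of data are replicated here: array offsets (strings per array), string offsets
    /// (end positions of the strings in the character buffer), and the characters themselves.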
815 | for (size_t i = 0; i < col_size; ++i) |
816 | { |
817 | /// How many times to replicate the array. |
818 | size_t size_to_replicate = replicate_offsets[i] - prev_replicate_offset; |
819 | /// The number of strings in the array. |
820 | size_t value_size = src_offsets[i] - prev_src_offset; |
821 | /// Number of characters in strings of the array, including zero bytes. |
822 | size_t sum_chars_size = src_string_offsets[prev_src_offset + value_size - 1] - prev_src_string_offset; /// -1th index is Ok, see PaddedPODArray. |
823 | |
824 | for (size_t j = 0; j < size_to_replicate; ++j) |
825 | { |
826 | current_res_offset += value_size; |
827 | res_offsets.push_back(current_res_offset); |
828 | |
829 | size_t prev_src_string_offset_local = prev_src_string_offset; |
830 | for (size_t k = 0; k < value_size; ++k) |
831 | { |
832 | /// Size of single string. |
833 | size_t chars_size = src_string_offsets[k + prev_src_offset] - prev_src_string_offset_local; |
834 | |
835 | current_res_string_offset += chars_size; |
836 | res_string_offsets.push_back(current_res_string_offset); |
837 | |
838 | prev_src_string_offset_local += chars_size; |
839 | } |
840 | |
841 | if (sum_chars_size) |
842 | { |
                /// Copy the characters of all strings of the array in one go.
844 | res_chars.resize(res_chars.size() + sum_chars_size); |
845 | memcpySmallAllowReadWriteOverflow15( |
846 | &res_chars[res_chars.size() - sum_chars_size], &src_chars[prev_src_string_offset], sum_chars_size); |
847 | } |
848 | } |
849 | |
850 | prev_replicate_offset = replicate_offsets[i]; |
851 | prev_src_offset = src_offsets[i]; |
852 | prev_src_string_offset += sum_chars_size; |
853 | } |
854 | |
855 | return res; |
856 | } |
857 | |
858 | |
859 | ColumnPtr ColumnArray::replicateConst(const Offsets & replicate_offsets) const |
860 | { |
861 | size_t col_size = size(); |
862 | if (col_size != replicate_offsets.size()) |
863 | throw Exception("Size of offsets doesn't match size of column." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
864 | |
865 | if (0 == col_size) |
866 | return cloneEmpty(); |
867 | |
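    /// The nested data does not need to be replicated element by element; only the offsets are recomputed,
    /// and the data is cloned and resized to the total number of replicated elements at the end.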
868 | const Offsets & src_offsets = getOffsets(); |
869 | |
870 | auto res_column_offsets = ColumnOffsets::create(); |
871 | Offsets & res_offsets = res_column_offsets->getData(); |
872 | res_offsets.reserve(replicate_offsets.back()); |
873 | |
874 | Offset prev_replicate_offset = 0; |
875 | Offset prev_data_offset = 0; |
876 | Offset current_new_offset = 0; |
877 | |
878 | for (size_t i = 0; i < col_size; ++i) |
879 | { |
880 | size_t size_to_replicate = replicate_offsets[i] - prev_replicate_offset; |
881 | size_t value_size = src_offsets[i] - prev_data_offset; |
882 | |
883 | for (size_t j = 0; j < size_to_replicate; ++j) |
884 | { |
885 | current_new_offset += value_size; |
886 | res_offsets.push_back(current_new_offset); |
887 | } |
888 | |
889 | prev_replicate_offset = replicate_offsets[i]; |
890 | prev_data_offset = src_offsets[i]; |
891 | } |
892 | |
893 | return ColumnArray::create(getData().cloneResized(current_new_offset), std::move(res_column_offsets)); |
894 | } |
895 | |
896 | |
897 | ColumnPtr ColumnArray::replicateGeneric(const Offsets & replicate_offsets) const |
898 | { |
899 | size_t col_size = size(); |
900 | if (col_size != replicate_offsets.size()) |
901 | throw Exception("Size of offsets doesn't match size of column." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
902 | |
903 | MutableColumnPtr res = cloneEmpty(); |
904 | ColumnArray & res_concrete = assert_cast<ColumnArray &>(*res); |
905 | |
906 | if (0 == col_size) |
907 | return res; |
908 | |
909 | IColumn::Offset prev_offset = 0; |
910 | for (size_t i = 0; i < col_size; ++i) |
911 | { |
912 | size_t size_to_replicate = replicate_offsets[i] - prev_offset; |
913 | prev_offset = replicate_offsets[i]; |
914 | |
915 | for (size_t j = 0; j < size_to_replicate; ++j) |
916 | res_concrete.insertFrom(*this, i); |
917 | } |
918 | |
919 | return res; |
920 | } |
921 | |
922 | |
923 | ColumnPtr ColumnArray::replicateNullable(const Offsets & replicate_offsets) const |
924 | { |
925 | const ColumnNullable & nullable = assert_cast<const ColumnNullable &>(*data); |
926 | |
    /// Make a temporary array for each component of the Nullable column, replicate them independently, and collect them back into the result.
    /// NOTE: The offsets are calculated twice, which is redundant.
929 | |
930 | auto array_of_nested = ColumnArray(nullable.getNestedColumnPtr()->assumeMutable(), getOffsetsPtr()->assumeMutable()) |
931 | .replicate(replicate_offsets); |
932 | auto array_of_null_map = ColumnArray(nullable.getNullMapColumnPtr()->assumeMutable(), getOffsetsPtr()->assumeMutable()) |
933 | .replicate(replicate_offsets); |
934 | |
935 | return ColumnArray::create( |
936 | ColumnNullable::create( |
937 | assert_cast<const ColumnArray &>(*array_of_nested).getDataPtr(), |
938 | assert_cast<const ColumnArray &>(*array_of_null_map).getDataPtr()), |
939 | assert_cast<const ColumnArray &>(*array_of_nested).getOffsetsPtr()); |
940 | } |
941 | |
942 | |
943 | ColumnPtr ColumnArray::replicateTuple(const Offsets & replicate_offsets) const |
944 | { |
945 | const ColumnTuple & tuple = assert_cast<const ColumnTuple &>(*data); |
946 | |
    /// Make a temporary array for each component of the Tuple, in the same way as for Nullable.
948 | |
949 | size_t tuple_size = tuple.tupleSize(); |
950 | |
951 | if (tuple_size == 0) |
952 | throw Exception("Logical error: empty tuple" , ErrorCodes::LOGICAL_ERROR); |
953 | |
954 | Columns temporary_arrays(tuple_size); |
955 | for (size_t i = 0; i < tuple_size; ++i) |
956 | temporary_arrays[i] = ColumnArray(tuple.getColumns()[i]->assumeMutable(), getOffsetsPtr()->assumeMutable()) |
957 | .replicate(replicate_offsets); |
958 | |
959 | Columns tuple_columns(tuple_size); |
960 | for (size_t i = 0; i < tuple_size; ++i) |
961 | tuple_columns[i] = assert_cast<const ColumnArray &>(*temporary_arrays[i]).getDataPtr(); |
962 | |
963 | return ColumnArray::create( |
964 | ColumnTuple::create(tuple_columns), |
965 | assert_cast<const ColumnArray &>(*temporary_arrays.front()).getOffsetsPtr()); |
966 | } |
967 | |
968 | |
969 | void ColumnArray::gather(ColumnGathererStream & gatherer) |
970 | { |
971 | gatherer.gather(*this); |
972 | } |
973 | |
974 | } |
975 | |