1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include "arrow/array.h" |
19 | |
20 | #include <algorithm> |
21 | #include <cstddef> |
22 | #include <cstdint> |
23 | #include <limits> |
24 | #include <sstream> |
25 | #include <utility> |
26 | |
27 | #include "arrow/buffer.h" |
28 | #include "arrow/compare.h" |
29 | #include "arrow/pretty_print.h" |
30 | #include "arrow/status.h" |
31 | #include "arrow/type.h" |
32 | #include "arrow/type_traits.h" |
33 | #include "arrow/util/bit-util.h" |
34 | #include "arrow/util/checked_cast.h" |
35 | #include "arrow/util/decimal.h" |
36 | #include "arrow/util/int-util.h" |
37 | #include "arrow/util/logging.h" |
38 | #include "arrow/util/macros.h" |
39 | #include "arrow/visitor.h" |
40 | #include "arrow/visitor_inline.h" |
41 | |
42 | namespace arrow { |
43 | |
44 | using internal::BitmapAnd; |
45 | using internal::checked_cast; |
46 | using internal::CopyBitmap; |
47 | using internal::CountSetBits; |
48 | |
49 | std::shared_ptr<ArrayData> ArrayData::Make(const std::shared_ptr<DataType>& type, |
50 | int64_t length, |
51 | std::vector<std::shared_ptr<Buffer>>&& buffers, |
52 | int64_t null_count, int64_t offset) { |
53 | return std::make_shared<ArrayData>(type, length, std::move(buffers), null_count, |
54 | offset); |
55 | } |
56 | |
57 | std::shared_ptr<ArrayData> ArrayData::Make( |
58 | const std::shared_ptr<DataType>& type, int64_t length, |
59 | const std::vector<std::shared_ptr<Buffer>>& buffers, int64_t null_count, |
60 | int64_t offset) { |
61 | return std::make_shared<ArrayData>(type, length, buffers, null_count, offset); |
62 | } |
63 | |
64 | std::shared_ptr<ArrayData> ArrayData::Make( |
65 | const std::shared_ptr<DataType>& type, int64_t length, |
66 | const std::vector<std::shared_ptr<Buffer>>& buffers, |
67 | const std::vector<std::shared_ptr<ArrayData>>& child_data, int64_t null_count, |
68 | int64_t offset) { |
69 | return std::make_shared<ArrayData>(type, length, buffers, child_data, null_count, |
70 | offset); |
71 | } |
72 | |
73 | std::shared_ptr<ArrayData> ArrayData::Make(const std::shared_ptr<DataType>& type, |
74 | int64_t length, int64_t null_count, |
75 | int64_t offset) { |
76 | return std::make_shared<ArrayData>(type, length, null_count, offset); |
77 | } |
78 | |
79 | // ---------------------------------------------------------------------- |
80 | // Base array class |
81 | |
82 | int64_t Array::null_count() const { |
83 | if (ARROW_PREDICT_FALSE(data_->null_count < 0)) { |
84 | if (data_->buffers[0]) { |
85 | data_->null_count = |
86 | data_->length - CountSetBits(null_bitmap_data_, data_->offset, data_->length); |
87 | |
88 | } else { |
89 | data_->null_count = 0; |
90 | } |
91 | } |
92 | return data_->null_count; |
93 | } |
94 | |
95 | bool Array::Equals(const Array& arr) const { return ArrayEquals(*this, arr); } |
96 | |
97 | bool Array::Equals(const std::shared_ptr<Array>& arr) const { |
98 | if (!arr) { |
99 | return false; |
100 | } |
101 | return Equals(*arr); |
102 | } |
103 | |
104 | bool Array::ApproxEquals(const Array& arr) const { return ArrayApproxEquals(*this, arr); } |
105 | |
106 | bool Array::ApproxEquals(const std::shared_ptr<Array>& arr) const { |
107 | if (!arr) { |
108 | return false; |
109 | } |
110 | return ApproxEquals(*arr); |
111 | } |
112 | |
113 | bool Array::RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx, |
114 | const std::shared_ptr<Array>& other) const { |
115 | if (!other) { |
116 | return false; |
117 | } |
118 | return RangeEquals(*other, start_idx, end_idx, other_start_idx); |
119 | } |
120 | |
121 | bool Array::RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx, |
122 | int64_t other_start_idx) const { |
123 | return ArrayRangeEquals(*this, other, start_idx, end_idx, other_start_idx); |
124 | } |
125 | |
126 | static inline std::shared_ptr<ArrayData> SliceData(const ArrayData& data, int64_t offset, |
127 | int64_t length) { |
128 | DCHECK_LE(offset, data.length); |
129 | length = std::min(data.length - offset, length); |
130 | offset += data.offset; |
131 | |
132 | auto new_data = data.Copy(); |
133 | new_data->length = length; |
134 | new_data->offset = offset; |
135 | new_data->null_count = data.null_count != 0 ? kUnknownNullCount : 0; |
136 | return new_data; |
137 | } |
138 | |
139 | std::shared_ptr<Array> Array::Slice(int64_t offset, int64_t length) const { |
140 | return MakeArray(SliceData(*data_, offset, length)); |
141 | } |
142 | |
143 | std::shared_ptr<Array> Array::Slice(int64_t offset) const { |
144 | int64_t slice_length = data_->length - offset; |
145 | return Slice(offset, slice_length); |
146 | } |
147 | |
148 | std::string Array::ToString() const { |
149 | std::stringstream ss; |
150 | DCHECK(PrettyPrint(*this, 0, &ss).ok()); |
151 | return ss.str(); |
152 | } |
153 | |
154 | NullArray::NullArray(int64_t length) { |
155 | SetData(ArrayData::Make(null(), length, {nullptr}, length)); |
156 | } |
157 | |
158 | // ---------------------------------------------------------------------- |
159 | // Primitive array base |
160 | |
161 | PrimitiveArray::PrimitiveArray(const std::shared_ptr<DataType>& type, int64_t length, |
162 | const std::shared_ptr<Buffer>& data, |
163 | const std::shared_ptr<Buffer>& null_bitmap, |
164 | int64_t null_count, int64_t offset) { |
165 | SetData(ArrayData::Make(type, length, {null_bitmap, data}, null_count, offset)); |
166 | } |
167 | |
168 | // ---------------------------------------------------------------------- |
169 | // BooleanArray |
170 | |
171 | BooleanArray::BooleanArray(const std::shared_ptr<ArrayData>& data) |
172 | : PrimitiveArray(data) { |
173 | DCHECK_EQ(data->type->id(), Type::BOOL); |
174 | } |
175 | |
176 | BooleanArray::BooleanArray(int64_t length, const std::shared_ptr<Buffer>& data, |
177 | const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count, |
178 | int64_t offset) |
179 | : PrimitiveArray(boolean(), length, data, null_bitmap, null_count, offset) {} |
180 | |
181 | // ---------------------------------------------------------------------- |
182 | // ListArray |
183 | |
184 | ListArray::ListArray(const std::shared_ptr<ArrayData>& data) { |
185 | DCHECK_EQ(data->type->id(), Type::LIST); |
186 | SetData(data); |
187 | } |
188 | |
189 | ListArray::ListArray(const std::shared_ptr<DataType>& type, int64_t length, |
190 | const std::shared_ptr<Buffer>& value_offsets, |
191 | const std::shared_ptr<Array>& values, |
192 | const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count, |
193 | int64_t offset) { |
194 | auto internal_data = |
195 | ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset); |
196 | internal_data->child_data.emplace_back(values->data()); |
197 | SetData(internal_data); |
198 | } |
199 | |
200 | Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPool* pool, |
201 | std::shared_ptr<Array>* out) { |
202 | if (offsets.length() == 0) { |
203 | return Status::Invalid("List offsets must have non-zero length" ); |
204 | } |
205 | |
206 | if (offsets.type_id() != Type::INT32) { |
207 | return Status::Invalid("List offsets must be signed int32" ); |
208 | } |
209 | |
210 | BufferVector buffers = {}; |
211 | |
212 | const auto& typed_offsets = checked_cast<const Int32Array&>(offsets); |
213 | |
214 | const int64_t num_offsets = offsets.length(); |
215 | |
216 | if (offsets.null_count() > 0) { |
217 | std::shared_ptr<Buffer> clean_offsets, clean_valid_bits; |
218 | |
219 | RETURN_NOT_OK(AllocateBuffer(pool, num_offsets * sizeof(int32_t), &clean_offsets)); |
220 | |
221 | // Copy valid bits, zero out the bit for the final offset |
222 | RETURN_NOT_OK(offsets.null_bitmap()->Copy(0, BitUtil::BytesForBits(num_offsets - 1), |
223 | &clean_valid_bits)); |
224 | BitUtil::ClearBit(clean_valid_bits->mutable_data(), num_offsets); |
225 | buffers.emplace_back(std::move(clean_valid_bits)); |
226 | |
227 | const int32_t* raw_offsets = typed_offsets.raw_values(); |
228 | auto clean_raw_offsets = reinterpret_cast<int32_t*>(clean_offsets->mutable_data()); |
229 | |
230 | // Must work backwards so we can tell how many values were in the last non-null value |
231 | DCHECK(offsets.IsValid(num_offsets - 1)); |
232 | int32_t current_offset = raw_offsets[num_offsets - 1]; |
233 | for (int64_t i = num_offsets - 1; i >= 0; --i) { |
234 | if (offsets.IsValid(i)) { |
235 | current_offset = raw_offsets[i]; |
236 | } |
237 | clean_raw_offsets[i] = current_offset; |
238 | } |
239 | |
240 | buffers.emplace_back(std::move(clean_offsets)); |
241 | } else { |
242 | buffers.emplace_back(offsets.null_bitmap()); |
243 | buffers.emplace_back(typed_offsets.values()); |
244 | } |
245 | |
246 | auto list_type = list(values.type()); |
247 | auto internal_data = ArrayData::Make(list_type, num_offsets - 1, std::move(buffers), |
248 | offsets.null_count(), offsets.offset()); |
249 | internal_data->child_data.push_back(values.data()); |
250 | |
251 | *out = std::make_shared<ListArray>(internal_data); |
252 | return Status::OK(); |
253 | } |
254 | |
255 | void ListArray::SetData(const std::shared_ptr<ArrayData>& data) { |
256 | this->Array::SetData(data); |
257 | DCHECK_EQ(data->buffers.size(), 2); |
258 | |
259 | auto value_offsets = data->buffers[1]; |
260 | raw_value_offsets_ = value_offsets == nullptr |
261 | ? nullptr |
262 | : reinterpret_cast<const int32_t*>(value_offsets->data()); |
263 | |
264 | DCHECK_EQ(data_->child_data.size(), 1); |
265 | values_ = MakeArray(data_->child_data[0]); |
266 | } |
267 | |
268 | std::shared_ptr<DataType> ListArray::value_type() const { |
269 | return checked_cast<const ListType&>(*type()).value_type(); |
270 | } |
271 | |
272 | std::shared_ptr<Array> ListArray::values() const { return values_; } |
273 | |
274 | // ---------------------------------------------------------------------- |
275 | // String and binary |
276 | |
277 | BinaryArray::BinaryArray(const std::shared_ptr<ArrayData>& data) { |
278 | DCHECK_EQ(data->type->id(), Type::BINARY); |
279 | SetData(data); |
280 | } |
281 | |
282 | void BinaryArray::SetData(const std::shared_ptr<ArrayData>& data) { |
283 | DCHECK_EQ(data->buffers.size(), 3); |
284 | auto value_offsets = data->buffers[1]; |
285 | auto value_data = data->buffers[2]; |
286 | this->Array::SetData(data); |
287 | raw_data_ = value_data == nullptr ? nullptr : value_data->data(); |
288 | raw_value_offsets_ = value_offsets == nullptr |
289 | ? nullptr |
290 | : reinterpret_cast<const int32_t*>(value_offsets->data()); |
291 | } |
292 | |
293 | BinaryArray::BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets, |
294 | const std::shared_ptr<Buffer>& data, |
295 | const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count, |
296 | int64_t offset) |
297 | : BinaryArray(binary(), length, value_offsets, data, null_bitmap, null_count, |
298 | offset) {} |
299 | |
300 | BinaryArray::BinaryArray(const std::shared_ptr<DataType>& type, int64_t length, |
301 | const std::shared_ptr<Buffer>& value_offsets, |
302 | const std::shared_ptr<Buffer>& data, |
303 | const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count, |
304 | int64_t offset) { |
305 | SetData(ArrayData::Make(type, length, {null_bitmap, value_offsets, data}, null_count, |
306 | offset)); |
307 | } |
308 | |
309 | StringArray::StringArray(const std::shared_ptr<ArrayData>& data) { |
310 | DCHECK_EQ(data->type->id(), Type::STRING); |
311 | SetData(data); |
312 | } |
313 | |
314 | StringArray::StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets, |
315 | const std::shared_ptr<Buffer>& data, |
316 | const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count, |
317 | int64_t offset) |
318 | : BinaryArray(utf8(), length, value_offsets, data, null_bitmap, null_count, offset) {} |
319 | |
320 | // ---------------------------------------------------------------------- |
321 | // Fixed width binary |
322 | |
323 | FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data) { |
324 | SetData(data); |
325 | } |
326 | |
327 | FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<DataType>& type, |
328 | int64_t length, |
329 | const std::shared_ptr<Buffer>& data, |
330 | const std::shared_ptr<Buffer>& null_bitmap, |
331 | int64_t null_count, int64_t offset) |
332 | : PrimitiveArray(type, length, data, null_bitmap, null_count, offset), |
333 | byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()) {} |
334 | |
335 | const uint8_t* FixedSizeBinaryArray::GetValue(int64_t i) const { |
336 | return raw_values_ + (i + data_->offset) * byte_width_; |
337 | } |
338 | |
339 | // ---------------------------------------------------------------------- |
340 | // Decimal |
341 | |
342 | Decimal128Array::Decimal128Array(const std::shared_ptr<ArrayData>& data) |
343 | : FixedSizeBinaryArray(data) { |
344 | DCHECK_EQ(data->type->id(), Type::DECIMAL); |
345 | } |
346 | |
347 | std::string Decimal128Array::FormatValue(int64_t i) const { |
348 | const auto& type_ = checked_cast<const Decimal128Type&>(*type()); |
349 | const Decimal128 value(GetValue(i)); |
350 | return value.ToString(type_.scale()); |
351 | } |
352 | |
353 | // ---------------------------------------------------------------------- |
354 | // Struct |
355 | |
356 | StructArray::StructArray(const std::shared_ptr<ArrayData>& data) { |
357 | DCHECK_EQ(data->type->id(), Type::STRUCT); |
358 | SetData(data); |
359 | boxed_fields_.resize(data->child_data.size()); |
360 | } |
361 | |
362 | StructArray::StructArray(const std::shared_ptr<DataType>& type, int64_t length, |
363 | const std::vector<std::shared_ptr<Array>>& children, |
364 | std::shared_ptr<Buffer> null_bitmap, int64_t null_count, |
365 | int64_t offset) { |
366 | SetData(ArrayData::Make(type, length, {null_bitmap}, null_count, offset)); |
367 | for (const auto& child : children) { |
368 | data_->child_data.push_back(child->data()); |
369 | } |
370 | boxed_fields_.resize(children.size()); |
371 | } |
372 | |
373 | const StructType* StructArray::struct_type() const { |
374 | return checked_cast<const StructType*>(data_->type.get()); |
375 | } |
376 | |
377 | std::shared_ptr<Array> StructArray::field(int i) const { |
378 | if (!boxed_fields_[i]) { |
379 | std::shared_ptr<ArrayData> field_data; |
380 | if (data_->offset != 0 || data_->child_data[i]->length != data_->length) { |
381 | field_data = SliceData(*data_->child_data[i].get(), data_->offset, data_->length); |
382 | } else { |
383 | field_data = data_->child_data[i]; |
384 | } |
385 | boxed_fields_[i] = MakeArray(field_data); |
386 | } |
387 | DCHECK(boxed_fields_[i]); |
388 | return boxed_fields_[i]; |
389 | } |
390 | |
391 | std::shared_ptr<Array> StructArray::GetFieldByName(const std::string& name) const { |
392 | int i = struct_type()->GetFieldIndex(name); |
393 | return i == -1 ? nullptr : field(i); |
394 | } |
395 | |
396 | Status StructArray::Flatten(MemoryPool* pool, ArrayVector* out) const { |
397 | ArrayVector flattened; |
398 | std::shared_ptr<Buffer> null_bitmap = data_->buffers[0]; |
399 | |
400 | for (auto& child_data : data_->child_data) { |
401 | std::shared_ptr<Buffer> flattened_null_bitmap; |
402 | int64_t flattened_null_count = kUnknownNullCount; |
403 | |
404 | // Need to adjust for parent offset |
405 | if (data_->offset != 0 || data_->length != child_data->length) { |
406 | child_data = SliceData(*child_data, data_->offset, data_->length); |
407 | } |
408 | std::shared_ptr<Buffer> child_null_bitmap = child_data->buffers[0]; |
409 | const int64_t child_offset = child_data->offset; |
410 | |
411 | // The validity of a flattened datum is the logical AND of the struct |
412 | // element's validity and the individual field element's validity. |
413 | if (null_bitmap && child_null_bitmap) { |
414 | RETURN_NOT_OK(BitmapAnd(pool, child_null_bitmap->data(), child_offset, |
415 | null_bitmap_data_, data_->offset, data_->length, |
416 | child_offset, &flattened_null_bitmap)); |
417 | } else if (child_null_bitmap) { |
418 | flattened_null_bitmap = child_null_bitmap; |
419 | flattened_null_count = child_data->null_count; |
420 | } else if (null_bitmap) { |
421 | if (child_offset == data_->offset) { |
422 | flattened_null_bitmap = null_bitmap; |
423 | } else { |
424 | RETURN_NOT_OK(CopyBitmap(pool, null_bitmap_data_, data_->offset, data_->length, |
425 | &flattened_null_bitmap)); |
426 | } |
427 | flattened_null_count = data_->null_count; |
428 | } else { |
429 | flattened_null_count = 0; |
430 | } |
431 | |
432 | auto flattened_data = child_data->Copy(); |
433 | flattened_data->buffers[0] = flattened_null_bitmap; |
434 | flattened_data->null_count = flattened_null_count; |
435 | |
436 | flattened.push_back(MakeArray(flattened_data)); |
437 | } |
438 | |
439 | *out = flattened; |
440 | return Status::OK(); |
441 | } |
442 | |
443 | // ---------------------------------------------------------------------- |
444 | // UnionArray |
445 | |
446 | void UnionArray::SetData(const std::shared_ptr<ArrayData>& data) { |
447 | this->Array::SetData(data); |
448 | |
449 | DCHECK_EQ(data->buffers.size(), 3); |
450 | |
451 | auto type_ids = data_->buffers[1]; |
452 | auto value_offsets = data_->buffers[2]; |
453 | raw_type_ids_ = |
454 | type_ids == nullptr ? nullptr : reinterpret_cast<const uint8_t*>(type_ids->data()); |
455 | raw_value_offsets_ = value_offsets == nullptr |
456 | ? nullptr |
457 | : reinterpret_cast<const int32_t*>(value_offsets->data()); |
458 | boxed_fields_.resize(data->child_data.size()); |
459 | } |
460 | |
461 | UnionArray::UnionArray(const std::shared_ptr<ArrayData>& data) { |
462 | DCHECK_EQ(data->type->id(), Type::UNION); |
463 | SetData(data); |
464 | } |
465 | |
466 | UnionArray::UnionArray(const std::shared_ptr<DataType>& type, int64_t length, |
467 | const std::vector<std::shared_ptr<Array>>& children, |
468 | const std::shared_ptr<Buffer>& type_ids, |
469 | const std::shared_ptr<Buffer>& value_offsets, |
470 | const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count, |
471 | int64_t offset) { |
472 | auto internal_data = ArrayData::Make( |
473 | type, length, {null_bitmap, type_ids, value_offsets}, null_count, offset); |
474 | for (const auto& child : children) { |
475 | internal_data->child_data.push_back(child->data()); |
476 | } |
477 | SetData(internal_data); |
478 | } |
479 | |
480 | Status UnionArray::MakeDense(const Array& type_ids, const Array& value_offsets, |
481 | const std::vector<std::shared_ptr<Array>>& children, |
482 | std::shared_ptr<Array>* out) { |
483 | if (value_offsets.length() == 0) { |
484 | return Status::Invalid("UnionArray offsets must have non-zero length" ); |
485 | } |
486 | |
487 | if (value_offsets.type_id() != Type::INT32) { |
488 | return Status::Invalid("UnionArray offsets must be signed int32" ); |
489 | } |
490 | |
491 | if (type_ids.type_id() != Type::INT8) { |
492 | return Status::Invalid("UnionArray type_ids must be signed int8" ); |
493 | } |
494 | |
495 | if (value_offsets.null_count() != 0) { |
496 | return Status::Invalid("MakeDense does not allow NAs in value_offsets" ); |
497 | } |
498 | |
499 | BufferVector buffers = {type_ids.null_bitmap(), |
500 | checked_cast<const Int8Array&>(type_ids).values(), |
501 | checked_cast<const Int32Array&>(value_offsets).values()}; |
502 | auto union_type = union_(children, UnionMode::DENSE); |
503 | auto internal_data = ArrayData::Make(union_type, type_ids.length(), std::move(buffers), |
504 | type_ids.null_count(), type_ids.offset()); |
505 | for (const auto& child : children) { |
506 | internal_data->child_data.push_back(child->data()); |
507 | } |
508 | *out = std::make_shared<UnionArray>(internal_data); |
509 | return Status::OK(); |
510 | } |
511 | |
512 | Status UnionArray::MakeSparse(const Array& type_ids, |
513 | const std::vector<std::shared_ptr<Array>>& children, |
514 | std::shared_ptr<Array>* out) { |
515 | if (type_ids.type_id() != Type::INT8) { |
516 | return Status::Invalid("UnionArray type_ids must be signed int8" ); |
517 | } |
518 | BufferVector buffers = {type_ids.null_bitmap(), |
519 | checked_cast<const Int8Array&>(type_ids).values(), nullptr}; |
520 | auto union_type = union_(children, UnionMode::SPARSE); |
521 | auto internal_data = ArrayData::Make(union_type, type_ids.length(), std::move(buffers), |
522 | type_ids.null_count(), type_ids.offset()); |
523 | for (const auto& child : children) { |
524 | internal_data->child_data.push_back(child->data()); |
525 | if (child->length() != type_ids.length()) { |
526 | return Status::Invalid( |
527 | "Sparse UnionArray must have len(child) == len(type_ids) for all children" ); |
528 | } |
529 | } |
530 | *out = std::make_shared<UnionArray>(internal_data); |
531 | return Status::OK(); |
532 | } |
533 | |
534 | std::shared_ptr<Array> UnionArray::child(int i) const { |
535 | if (!boxed_fields_[i]) { |
536 | std::shared_ptr<ArrayData> child_data = data_->child_data[i]; |
537 | if (mode() == UnionMode::SPARSE) { |
538 | // Sparse union: need to adjust child if union is sliced |
539 | // (for dense unions, the need to lookup through the offsets |
540 | // makes this unnecessary) |
541 | if (data_->offset != 0 || child_data->length > data_->length) { |
542 | child_data = SliceData(*child_data.get(), data_->offset, data_->length); |
543 | } |
544 | } |
545 | boxed_fields_[i] = MakeArray(child_data); |
546 | } |
547 | DCHECK(boxed_fields_[i]); |
548 | return boxed_fields_[i]; |
549 | } |
550 | |
551 | const Array* UnionArray::UnsafeChild(int i) const { |
552 | if (!boxed_fields_[i]) { |
553 | boxed_fields_[i] = MakeArray(data_->child_data[i]); |
554 | } |
555 | DCHECK(boxed_fields_[i]); |
556 | return boxed_fields_[i].get(); |
557 | } |
558 | |
559 | // ---------------------------------------------------------------------- |
560 | // DictionaryArray |
561 | |
562 | /// \brief Perform validation check to determine if all dictionary indices |
563 | /// are within valid range (0 <= index < upper_bound) |
564 | /// |
565 | /// \param[in] indices array of dictionary indices |
566 | /// \param[in] upper_bound upper bound of valid range for indices |
567 | /// \return Status |
568 | template <typename ArrowType> |
569 | Status ValidateDictionaryIndices(const std::shared_ptr<Array>& indices, |
570 | const int64_t upper_bound) { |
571 | using ArrayType = typename TypeTraits<ArrowType>::ArrayType; |
572 | const auto& array = checked_cast<const ArrayType&>(*indices); |
573 | const typename ArrowType::c_type* data = array.raw_values(); |
574 | const int64_t size = array.length(); |
575 | |
576 | if (array.null_count() == 0) { |
577 | for (int64_t idx = 0; idx < size; ++idx) { |
578 | if (data[idx] < 0 || data[idx] >= upper_bound) { |
579 | return Status::Invalid("Dictionary has out-of-bound index [0, dict.length)" ); |
580 | } |
581 | } |
582 | } else { |
583 | for (int64_t idx = 0; idx < size; ++idx) { |
584 | if (!array.IsNull(idx)) { |
585 | if (data[idx] < 0 || data[idx] >= upper_bound) { |
586 | return Status::Invalid("Dictionary has out-of-bound index [0, dict.length)" ); |
587 | } |
588 | } |
589 | } |
590 | } |
591 | |
592 | return Status::OK(); |
593 | } |
594 | |
595 | DictionaryArray::DictionaryArray(const std::shared_ptr<ArrayData>& data) |
596 | : dict_type_(checked_cast<const DictionaryType*>(data->type.get())) { |
597 | DCHECK_EQ(data->type->id(), Type::DICTIONARY); |
598 | SetData(data); |
599 | } |
600 | |
601 | DictionaryArray::DictionaryArray(const std::shared_ptr<DataType>& type, |
602 | const std::shared_ptr<Array>& indices) |
603 | : dict_type_(checked_cast<const DictionaryType*>(type.get())) { |
604 | DCHECK_EQ(type->id(), Type::DICTIONARY); |
605 | DCHECK_EQ(indices->type_id(), dict_type_->index_type()->id()); |
606 | auto data = indices->data()->Copy(); |
607 | data->type = type; |
608 | SetData(data); |
609 | } |
610 | |
611 | Status DictionaryArray::FromArrays(const std::shared_ptr<DataType>& type, |
612 | const std::shared_ptr<Array>& indices, |
613 | std::shared_ptr<Array>* out) { |
614 | DCHECK_EQ(type->id(), Type::DICTIONARY); |
615 | const auto& dict = checked_cast<const DictionaryType&>(*type); |
616 | DCHECK_EQ(indices->type_id(), dict.index_type()->id()); |
617 | |
618 | int64_t upper_bound = dict.dictionary()->length(); |
619 | Status is_valid; |
620 | |
621 | switch (indices->type_id()) { |
622 | case Type::INT8: |
623 | is_valid = ValidateDictionaryIndices<Int8Type>(indices, upper_bound); |
624 | break; |
625 | case Type::INT16: |
626 | is_valid = ValidateDictionaryIndices<Int16Type>(indices, upper_bound); |
627 | break; |
628 | case Type::INT32: |
629 | is_valid = ValidateDictionaryIndices<Int32Type>(indices, upper_bound); |
630 | break; |
631 | case Type::INT64: |
632 | is_valid = ValidateDictionaryIndices<Int64Type>(indices, upper_bound); |
633 | break; |
634 | default: |
635 | return Status::NotImplemented("Categorical index type not supported: " , |
636 | indices->type()->ToString()); |
637 | } |
638 | |
639 | if (!is_valid.ok()) { |
640 | return is_valid; |
641 | } |
642 | |
643 | *out = std::make_shared<DictionaryArray>(type, indices); |
644 | return is_valid; |
645 | } |
646 | |
647 | void DictionaryArray::SetData(const std::shared_ptr<ArrayData>& data) { |
648 | this->Array::SetData(data); |
649 | auto indices_data = data_->Copy(); |
650 | indices_data->type = dict_type_->index_type(); |
651 | indices_ = MakeArray(indices_data); |
652 | } |
653 | |
654 | std::shared_ptr<Array> DictionaryArray::indices() const { return indices_; } |
655 | |
656 | std::shared_ptr<Array> DictionaryArray::dictionary() const { |
657 | return dict_type_->dictionary(); |
658 | } |
659 | |
660 | template <typename InType, typename OutType> |
661 | static Status TransposeDictIndices(MemoryPool* pool, const ArrayData& in_data, |
662 | const std::shared_ptr<DataType>& type, |
663 | const std::vector<int32_t>& transpose_map, |
664 | std::shared_ptr<Array>* out) { |
665 | using in_c_type = typename InType::c_type; |
666 | using out_c_type = typename OutType::c_type; |
667 | |
668 | std::shared_ptr<Buffer> out_buffer; |
669 | RETURN_NOT_OK(AllocateBuffer(pool, in_data.length * sizeof(out_c_type), &out_buffer)); |
670 | // Null bitmap is unchanged |
671 | auto out_data = ArrayData::Make(type, in_data.length, {in_data.buffers[0], out_buffer}, |
672 | in_data.null_count); |
673 | internal::TransposeInts(in_data.GetValues<in_c_type>(1), |
674 | out_data->GetMutableValues<out_c_type>(1), in_data.length, |
675 | transpose_map.data()); |
676 | *out = MakeArray(out_data); |
677 | return Status::OK(); |
678 | } |
679 | |
680 | Status DictionaryArray::Transpose(MemoryPool* pool, const std::shared_ptr<DataType>& type, |
681 | const std::vector<int32_t>& transpose_map, |
682 | std::shared_ptr<Array>* out) const { |
683 | DCHECK_EQ(type->id(), Type::DICTIONARY); |
684 | const auto& out_dict_type = checked_cast<const DictionaryType&>(*type); |
685 | |
686 | // XXX We'll probably want to make this operation a kernel when we |
687 | // implement dictionary-to-dictionary casting. |
688 | auto in_type_id = dict_type_->index_type()->id(); |
689 | auto out_type_id = out_dict_type.index_type()->id(); |
690 | |
691 | #define TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, OUT_INDEX_TYPE) \ |
692 | case OUT_INDEX_TYPE::type_id: \ |
693 | return TransposeDictIndices<IN_INDEX_TYPE, OUT_INDEX_TYPE>(pool, *data(), type, \ |
694 | transpose_map, out); |
695 | |
696 | #define TRANSPOSE_IN_CASE(IN_INDEX_TYPE) \ |
697 | case IN_INDEX_TYPE::type_id: \ |
698 | switch (out_type_id) { \ |
699 | TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int8Type) \ |
700 | TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int16Type) \ |
701 | TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int32Type) \ |
702 | TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int64Type) \ |
703 | default: \ |
704 | return Status::NotImplemented("unexpected index type"); \ |
705 | } |
706 | |
707 | switch (in_type_id) { |
708 | TRANSPOSE_IN_CASE(Int8Type) |
709 | TRANSPOSE_IN_CASE(Int16Type) |
710 | TRANSPOSE_IN_CASE(Int32Type) |
711 | TRANSPOSE_IN_CASE(Int64Type) |
712 | default: |
713 | return Status::NotImplemented("unexpected index type" ); |
714 | } |
715 | |
716 | #undef TRANSPOSE_IN_OUT_CASE |
717 | #undef TRANSPOSE_IN_CASE |
718 | } |
719 | |
720 | // ---------------------------------------------------------------------- |
721 | // Implement Array::Accept as inline visitor |
722 | |
723 | Status Array::Accept(ArrayVisitor* visitor) const { |
724 | return VisitArrayInline(*this, visitor); |
725 | } |
726 | |
727 | // ---------------------------------------------------------------------- |
728 | // Implement Array::Validate as inline visitor |
729 | |
730 | namespace internal { |
731 | |
732 | struct ValidateVisitor { |
733 | Status Visit(const NullArray&) { return Status::OK(); } |
734 | |
735 | Status Visit(const PrimitiveArray& array) { |
736 | ARROW_RETURN_IF(array.data()->buffers.size() != 2, |
737 | Status::Invalid("number of buffers was != 2" )); |
738 | |
739 | ARROW_RETURN_IF(array.values() == nullptr, Status::Invalid("values was null" )); |
740 | |
741 | return Status::OK(); |
742 | } |
743 | |
744 | Status Visit(const Decimal128Array& array) { |
745 | if (array.data()->buffers.size() != 2) { |
746 | return Status::Invalid("number of buffers was != 2" ); |
747 | } |
748 | if (array.values() == nullptr) { |
749 | return Status::Invalid("values was null" ); |
750 | } |
751 | return Status::OK(); |
752 | } |
753 | |
754 | Status Visit(const BinaryArray& array) { |
755 | if (array.data()->buffers.size() != 3) { |
756 | return Status::Invalid("number of buffers was != 3" ); |
757 | } |
758 | return Status::OK(); |
759 | } |
760 | |
761 | Status Visit(const ListArray& array) { |
762 | if (array.length() < 0) { |
763 | return Status::Invalid("Length was negative" ); |
764 | } |
765 | |
766 | auto value_offsets = array.value_offsets(); |
767 | if (array.length() && !value_offsets) { |
768 | return Status::Invalid("value_offsets_ was null" ); |
769 | } |
770 | if (value_offsets->size() / static_cast<int>(sizeof(int32_t)) < array.length()) { |
771 | return Status::Invalid("offset buffer size (bytes): " , value_offsets->size(), |
772 | " isn't large enough for length: " , array.length()); |
773 | } |
774 | |
775 | if (!array.values()) { |
776 | return Status::Invalid("values was null" ); |
777 | } |
778 | |
779 | const int32_t last_offset = array.value_offset(array.length()); |
780 | if (array.values()->length() != last_offset) { |
781 | return Status::Invalid("Final offset invariant not equal to values length: " , |
782 | last_offset, "!=" , array.values()->length()); |
783 | } |
784 | |
785 | const Status child_valid = ValidateArray(*array.values()); |
786 | if (!child_valid.ok()) { |
787 | return Status::Invalid("Child array invalid: " , child_valid.ToString()); |
788 | } |
789 | |
790 | int32_t prev_offset = array.value_offset(0); |
791 | if (prev_offset != 0) { |
792 | return Status::Invalid("The first offset wasn't zero" ); |
793 | } |
794 | for (int64_t i = 1; i <= array.length(); ++i) { |
795 | int32_t current_offset = array.value_offset(i); |
796 | if (array.IsNull(i - 1) && current_offset != prev_offset) { |
797 | return Status::Invalid("Offset invariant failure at: " , i, |
798 | " inconsistent value_offsets for null slot" , |
799 | current_offset, "!=" , prev_offset); |
800 | } |
801 | if (current_offset < prev_offset) { |
802 | return Status::Invalid("Offset invariant failure: " , i, |
803 | " inconsistent offset for non-null slot: " , current_offset, |
804 | "<" , prev_offset); |
805 | } |
806 | prev_offset = current_offset; |
807 | } |
808 | return Status::OK(); |
809 | } |
810 | |
811 | Status Visit(const StructArray& array) { |
812 | if (array.length() < 0) { |
813 | return Status::Invalid("Length was negative" ); |
814 | } |
815 | |
816 | if (array.null_count() > array.length()) { |
817 | return Status::Invalid("Null count exceeds the length of this struct" ); |
818 | } |
819 | |
820 | if (array.num_fields() > 0) { |
821 | // Validate fields |
822 | int64_t array_length = array.field(0)->length(); |
823 | size_t idx = 0; |
824 | for (int i = 0; i < array.num_fields(); ++i) { |
825 | auto it = array.field(i); |
826 | if (it->length() != array_length) { |
827 | return Status::Invalid("Length is not equal from field " , |
828 | it->type()->ToString(), " at position [" , idx, "]" ); |
829 | } |
830 | |
831 | const Status child_valid = ValidateArray(*it); |
832 | if (!child_valid.ok()) { |
833 | return Status::Invalid("Child array invalid: " , child_valid.ToString(), |
834 | " at position [" , idx, "}" ); |
835 | } |
836 | ++idx; |
837 | } |
838 | |
839 | if (array_length > 0 && array_length != array.length()) { |
840 | return Status::Invalid("Struct's length is not equal to its child arrays" ); |
841 | } |
842 | } |
843 | return Status::OK(); |
844 | } |
845 | |
846 | Status Visit(const UnionArray& array) { |
847 | if (array.length() < 0) { |
848 | return Status::Invalid("Length was negative" ); |
849 | } |
850 | |
851 | if (array.null_count() > array.length()) { |
852 | return Status::Invalid("Null count exceeds the length of this struct" ); |
853 | } |
854 | return Status::OK(); |
855 | } |
856 | |
857 | Status Visit(const DictionaryArray& array) { |
858 | Type::type index_type_id = array.indices()->type()->id(); |
859 | if (!is_integer(index_type_id)) { |
860 | return Status::Invalid("Dictionary indices must be integer type" ); |
861 | } |
862 | return Status::OK(); |
863 | } |
864 | }; |
865 | |
866 | } // namespace internal |
867 | |
868 | Status ValidateArray(const Array& array) { |
869 | internal::ValidateVisitor validate_visitor; |
870 | return VisitArrayInline(array, &validate_visitor); |
871 | } |
872 | |
873 | // ---------------------------------------------------------------------- |
874 | // Loading from ArrayData |
875 | |
876 | namespace internal { |
877 | |
878 | class ArrayDataWrapper { |
879 | public: |
880 | ArrayDataWrapper(const std::shared_ptr<ArrayData>& data, std::shared_ptr<Array>* out) |
881 | : data_(data), out_(out) {} |
882 | |
883 | template <typename T> |
884 | Status Visit(const T&) { |
885 | using ArrayType = typename TypeTraits<T>::ArrayType; |
886 | *out_ = std::make_shared<ArrayType>(data_); |
887 | return Status::OK(); |
888 | } |
889 | |
890 | const std::shared_ptr<ArrayData>& data_; |
891 | std::shared_ptr<Array>* out_; |
892 | }; |
893 | |
894 | } // namespace internal |
895 | |
896 | std::shared_ptr<Array> MakeArray(const std::shared_ptr<ArrayData>& data) { |
897 | std::shared_ptr<Array> out; |
898 | internal::ArrayDataWrapper wrapper_visitor(data, &out); |
899 | Status s = VisitTypeInline(*data->type, &wrapper_visitor); |
900 | DCHECK(s.ok()); |
901 | DCHECK(out); |
902 | return out; |
903 | } |
904 | |
905 | // ---------------------------------------------------------------------- |
906 | // Misc APIs |
907 | |
908 | namespace internal { |
909 | |
910 | std::vector<ArrayVector> RechunkArraysConsistently( |
911 | const std::vector<ArrayVector>& groups) { |
912 | if (groups.size() <= 1) { |
913 | return groups; |
914 | } |
915 | int64_t total_length = 0; |
916 | for (const auto& array : groups.front()) { |
917 | total_length += array->length(); |
918 | } |
919 | #ifndef NDEBUG |
920 | for (const auto& group : groups) { |
921 | int64_t group_length = 0; |
922 | for (const auto& array : group) { |
923 | group_length += array->length(); |
924 | } |
925 | DCHECK_EQ(group_length, total_length) |
926 | << "Array groups should have the same total number of elements" ; |
927 | } |
928 | #endif |
929 | if (total_length == 0) { |
930 | return groups; |
931 | } |
932 | |
933 | // Set up result vectors |
934 | std::vector<ArrayVector> rechunked_groups(groups.size()); |
935 | |
936 | // Set up progress counters |
937 | std::vector<ArrayVector::const_iterator> current_arrays; |
938 | std::vector<int64_t> array_offsets; |
939 | for (const auto& group : groups) { |
940 | current_arrays.emplace_back(group.cbegin()); |
941 | array_offsets.emplace_back(0); |
942 | } |
943 | |
944 | // Scan all array vectors at once, rechunking along the way |
945 | int64_t start = 0; |
946 | while (start < total_length) { |
947 | // First compute max possible length for next chunk |
948 | int64_t chunk_length = std::numeric_limits<int64_t>::max(); |
949 | for (size_t i = 0; i < groups.size(); i++) { |
950 | auto& arr_it = current_arrays[i]; |
951 | auto& offset = array_offsets[i]; |
952 | // Skip any done arrays (including 0-length arrays) |
953 | while (offset == (*arr_it)->length()) { |
954 | ++arr_it; |
955 | offset = 0; |
956 | } |
957 | const auto& array = *arr_it; |
958 | DCHECK_GT(array->length(), offset); |
959 | chunk_length = std::min(chunk_length, array->length() - offset); |
960 | } |
961 | DCHECK_GT(chunk_length, 0); |
962 | |
963 | // Then slice all arrays along this chunk size |
964 | for (size_t i = 0; i < groups.size(); i++) { |
965 | const auto& array = *current_arrays[i]; |
966 | auto& offset = array_offsets[i]; |
967 | if (offset == 0 && array->length() == chunk_length) { |
968 | // Slice spans entire array |
969 | rechunked_groups[i].emplace_back(array); |
970 | } else { |
971 | DCHECK_LT(chunk_length - offset, array->length()); |
972 | rechunked_groups[i].emplace_back(array->Slice(offset, chunk_length)); |
973 | } |
974 | offset += chunk_length; |
975 | } |
976 | start += chunk_length; |
977 | } |
978 | |
979 | return rechunked_groups; |
980 | } |
981 | |
982 | } // namespace internal |
983 | |
984 | } // namespace arrow |
985 | |