1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include "arrow/array.h"
19
20#include <algorithm>
21#include <cstddef>
22#include <cstdint>
23#include <limits>
24#include <sstream>
25#include <utility>
26
27#include "arrow/buffer.h"
28#include "arrow/compare.h"
29#include "arrow/pretty_print.h"
30#include "arrow/status.h"
31#include "arrow/type.h"
32#include "arrow/type_traits.h"
33#include "arrow/util/bit-util.h"
34#include "arrow/util/checked_cast.h"
35#include "arrow/util/decimal.h"
36#include "arrow/util/int-util.h"
37#include "arrow/util/logging.h"
38#include "arrow/util/macros.h"
39#include "arrow/visitor.h"
40#include "arrow/visitor_inline.h"
41
42namespace arrow {
43
44using internal::BitmapAnd;
45using internal::checked_cast;
46using internal::CopyBitmap;
47using internal::CountSetBits;
48
49std::shared_ptr<ArrayData> ArrayData::Make(const std::shared_ptr<DataType>& type,
50 int64_t length,
51 std::vector<std::shared_ptr<Buffer>>&& buffers,
52 int64_t null_count, int64_t offset) {
53 return std::make_shared<ArrayData>(type, length, std::move(buffers), null_count,
54 offset);
55}
56
57std::shared_ptr<ArrayData> ArrayData::Make(
58 const std::shared_ptr<DataType>& type, int64_t length,
59 const std::vector<std::shared_ptr<Buffer>>& buffers, int64_t null_count,
60 int64_t offset) {
61 return std::make_shared<ArrayData>(type, length, buffers, null_count, offset);
62}
63
64std::shared_ptr<ArrayData> ArrayData::Make(
65 const std::shared_ptr<DataType>& type, int64_t length,
66 const std::vector<std::shared_ptr<Buffer>>& buffers,
67 const std::vector<std::shared_ptr<ArrayData>>& child_data, int64_t null_count,
68 int64_t offset) {
69 return std::make_shared<ArrayData>(type, length, buffers, child_data, null_count,
70 offset);
71}
72
73std::shared_ptr<ArrayData> ArrayData::Make(const std::shared_ptr<DataType>& type,
74 int64_t length, int64_t null_count,
75 int64_t offset) {
76 return std::make_shared<ArrayData>(type, length, null_count, offset);
77}
78
79// ----------------------------------------------------------------------
80// Base array class
81
82int64_t Array::null_count() const {
83 if (ARROW_PREDICT_FALSE(data_->null_count < 0)) {
84 if (data_->buffers[0]) {
85 data_->null_count =
86 data_->length - CountSetBits(null_bitmap_data_, data_->offset, data_->length);
87
88 } else {
89 data_->null_count = 0;
90 }
91 }
92 return data_->null_count;
93}
94
95bool Array::Equals(const Array& arr) const { return ArrayEquals(*this, arr); }
96
97bool Array::Equals(const std::shared_ptr<Array>& arr) const {
98 if (!arr) {
99 return false;
100 }
101 return Equals(*arr);
102}
103
104bool Array::ApproxEquals(const Array& arr) const { return ArrayApproxEquals(*this, arr); }
105
106bool Array::ApproxEquals(const std::shared_ptr<Array>& arr) const {
107 if (!arr) {
108 return false;
109 }
110 return ApproxEquals(*arr);
111}
112
113bool Array::RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
114 const std::shared_ptr<Array>& other) const {
115 if (!other) {
116 return false;
117 }
118 return RangeEquals(*other, start_idx, end_idx, other_start_idx);
119}
120
121bool Array::RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
122 int64_t other_start_idx) const {
123 return ArrayRangeEquals(*this, other, start_idx, end_idx, other_start_idx);
124}
125
126static inline std::shared_ptr<ArrayData> SliceData(const ArrayData& data, int64_t offset,
127 int64_t length) {
128 DCHECK_LE(offset, data.length);
129 length = std::min(data.length - offset, length);
130 offset += data.offset;
131
132 auto new_data = data.Copy();
133 new_data->length = length;
134 new_data->offset = offset;
135 new_data->null_count = data.null_count != 0 ? kUnknownNullCount : 0;
136 return new_data;
137}
138
139std::shared_ptr<Array> Array::Slice(int64_t offset, int64_t length) const {
140 return MakeArray(SliceData(*data_, offset, length));
141}
142
143std::shared_ptr<Array> Array::Slice(int64_t offset) const {
144 int64_t slice_length = data_->length - offset;
145 return Slice(offset, slice_length);
146}
147
148std::string Array::ToString() const {
149 std::stringstream ss;
150 DCHECK(PrettyPrint(*this, 0, &ss).ok());
151 return ss.str();
152}
153
154NullArray::NullArray(int64_t length) {
155 SetData(ArrayData::Make(null(), length, {nullptr}, length));
156}
157
158// ----------------------------------------------------------------------
159// Primitive array base
160
161PrimitiveArray::PrimitiveArray(const std::shared_ptr<DataType>& type, int64_t length,
162 const std::shared_ptr<Buffer>& data,
163 const std::shared_ptr<Buffer>& null_bitmap,
164 int64_t null_count, int64_t offset) {
165 SetData(ArrayData::Make(type, length, {null_bitmap, data}, null_count, offset));
166}
167
168// ----------------------------------------------------------------------
169// BooleanArray
170
171BooleanArray::BooleanArray(const std::shared_ptr<ArrayData>& data)
172 : PrimitiveArray(data) {
173 DCHECK_EQ(data->type->id(), Type::BOOL);
174}
175
176BooleanArray::BooleanArray(int64_t length, const std::shared_ptr<Buffer>& data,
177 const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
178 int64_t offset)
179 : PrimitiveArray(boolean(), length, data, null_bitmap, null_count, offset) {}
180
181// ----------------------------------------------------------------------
182// ListArray
183
184ListArray::ListArray(const std::shared_ptr<ArrayData>& data) {
185 DCHECK_EQ(data->type->id(), Type::LIST);
186 SetData(data);
187}
188
189ListArray::ListArray(const std::shared_ptr<DataType>& type, int64_t length,
190 const std::shared_ptr<Buffer>& value_offsets,
191 const std::shared_ptr<Array>& values,
192 const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
193 int64_t offset) {
194 auto internal_data =
195 ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset);
196 internal_data->child_data.emplace_back(values->data());
197 SetData(internal_data);
198}
199
200Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPool* pool,
201 std::shared_ptr<Array>* out) {
202 if (offsets.length() == 0) {
203 return Status::Invalid("List offsets must have non-zero length");
204 }
205
206 if (offsets.type_id() != Type::INT32) {
207 return Status::Invalid("List offsets must be signed int32");
208 }
209
210 BufferVector buffers = {};
211
212 const auto& typed_offsets = checked_cast<const Int32Array&>(offsets);
213
214 const int64_t num_offsets = offsets.length();
215
216 if (offsets.null_count() > 0) {
217 std::shared_ptr<Buffer> clean_offsets, clean_valid_bits;
218
219 RETURN_NOT_OK(AllocateBuffer(pool, num_offsets * sizeof(int32_t), &clean_offsets));
220
221 // Copy valid bits, zero out the bit for the final offset
222 RETURN_NOT_OK(offsets.null_bitmap()->Copy(0, BitUtil::BytesForBits(num_offsets - 1),
223 &clean_valid_bits));
224 BitUtil::ClearBit(clean_valid_bits->mutable_data(), num_offsets);
225 buffers.emplace_back(std::move(clean_valid_bits));
226
227 const int32_t* raw_offsets = typed_offsets.raw_values();
228 auto clean_raw_offsets = reinterpret_cast<int32_t*>(clean_offsets->mutable_data());
229
230 // Must work backwards so we can tell how many values were in the last non-null value
231 DCHECK(offsets.IsValid(num_offsets - 1));
232 int32_t current_offset = raw_offsets[num_offsets - 1];
233 for (int64_t i = num_offsets - 1; i >= 0; --i) {
234 if (offsets.IsValid(i)) {
235 current_offset = raw_offsets[i];
236 }
237 clean_raw_offsets[i] = current_offset;
238 }
239
240 buffers.emplace_back(std::move(clean_offsets));
241 } else {
242 buffers.emplace_back(offsets.null_bitmap());
243 buffers.emplace_back(typed_offsets.values());
244 }
245
246 auto list_type = list(values.type());
247 auto internal_data = ArrayData::Make(list_type, num_offsets - 1, std::move(buffers),
248 offsets.null_count(), offsets.offset());
249 internal_data->child_data.push_back(values.data());
250
251 *out = std::make_shared<ListArray>(internal_data);
252 return Status::OK();
253}
254
255void ListArray::SetData(const std::shared_ptr<ArrayData>& data) {
256 this->Array::SetData(data);
257 DCHECK_EQ(data->buffers.size(), 2);
258
259 auto value_offsets = data->buffers[1];
260 raw_value_offsets_ = value_offsets == nullptr
261 ? nullptr
262 : reinterpret_cast<const int32_t*>(value_offsets->data());
263
264 DCHECK_EQ(data_->child_data.size(), 1);
265 values_ = MakeArray(data_->child_data[0]);
266}
267
268std::shared_ptr<DataType> ListArray::value_type() const {
269 return checked_cast<const ListType&>(*type()).value_type();
270}
271
272std::shared_ptr<Array> ListArray::values() const { return values_; }
273
274// ----------------------------------------------------------------------
275// String and binary
276
277BinaryArray::BinaryArray(const std::shared_ptr<ArrayData>& data) {
278 DCHECK_EQ(data->type->id(), Type::BINARY);
279 SetData(data);
280}
281
282void BinaryArray::SetData(const std::shared_ptr<ArrayData>& data) {
283 DCHECK_EQ(data->buffers.size(), 3);
284 auto value_offsets = data->buffers[1];
285 auto value_data = data->buffers[2];
286 this->Array::SetData(data);
287 raw_data_ = value_data == nullptr ? nullptr : value_data->data();
288 raw_value_offsets_ = value_offsets == nullptr
289 ? nullptr
290 : reinterpret_cast<const int32_t*>(value_offsets->data());
291}
292
293BinaryArray::BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
294 const std::shared_ptr<Buffer>& data,
295 const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
296 int64_t offset)
297 : BinaryArray(binary(), length, value_offsets, data, null_bitmap, null_count,
298 offset) {}
299
300BinaryArray::BinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
301 const std::shared_ptr<Buffer>& value_offsets,
302 const std::shared_ptr<Buffer>& data,
303 const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
304 int64_t offset) {
305 SetData(ArrayData::Make(type, length, {null_bitmap, value_offsets, data}, null_count,
306 offset));
307}
308
309StringArray::StringArray(const std::shared_ptr<ArrayData>& data) {
310 DCHECK_EQ(data->type->id(), Type::STRING);
311 SetData(data);
312}
313
314StringArray::StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
315 const std::shared_ptr<Buffer>& data,
316 const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
317 int64_t offset)
318 : BinaryArray(utf8(), length, value_offsets, data, null_bitmap, null_count, offset) {}
319
320// ----------------------------------------------------------------------
321// Fixed width binary
322
323FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data) {
324 SetData(data);
325}
326
327FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<DataType>& type,
328 int64_t length,
329 const std::shared_ptr<Buffer>& data,
330 const std::shared_ptr<Buffer>& null_bitmap,
331 int64_t null_count, int64_t offset)
332 : PrimitiveArray(type, length, data, null_bitmap, null_count, offset),
333 byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()) {}
334
335const uint8_t* FixedSizeBinaryArray::GetValue(int64_t i) const {
336 return raw_values_ + (i + data_->offset) * byte_width_;
337}
338
339// ----------------------------------------------------------------------
340// Decimal
341
342Decimal128Array::Decimal128Array(const std::shared_ptr<ArrayData>& data)
343 : FixedSizeBinaryArray(data) {
344 DCHECK_EQ(data->type->id(), Type::DECIMAL);
345}
346
347std::string Decimal128Array::FormatValue(int64_t i) const {
348 const auto& type_ = checked_cast<const Decimal128Type&>(*type());
349 const Decimal128 value(GetValue(i));
350 return value.ToString(type_.scale());
351}
352
353// ----------------------------------------------------------------------
354// Struct
355
356StructArray::StructArray(const std::shared_ptr<ArrayData>& data) {
357 DCHECK_EQ(data->type->id(), Type::STRUCT);
358 SetData(data);
359 boxed_fields_.resize(data->child_data.size());
360}
361
362StructArray::StructArray(const std::shared_ptr<DataType>& type, int64_t length,
363 const std::vector<std::shared_ptr<Array>>& children,
364 std::shared_ptr<Buffer> null_bitmap, int64_t null_count,
365 int64_t offset) {
366 SetData(ArrayData::Make(type, length, {null_bitmap}, null_count, offset));
367 for (const auto& child : children) {
368 data_->child_data.push_back(child->data());
369 }
370 boxed_fields_.resize(children.size());
371}
372
373const StructType* StructArray::struct_type() const {
374 return checked_cast<const StructType*>(data_->type.get());
375}
376
377std::shared_ptr<Array> StructArray::field(int i) const {
378 if (!boxed_fields_[i]) {
379 std::shared_ptr<ArrayData> field_data;
380 if (data_->offset != 0 || data_->child_data[i]->length != data_->length) {
381 field_data = SliceData(*data_->child_data[i].get(), data_->offset, data_->length);
382 } else {
383 field_data = data_->child_data[i];
384 }
385 boxed_fields_[i] = MakeArray(field_data);
386 }
387 DCHECK(boxed_fields_[i]);
388 return boxed_fields_[i];
389}
390
391std::shared_ptr<Array> StructArray::GetFieldByName(const std::string& name) const {
392 int i = struct_type()->GetFieldIndex(name);
393 return i == -1 ? nullptr : field(i);
394}
395
396Status StructArray::Flatten(MemoryPool* pool, ArrayVector* out) const {
397 ArrayVector flattened;
398 std::shared_ptr<Buffer> null_bitmap = data_->buffers[0];
399
400 for (auto& child_data : data_->child_data) {
401 std::shared_ptr<Buffer> flattened_null_bitmap;
402 int64_t flattened_null_count = kUnknownNullCount;
403
404 // Need to adjust for parent offset
405 if (data_->offset != 0 || data_->length != child_data->length) {
406 child_data = SliceData(*child_data, data_->offset, data_->length);
407 }
408 std::shared_ptr<Buffer> child_null_bitmap = child_data->buffers[0];
409 const int64_t child_offset = child_data->offset;
410
411 // The validity of a flattened datum is the logical AND of the struct
412 // element's validity and the individual field element's validity.
413 if (null_bitmap && child_null_bitmap) {
414 RETURN_NOT_OK(BitmapAnd(pool, child_null_bitmap->data(), child_offset,
415 null_bitmap_data_, data_->offset, data_->length,
416 child_offset, &flattened_null_bitmap));
417 } else if (child_null_bitmap) {
418 flattened_null_bitmap = child_null_bitmap;
419 flattened_null_count = child_data->null_count;
420 } else if (null_bitmap) {
421 if (child_offset == data_->offset) {
422 flattened_null_bitmap = null_bitmap;
423 } else {
424 RETURN_NOT_OK(CopyBitmap(pool, null_bitmap_data_, data_->offset, data_->length,
425 &flattened_null_bitmap));
426 }
427 flattened_null_count = data_->null_count;
428 } else {
429 flattened_null_count = 0;
430 }
431
432 auto flattened_data = child_data->Copy();
433 flattened_data->buffers[0] = flattened_null_bitmap;
434 flattened_data->null_count = flattened_null_count;
435
436 flattened.push_back(MakeArray(flattened_data));
437 }
438
439 *out = flattened;
440 return Status::OK();
441}
442
443// ----------------------------------------------------------------------
444// UnionArray
445
446void UnionArray::SetData(const std::shared_ptr<ArrayData>& data) {
447 this->Array::SetData(data);
448
449 DCHECK_EQ(data->buffers.size(), 3);
450
451 auto type_ids = data_->buffers[1];
452 auto value_offsets = data_->buffers[2];
453 raw_type_ids_ =
454 type_ids == nullptr ? nullptr : reinterpret_cast<const uint8_t*>(type_ids->data());
455 raw_value_offsets_ = value_offsets == nullptr
456 ? nullptr
457 : reinterpret_cast<const int32_t*>(value_offsets->data());
458 boxed_fields_.resize(data->child_data.size());
459}
460
461UnionArray::UnionArray(const std::shared_ptr<ArrayData>& data) {
462 DCHECK_EQ(data->type->id(), Type::UNION);
463 SetData(data);
464}
465
466UnionArray::UnionArray(const std::shared_ptr<DataType>& type, int64_t length,
467 const std::vector<std::shared_ptr<Array>>& children,
468 const std::shared_ptr<Buffer>& type_ids,
469 const std::shared_ptr<Buffer>& value_offsets,
470 const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
471 int64_t offset) {
472 auto internal_data = ArrayData::Make(
473 type, length, {null_bitmap, type_ids, value_offsets}, null_count, offset);
474 for (const auto& child : children) {
475 internal_data->child_data.push_back(child->data());
476 }
477 SetData(internal_data);
478}
479
480Status UnionArray::MakeDense(const Array& type_ids, const Array& value_offsets,
481 const std::vector<std::shared_ptr<Array>>& children,
482 std::shared_ptr<Array>* out) {
483 if (value_offsets.length() == 0) {
484 return Status::Invalid("UnionArray offsets must have non-zero length");
485 }
486
487 if (value_offsets.type_id() != Type::INT32) {
488 return Status::Invalid("UnionArray offsets must be signed int32");
489 }
490
491 if (type_ids.type_id() != Type::INT8) {
492 return Status::Invalid("UnionArray type_ids must be signed int8");
493 }
494
495 if (value_offsets.null_count() != 0) {
496 return Status::Invalid("MakeDense does not allow NAs in value_offsets");
497 }
498
499 BufferVector buffers = {type_ids.null_bitmap(),
500 checked_cast<const Int8Array&>(type_ids).values(),
501 checked_cast<const Int32Array&>(value_offsets).values()};
502 auto union_type = union_(children, UnionMode::DENSE);
503 auto internal_data = ArrayData::Make(union_type, type_ids.length(), std::move(buffers),
504 type_ids.null_count(), type_ids.offset());
505 for (const auto& child : children) {
506 internal_data->child_data.push_back(child->data());
507 }
508 *out = std::make_shared<UnionArray>(internal_data);
509 return Status::OK();
510}
511
512Status UnionArray::MakeSparse(const Array& type_ids,
513 const std::vector<std::shared_ptr<Array>>& children,
514 std::shared_ptr<Array>* out) {
515 if (type_ids.type_id() != Type::INT8) {
516 return Status::Invalid("UnionArray type_ids must be signed int8");
517 }
518 BufferVector buffers = {type_ids.null_bitmap(),
519 checked_cast<const Int8Array&>(type_ids).values(), nullptr};
520 auto union_type = union_(children, UnionMode::SPARSE);
521 auto internal_data = ArrayData::Make(union_type, type_ids.length(), std::move(buffers),
522 type_ids.null_count(), type_ids.offset());
523 for (const auto& child : children) {
524 internal_data->child_data.push_back(child->data());
525 if (child->length() != type_ids.length()) {
526 return Status::Invalid(
527 "Sparse UnionArray must have len(child) == len(type_ids) for all children");
528 }
529 }
530 *out = std::make_shared<UnionArray>(internal_data);
531 return Status::OK();
532}
533
534std::shared_ptr<Array> UnionArray::child(int i) const {
535 if (!boxed_fields_[i]) {
536 std::shared_ptr<ArrayData> child_data = data_->child_data[i];
537 if (mode() == UnionMode::SPARSE) {
538 // Sparse union: need to adjust child if union is sliced
539 // (for dense unions, the need to lookup through the offsets
540 // makes this unnecessary)
541 if (data_->offset != 0 || child_data->length > data_->length) {
542 child_data = SliceData(*child_data.get(), data_->offset, data_->length);
543 }
544 }
545 boxed_fields_[i] = MakeArray(child_data);
546 }
547 DCHECK(boxed_fields_[i]);
548 return boxed_fields_[i];
549}
550
551const Array* UnionArray::UnsafeChild(int i) const {
552 if (!boxed_fields_[i]) {
553 boxed_fields_[i] = MakeArray(data_->child_data[i]);
554 }
555 DCHECK(boxed_fields_[i]);
556 return boxed_fields_[i].get();
557}
558
559// ----------------------------------------------------------------------
560// DictionaryArray
561
562/// \brief Perform validation check to determine if all dictionary indices
563/// are within valid range (0 <= index < upper_bound)
564///
565/// \param[in] indices array of dictionary indices
566/// \param[in] upper_bound upper bound of valid range for indices
567/// \return Status
568template <typename ArrowType>
569Status ValidateDictionaryIndices(const std::shared_ptr<Array>& indices,
570 const int64_t upper_bound) {
571 using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
572 const auto& array = checked_cast<const ArrayType&>(*indices);
573 const typename ArrowType::c_type* data = array.raw_values();
574 const int64_t size = array.length();
575
576 if (array.null_count() == 0) {
577 for (int64_t idx = 0; idx < size; ++idx) {
578 if (data[idx] < 0 || data[idx] >= upper_bound) {
579 return Status::Invalid("Dictionary has out-of-bound index [0, dict.length)");
580 }
581 }
582 } else {
583 for (int64_t idx = 0; idx < size; ++idx) {
584 if (!array.IsNull(idx)) {
585 if (data[idx] < 0 || data[idx] >= upper_bound) {
586 return Status::Invalid("Dictionary has out-of-bound index [0, dict.length)");
587 }
588 }
589 }
590 }
591
592 return Status::OK();
593}
594
595DictionaryArray::DictionaryArray(const std::shared_ptr<ArrayData>& data)
596 : dict_type_(checked_cast<const DictionaryType*>(data->type.get())) {
597 DCHECK_EQ(data->type->id(), Type::DICTIONARY);
598 SetData(data);
599}
600
601DictionaryArray::DictionaryArray(const std::shared_ptr<DataType>& type,
602 const std::shared_ptr<Array>& indices)
603 : dict_type_(checked_cast<const DictionaryType*>(type.get())) {
604 DCHECK_EQ(type->id(), Type::DICTIONARY);
605 DCHECK_EQ(indices->type_id(), dict_type_->index_type()->id());
606 auto data = indices->data()->Copy();
607 data->type = type;
608 SetData(data);
609}
610
611Status DictionaryArray::FromArrays(const std::shared_ptr<DataType>& type,
612 const std::shared_ptr<Array>& indices,
613 std::shared_ptr<Array>* out) {
614 DCHECK_EQ(type->id(), Type::DICTIONARY);
615 const auto& dict = checked_cast<const DictionaryType&>(*type);
616 DCHECK_EQ(indices->type_id(), dict.index_type()->id());
617
618 int64_t upper_bound = dict.dictionary()->length();
619 Status is_valid;
620
621 switch (indices->type_id()) {
622 case Type::INT8:
623 is_valid = ValidateDictionaryIndices<Int8Type>(indices, upper_bound);
624 break;
625 case Type::INT16:
626 is_valid = ValidateDictionaryIndices<Int16Type>(indices, upper_bound);
627 break;
628 case Type::INT32:
629 is_valid = ValidateDictionaryIndices<Int32Type>(indices, upper_bound);
630 break;
631 case Type::INT64:
632 is_valid = ValidateDictionaryIndices<Int64Type>(indices, upper_bound);
633 break;
634 default:
635 return Status::NotImplemented("Categorical index type not supported: ",
636 indices->type()->ToString());
637 }
638
639 if (!is_valid.ok()) {
640 return is_valid;
641 }
642
643 *out = std::make_shared<DictionaryArray>(type, indices);
644 return is_valid;
645}
646
647void DictionaryArray::SetData(const std::shared_ptr<ArrayData>& data) {
648 this->Array::SetData(data);
649 auto indices_data = data_->Copy();
650 indices_data->type = dict_type_->index_type();
651 indices_ = MakeArray(indices_data);
652}
653
654std::shared_ptr<Array> DictionaryArray::indices() const { return indices_; }
655
656std::shared_ptr<Array> DictionaryArray::dictionary() const {
657 return dict_type_->dictionary();
658}
659
660template <typename InType, typename OutType>
661static Status TransposeDictIndices(MemoryPool* pool, const ArrayData& in_data,
662 const std::shared_ptr<DataType>& type,
663 const std::vector<int32_t>& transpose_map,
664 std::shared_ptr<Array>* out) {
665 using in_c_type = typename InType::c_type;
666 using out_c_type = typename OutType::c_type;
667
668 std::shared_ptr<Buffer> out_buffer;
669 RETURN_NOT_OK(AllocateBuffer(pool, in_data.length * sizeof(out_c_type), &out_buffer));
670 // Null bitmap is unchanged
671 auto out_data = ArrayData::Make(type, in_data.length, {in_data.buffers[0], out_buffer},
672 in_data.null_count);
673 internal::TransposeInts(in_data.GetValues<in_c_type>(1),
674 out_data->GetMutableValues<out_c_type>(1), in_data.length,
675 transpose_map.data());
676 *out = MakeArray(out_data);
677 return Status::OK();
678}
679
680Status DictionaryArray::Transpose(MemoryPool* pool, const std::shared_ptr<DataType>& type,
681 const std::vector<int32_t>& transpose_map,
682 std::shared_ptr<Array>* out) const {
683 DCHECK_EQ(type->id(), Type::DICTIONARY);
684 const auto& out_dict_type = checked_cast<const DictionaryType&>(*type);
685
686 // XXX We'll probably want to make this operation a kernel when we
687 // implement dictionary-to-dictionary casting.
688 auto in_type_id = dict_type_->index_type()->id();
689 auto out_type_id = out_dict_type.index_type()->id();
690
691#define TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, OUT_INDEX_TYPE) \
692 case OUT_INDEX_TYPE::type_id: \
693 return TransposeDictIndices<IN_INDEX_TYPE, OUT_INDEX_TYPE>(pool, *data(), type, \
694 transpose_map, out);
695
696#define TRANSPOSE_IN_CASE(IN_INDEX_TYPE) \
697 case IN_INDEX_TYPE::type_id: \
698 switch (out_type_id) { \
699 TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int8Type) \
700 TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int16Type) \
701 TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int32Type) \
702 TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int64Type) \
703 default: \
704 return Status::NotImplemented("unexpected index type"); \
705 }
706
707 switch (in_type_id) {
708 TRANSPOSE_IN_CASE(Int8Type)
709 TRANSPOSE_IN_CASE(Int16Type)
710 TRANSPOSE_IN_CASE(Int32Type)
711 TRANSPOSE_IN_CASE(Int64Type)
712 default:
713 return Status::NotImplemented("unexpected index type");
714 }
715
716#undef TRANSPOSE_IN_OUT_CASE
717#undef TRANSPOSE_IN_CASE
718}
719
720// ----------------------------------------------------------------------
721// Implement Array::Accept as inline visitor
722
723Status Array::Accept(ArrayVisitor* visitor) const {
724 return VisitArrayInline(*this, visitor);
725}
726
727// ----------------------------------------------------------------------
728// Implement Array::Validate as inline visitor
729
730namespace internal {
731
732struct ValidateVisitor {
733 Status Visit(const NullArray&) { return Status::OK(); }
734
735 Status Visit(const PrimitiveArray& array) {
736 ARROW_RETURN_IF(array.data()->buffers.size() != 2,
737 Status::Invalid("number of buffers was != 2"));
738
739 ARROW_RETURN_IF(array.values() == nullptr, Status::Invalid("values was null"));
740
741 return Status::OK();
742 }
743
744 Status Visit(const Decimal128Array& array) {
745 if (array.data()->buffers.size() != 2) {
746 return Status::Invalid("number of buffers was != 2");
747 }
748 if (array.values() == nullptr) {
749 return Status::Invalid("values was null");
750 }
751 return Status::OK();
752 }
753
754 Status Visit(const BinaryArray& array) {
755 if (array.data()->buffers.size() != 3) {
756 return Status::Invalid("number of buffers was != 3");
757 }
758 return Status::OK();
759 }
760
761 Status Visit(const ListArray& array) {
762 if (array.length() < 0) {
763 return Status::Invalid("Length was negative");
764 }
765
766 auto value_offsets = array.value_offsets();
767 if (array.length() && !value_offsets) {
768 return Status::Invalid("value_offsets_ was null");
769 }
770 if (value_offsets->size() / static_cast<int>(sizeof(int32_t)) < array.length()) {
771 return Status::Invalid("offset buffer size (bytes): ", value_offsets->size(),
772 " isn't large enough for length: ", array.length());
773 }
774
775 if (!array.values()) {
776 return Status::Invalid("values was null");
777 }
778
779 const int32_t last_offset = array.value_offset(array.length());
780 if (array.values()->length() != last_offset) {
781 return Status::Invalid("Final offset invariant not equal to values length: ",
782 last_offset, "!=", array.values()->length());
783 }
784
785 const Status child_valid = ValidateArray(*array.values());
786 if (!child_valid.ok()) {
787 return Status::Invalid("Child array invalid: ", child_valid.ToString());
788 }
789
790 int32_t prev_offset = array.value_offset(0);
791 if (prev_offset != 0) {
792 return Status::Invalid("The first offset wasn't zero");
793 }
794 for (int64_t i = 1; i <= array.length(); ++i) {
795 int32_t current_offset = array.value_offset(i);
796 if (array.IsNull(i - 1) && current_offset != prev_offset) {
797 return Status::Invalid("Offset invariant failure at: ", i,
798 " inconsistent value_offsets for null slot",
799 current_offset, "!=", prev_offset);
800 }
801 if (current_offset < prev_offset) {
802 return Status::Invalid("Offset invariant failure: ", i,
803 " inconsistent offset for non-null slot: ", current_offset,
804 "<", prev_offset);
805 }
806 prev_offset = current_offset;
807 }
808 return Status::OK();
809 }
810
811 Status Visit(const StructArray& array) {
812 if (array.length() < 0) {
813 return Status::Invalid("Length was negative");
814 }
815
816 if (array.null_count() > array.length()) {
817 return Status::Invalid("Null count exceeds the length of this struct");
818 }
819
820 if (array.num_fields() > 0) {
821 // Validate fields
822 int64_t array_length = array.field(0)->length();
823 size_t idx = 0;
824 for (int i = 0; i < array.num_fields(); ++i) {
825 auto it = array.field(i);
826 if (it->length() != array_length) {
827 return Status::Invalid("Length is not equal from field ",
828 it->type()->ToString(), " at position [", idx, "]");
829 }
830
831 const Status child_valid = ValidateArray(*it);
832 if (!child_valid.ok()) {
833 return Status::Invalid("Child array invalid: ", child_valid.ToString(),
834 " at position [", idx, "}");
835 }
836 ++idx;
837 }
838
839 if (array_length > 0 && array_length != array.length()) {
840 return Status::Invalid("Struct's length is not equal to its child arrays");
841 }
842 }
843 return Status::OK();
844 }
845
846 Status Visit(const UnionArray& array) {
847 if (array.length() < 0) {
848 return Status::Invalid("Length was negative");
849 }
850
851 if (array.null_count() > array.length()) {
852 return Status::Invalid("Null count exceeds the length of this struct");
853 }
854 return Status::OK();
855 }
856
857 Status Visit(const DictionaryArray& array) {
858 Type::type index_type_id = array.indices()->type()->id();
859 if (!is_integer(index_type_id)) {
860 return Status::Invalid("Dictionary indices must be integer type");
861 }
862 return Status::OK();
863 }
864};
865
866} // namespace internal
867
868Status ValidateArray(const Array& array) {
869 internal::ValidateVisitor validate_visitor;
870 return VisitArrayInline(array, &validate_visitor);
871}
872
873// ----------------------------------------------------------------------
874// Loading from ArrayData
875
876namespace internal {
877
878class ArrayDataWrapper {
879 public:
880 ArrayDataWrapper(const std::shared_ptr<ArrayData>& data, std::shared_ptr<Array>* out)
881 : data_(data), out_(out) {}
882
883 template <typename T>
884 Status Visit(const T&) {
885 using ArrayType = typename TypeTraits<T>::ArrayType;
886 *out_ = std::make_shared<ArrayType>(data_);
887 return Status::OK();
888 }
889
890 const std::shared_ptr<ArrayData>& data_;
891 std::shared_ptr<Array>* out_;
892};
893
894} // namespace internal
895
896std::shared_ptr<Array> MakeArray(const std::shared_ptr<ArrayData>& data) {
897 std::shared_ptr<Array> out;
898 internal::ArrayDataWrapper wrapper_visitor(data, &out);
899 Status s = VisitTypeInline(*data->type, &wrapper_visitor);
900 DCHECK(s.ok());
901 DCHECK(out);
902 return out;
903}
904
905// ----------------------------------------------------------------------
906// Misc APIs
907
908namespace internal {
909
910std::vector<ArrayVector> RechunkArraysConsistently(
911 const std::vector<ArrayVector>& groups) {
912 if (groups.size() <= 1) {
913 return groups;
914 }
915 int64_t total_length = 0;
916 for (const auto& array : groups.front()) {
917 total_length += array->length();
918 }
919#ifndef NDEBUG
920 for (const auto& group : groups) {
921 int64_t group_length = 0;
922 for (const auto& array : group) {
923 group_length += array->length();
924 }
925 DCHECK_EQ(group_length, total_length)
926 << "Array groups should have the same total number of elements";
927 }
928#endif
929 if (total_length == 0) {
930 return groups;
931 }
932
933 // Set up result vectors
934 std::vector<ArrayVector> rechunked_groups(groups.size());
935
936 // Set up progress counters
937 std::vector<ArrayVector::const_iterator> current_arrays;
938 std::vector<int64_t> array_offsets;
939 for (const auto& group : groups) {
940 current_arrays.emplace_back(group.cbegin());
941 array_offsets.emplace_back(0);
942 }
943
944 // Scan all array vectors at once, rechunking along the way
945 int64_t start = 0;
946 while (start < total_length) {
947 // First compute max possible length for next chunk
948 int64_t chunk_length = std::numeric_limits<int64_t>::max();
949 for (size_t i = 0; i < groups.size(); i++) {
950 auto& arr_it = current_arrays[i];
951 auto& offset = array_offsets[i];
952 // Skip any done arrays (including 0-length arrays)
953 while (offset == (*arr_it)->length()) {
954 ++arr_it;
955 offset = 0;
956 }
957 const auto& array = *arr_it;
958 DCHECK_GT(array->length(), offset);
959 chunk_length = std::min(chunk_length, array->length() - offset);
960 }
961 DCHECK_GT(chunk_length, 0);
962
963 // Then slice all arrays along this chunk size
964 for (size_t i = 0; i < groups.size(); i++) {
965 const auto& array = *current_arrays[i];
966 auto& offset = array_offsets[i];
967 if (offset == 0 && array->length() == chunk_length) {
968 // Slice spans entire array
969 rechunked_groups[i].emplace_back(array);
970 } else {
971 DCHECK_LT(chunk_length - offset, array->length());
972 rechunked_groups[i].emplace_back(array->Slice(offset, chunk_length));
973 }
974 offset += chunk_length;
975 }
976 start += chunk_length;
977 }
978
979 return rechunked_groups;
980}
981
982} // namespace internal
983
984} // namespace arrow
985