1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#ifndef ARROW_ARRAY_H
19#define ARROW_ARRAY_H
20
21#include <cstdint>
22#include <iosfwd>
23#include <memory>
24#include <string>
25#include <type_traits>
26#include <utility>
27#include <vector>
28
29#include "arrow/buffer.h"
30#include "arrow/type.h"
31#include "arrow/type_traits.h"
32#include "arrow/util/bit-util.h"
33#include "arrow/util/checked_cast.h"
34#include "arrow/util/macros.h"
35#include "arrow/util/string_view.h"
36#include "arrow/util/visibility.h"
37
38namespace arrow {
39
// Forward declarations; the full definitions appear later in this header
// (Array) or in other headers (ArrayVisitor).
class Array;
class ArrayVisitor;

/// Convenience alias for a vector of shared Buffer pointers.
using BufferVector = std::vector<std::shared_ptr<Buffer>>;

// When slicing, we do not know the null count of the sliced range without
// doing some computation. To avoid doing this eagerly, we set the null count
// to -1 (any negative number will do). When Array::null_count is called the
// first time, the null count will be computed. See ARROW-33
constexpr int64_t kUnknownNullCount = -1;

// Forward declarations of types that are only used by pointer or as a return
// type in the declarations below.
class MemoryPool;
class Status;
54// ----------------------------------------------------------------------
55// Generic array data container
56
57/// \class ArrayData
58/// \brief Mutable container for generic Arrow array data
59///
60/// This data structure is a self-contained representation of the memory and
61/// metadata inside an Arrow array data structure (called vectors in Java). The
62/// classes arrow::Array and its subclasses provide strongly-typed accessors
63/// with support for the visitor pattern and other affordances.
64///
65/// This class is designed for easy internal data manipulation, analytical data
66/// processing, and data transport to and from IPC messages. For example, we
67/// could cast from int64 to float64 like so:
68///
69/// Int64Array arr = GetMyData();
70/// auto new_data = arr.data()->ShallowCopy();
71/// new_data->type = arrow::float64();
72/// DoubleArray double_arr(new_data);
73///
74/// This object is also useful in an analytics setting where memory may be
75/// reused. For example, if we had a group of operations all returning doubles,
76/// say:
77///
78/// Log(Sqrt(Expr(arr))
79///
80/// Then the low-level implementations of each of these functions could have
81/// the signatures
82///
83/// void Log(const ArrayData& values, ArrayData* out);
84///
85/// As another example a function may consume one or more memory buffers in an
86/// input array and replace them with newly-allocated data, changing the output
87/// data type as well.
88struct ARROW_EXPORT ArrayData {
89 ArrayData() : length(0), null_count(0), offset(0) {}
90
91 ArrayData(const std::shared_ptr<DataType>& type, int64_t length,
92 int64_t null_count = kUnknownNullCount, int64_t offset = 0)
93 : type(type), length(length), null_count(null_count), offset(offset) {}
94
95 ArrayData(const std::shared_ptr<DataType>& type, int64_t length,
96 const std::vector<std::shared_ptr<Buffer>>& buffers,
97 int64_t null_count = kUnknownNullCount, int64_t offset = 0)
98 : ArrayData(type, length, null_count, offset) {
99 this->buffers = buffers;
100 }
101
102 ArrayData(const std::shared_ptr<DataType>& type, int64_t length,
103 const std::vector<std::shared_ptr<Buffer>>& buffers,
104 const std::vector<std::shared_ptr<ArrayData>>& child_data,
105 int64_t null_count = kUnknownNullCount, int64_t offset = 0)
106 : ArrayData(type, length, null_count, offset) {
107 this->buffers = buffers;
108 this->child_data = child_data;
109 }
110
111 ArrayData(const std::shared_ptr<DataType>& type, int64_t length,
112 std::vector<std::shared_ptr<Buffer>>&& buffers,
113 int64_t null_count = kUnknownNullCount, int64_t offset = 0)
114 : ArrayData(type, length, null_count, offset) {
115 this->buffers = std::move(buffers);
116 }
117
118 static std::shared_ptr<ArrayData> Make(const std::shared_ptr<DataType>& type,
119 int64_t length,
120 std::vector<std::shared_ptr<Buffer>>&& buffers,
121 int64_t null_count = kUnknownNullCount,
122 int64_t offset = 0);
123
124 static std::shared_ptr<ArrayData> Make(
125 const std::shared_ptr<DataType>& type, int64_t length,
126 const std::vector<std::shared_ptr<Buffer>>& buffers,
127 int64_t null_count = kUnknownNullCount, int64_t offset = 0);
128
129 static std::shared_ptr<ArrayData> Make(
130 const std::shared_ptr<DataType>& type, int64_t length,
131 const std::vector<std::shared_ptr<Buffer>>& buffers,
132 const std::vector<std::shared_ptr<ArrayData>>& child_data,
133 int64_t null_count = kUnknownNullCount, int64_t offset = 0);
134
135 static std::shared_ptr<ArrayData> Make(const std::shared_ptr<DataType>& type,
136 int64_t length,
137 int64_t null_count = kUnknownNullCount,
138 int64_t offset = 0);
139
140 // Move constructor
141 ArrayData(ArrayData&& other) noexcept
142 : type(std::move(other.type)),
143 length(other.length),
144 null_count(other.null_count),
145 offset(other.offset),
146 buffers(std::move(other.buffers)),
147 child_data(std::move(other.child_data)) {}
148
149 ArrayData(const ArrayData& other) noexcept
150 : type(other.type),
151 length(other.length),
152 null_count(other.null_count),
153 offset(other.offset),
154 buffers(other.buffers),
155 child_data(other.child_data) {}
156
157 // Move assignment
158 ArrayData& operator=(ArrayData&& other) {
159 type = std::move(other.type);
160 length = other.length;
161 null_count = other.null_count;
162 offset = other.offset;
163 buffers = std::move(other.buffers);
164 child_data = std::move(other.child_data);
165 return *this;
166 }
167
168 std::shared_ptr<ArrayData> Copy() const { return std::make_shared<ArrayData>(*this); }
169
170 // Access a buffer's data as a typed C pointer
171 template <typename T>
172 inline const T* GetValues(int i, int64_t absolute_offset) const {
173 if (buffers[i]) {
174 return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
175 } else {
176 return NULLPTR;
177 }
178 }
179
180 template <typename T>
181 inline const T* GetValues(int i) const {
182 return GetValues<T>(i, offset);
183 }
184
185 // Access a buffer's data as a typed C pointer
186 template <typename T>
187 inline T* GetMutableValues(int i, int64_t absolute_offset) {
188 if (buffers[i]) {
189 return reinterpret_cast<T*>(buffers[i]->mutable_data()) + absolute_offset;
190 } else {
191 return NULLPTR;
192 }
193 }
194
195 template <typename T>
196 inline T* GetMutableValues(int i) {
197 return GetMutableValues<T>(i, offset);
198 }
199
200 std::shared_ptr<DataType> type;
201 int64_t length;
202 int64_t null_count;
203 // The logical start point into the physical buffers (in values, not bytes).
204 // Note that, for child data, this must be *added* to the child data's own offset.
205 int64_t offset;
206 std::vector<std::shared_ptr<Buffer>> buffers;
207 std::vector<std::shared_ptr<ArrayData>> child_data;
208};
209
/// \brief Create a strongly-typed Array instance from generic ArrayData
///
/// Only declared here; the definition lives elsewhere.
/// NOTE(review): presumably the concrete Array subclass is selected from
/// data->type -- confirm against the implementation.
///
/// \param[in] data the array contents
/// \return the resulting Array instance
ARROW_EXPORT
std::shared_ptr<Array> MakeArray(const std::shared_ptr<ArrayData>& data);
215
216// ----------------------------------------------------------------------
217// User array accessor types
218
219/// \brief Array base type
220/// Immutable data array with some logical type and some length.
221///
222/// Any memory is owned by the respective Buffer instance (or its parents).
223///
224/// The base class is only required to have a null bitmap buffer if the null
225/// count is greater than 0
226///
227/// If known, the null count can be provided in the base Array constructor. If
228/// the null count is not known, pass -1 to indicate that the null count is to
229/// be computed on the first call to null_count()
230class ARROW_EXPORT Array {
231 public:
232 virtual ~Array() = default;
233
234 /// \brief Return true if value at index is null. Does not boundscheck
235 bool IsNull(int64_t i) const {
236 return null_bitmap_data_ != NULLPTR &&
237 !BitUtil::GetBit(null_bitmap_data_, i + data_->offset);
238 }
239
240 /// \brief Return true if value at index is valid (not null). Does not
241 /// boundscheck
242 bool IsValid(int64_t i) const {
243 return null_bitmap_data_ == NULLPTR ||
244 BitUtil::GetBit(null_bitmap_data_, i + data_->offset);
245 }
246
247 /// Size in the number of elements this array contains.
248 int64_t length() const { return data_->length; }
249
250 /// A relative position into another array's data, to enable zero-copy
251 /// slicing. This value defaults to zero
252 int64_t offset() const { return data_->offset; }
253
254 /// The number of null entries in the array. If the null count was not known
255 /// at time of construction (and set to a negative value), then the null
256 /// count will be computed and cached on the first invocation of this
257 /// function
258 int64_t null_count() const;
259
260 std::shared_ptr<DataType> type() const { return data_->type; }
261 Type::type type_id() const { return data_->type->id(); }
262
263 /// Buffer for the null bitmap.
264 ///
265 /// Note that for `null_count == 0`, this can be null.
266 /// This buffer does not account for any slice offset
267 std::shared_ptr<Buffer> null_bitmap() const { return data_->buffers[0]; }
268
269 /// Raw pointer to the null bitmap.
270 ///
271 /// Note that for `null_count == 0`, this can be null.
272 /// This buffer does not account for any slice offset
273 const uint8_t* null_bitmap_data() const { return null_bitmap_data_; }
274
275 bool Equals(const Array& arr) const;
276 bool Equals(const std::shared_ptr<Array>& arr) const;
277
278 bool ApproxEquals(const std::shared_ptr<Array>& arr) const;
279 bool ApproxEquals(const Array& arr) const;
280
281 /// Compare if the range of slots specified are equal for the given array and
282 /// this array. end_idx exclusive. This methods does not bounds check.
283 bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
284 const std::shared_ptr<Array>& other) const;
285
286 bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
287 int64_t other_start_idx) const;
288
289 Status Accept(ArrayVisitor* visitor) const;
290
291 /// Construct a zero-copy slice of the array with the indicated offset and
292 /// length
293 ///
294 /// \param[in] offset the position of the first element in the constructed
295 /// slice
296 /// \param[in] length the length of the slice. If there are not enough
297 /// elements in the array, the length will be adjusted accordingly
298 ///
299 /// \return a new object wrapped in std::shared_ptr<Array>
300 std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const;
301
302 /// Slice from offset until end of the array
303 std::shared_ptr<Array> Slice(int64_t offset) const;
304
305 std::shared_ptr<ArrayData> data() const { return data_; }
306
307 int num_fields() const { return static_cast<int>(data_->child_data.size()); }
308
309 /// \return PrettyPrint representation of array suitable for debugging
310 std::string ToString() const;
311
312 protected:
313 Array() : null_bitmap_data_(NULLPTR) {}
314
315 std::shared_ptr<ArrayData> data_;
316 const uint8_t* null_bitmap_data_;
317
318 /// Protected method for constructors
319 inline void SetData(const std::shared_ptr<ArrayData>& data) {
320 if (data->buffers.size() > 0 && data->buffers[0]) {
321 null_bitmap_data_ = data->buffers[0]->data();
322 } else {
323 null_bitmap_data_ = NULLPTR;
324 }
325 data_ = data;
326 }
327
328 private:
329 ARROW_DISALLOW_COPY_AND_ASSIGN(Array);
330};
331
/// Convenience alias for a vector of shared Array pointers.
using ArrayVector = std::vector<std::shared_ptr<Array>>;

namespace internal {

/// Given a number of ArrayVectors, treat each ArrayVector as the
/// chunks of a chunked array. Then rechunk each ArrayVector such that
/// all ArrayVectors are chunked identically. It is mandatory that
/// all ArrayVectors contain the same total number of elements.
///
/// \return the rechunked ArrayVectors (one result vector is returned by value;
/// the input is taken by const reference)
ARROW_EXPORT
std::vector<ArrayVector> RechunkArraysConsistently(const std::vector<ArrayVector>&);

}  // namespace internal
344
345static inline std::ostream& operator<<(std::ostream& os, const Array& x) {
346 os << x.ToString();
347 return os;
348}
349
/// Base class for non-nested arrays
///
/// Declares no members of its own; the using-declaration below inherits the
/// constructors of Array.
class ARROW_EXPORT FlatArray : public Array {
 protected:
  using Array::Array;
};
355
356/// Degenerate null type Array
357class ARROW_EXPORT NullArray : public FlatArray {
358 public:
359 using TypeClass = NullType;
360
361 explicit NullArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
362 explicit NullArray(int64_t length);
363
364 private:
365 inline void SetData(const std::shared_ptr<ArrayData>& data) {
366 null_bitmap_data_ = NULLPTR;
367 data->null_count = data->length;
368 data_ = data;
369 }
370};
371
372/// Base class for arrays of fixed-size logical types
373class ARROW_EXPORT PrimitiveArray : public FlatArray {
374 public:
375 PrimitiveArray(const std::shared_ptr<DataType>& type, int64_t length,
376 const std::shared_ptr<Buffer>& data,
377 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
378 int64_t null_count = 0, int64_t offset = 0);
379
380 /// Does not account for any slice offset
381 std::shared_ptr<Buffer> values() const { return data_->buffers[1]; }
382
383 protected:
384 PrimitiveArray() : raw_values_(NULLPTR) {}
385
386 inline void SetData(const std::shared_ptr<ArrayData>& data) {
387 auto values = data->buffers[1];
388 this->Array::SetData(data);
389 raw_values_ = values == NULLPTR ? NULLPTR : values->data();
390 }
391
392 explicit inline PrimitiveArray(const std::shared_ptr<ArrayData>& data) {
393 SetData(data);
394 }
395
396 const uint8_t* raw_values_;
397};
398
399/// Concrete Array class for numeric data.
400template <typename TYPE>
401class NumericArray : public PrimitiveArray {
402 public:
403 using TypeClass = TYPE;
404 using value_type = typename TypeClass::c_type;
405
406 explicit NumericArray(const std::shared_ptr<ArrayData>& data) : PrimitiveArray(data) {}
407
408 // Only enable this constructor without a type argument for types without additional
409 // metadata
410 template <typename T1 = TYPE>
411 NumericArray(
412 typename std::enable_if<TypeTraits<T1>::is_parameter_free, int64_t>::type length,
413 const std::shared_ptr<Buffer>& data,
414 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, int64_t null_count = 0,
415 int64_t offset = 0)
416 : PrimitiveArray(TypeTraits<T1>::type_singleton(), length, data, null_bitmap,
417 null_count, offset) {}
418
419 const value_type* raw_values() const {
420 return reinterpret_cast<const value_type*>(raw_values_) + data_->offset;
421 }
422
423 value_type Value(int64_t i) const { return raw_values()[i]; }
424
425 // For API compatibility with BinaryArray etc.
426 value_type GetView(int64_t i) const { return Value(i); }
427
428 protected:
429 using PrimitiveArray::PrimitiveArray;
430};
431
432/// Concrete Array class for boolean data
433class ARROW_EXPORT BooleanArray : public PrimitiveArray {
434 public:
435 using TypeClass = BooleanType;
436
437 explicit BooleanArray(const std::shared_ptr<ArrayData>& data);
438
439 BooleanArray(int64_t length, const std::shared_ptr<Buffer>& data,
440 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
441 int64_t null_count = 0, int64_t offset = 0);
442
443 bool Value(int64_t i) const {
444 return BitUtil::GetBit(reinterpret_cast<const uint8_t*>(raw_values_),
445 i + data_->offset);
446 }
447
448 bool GetView(int64_t i) const { return Value(i); }
449
450 protected:
451 using PrimitiveArray::PrimitiveArray;
452};
453
454// ----------------------------------------------------------------------
455// ListArray
456
457/// Concrete Array class for list data
458class ARROW_EXPORT ListArray : public Array {
459 public:
460 using TypeClass = ListType;
461
462 explicit ListArray(const std::shared_ptr<ArrayData>& data);
463
464 ListArray(const std::shared_ptr<DataType>& type, int64_t length,
465 const std::shared_ptr<Buffer>& value_offsets,
466 const std::shared_ptr<Array>& values,
467 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, int64_t null_count = 0,
468 int64_t offset = 0);
469
470 /// \brief Construct ListArray from array of offsets and child value array
471 ///
472 /// This function does the bare minimum of validation of the offsets and
473 /// input types, and will allocate a new offsets array if necessary (i.e. if
474 /// the offsets contain any nulls). If the offsets do not have nulls, they
475 /// are assumed to be well-formed
476 ///
477 /// \param[in] offsets Array containing n + 1 offsets encoding length and
478 /// size. Must be of int32 type
479 /// \param[in] values Array containing
480 /// \param[in] pool MemoryPool in case new offsets array needs to be
481 /// allocated because of null values
482 /// \param[out] out Will have length equal to offsets.length() - 1
483 static Status FromArrays(const Array& offsets, const Array& values, MemoryPool* pool,
484 std::shared_ptr<Array>* out);
485
486 /// \brief Return array object containing the list's values
487 std::shared_ptr<Array> values() const;
488
489 /// Note that this buffer does not account for any slice offset
490 std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
491
492 std::shared_ptr<DataType> value_type() const;
493
494 /// Return pointer to raw value offsets accounting for any slice offset
495 const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; }
496
497 // Neither of these functions will perform boundschecking
498 int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; }
499 int32_t value_length(int64_t i) const {
500 i += data_->offset;
501 return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
502 }
503
504 protected:
505 void SetData(const std::shared_ptr<ArrayData>& data);
506 const int32_t* raw_value_offsets_;
507
508 private:
509 std::shared_ptr<Array> values_;
510};
511
512// ----------------------------------------------------------------------
513// Binary and String
514
515/// Concrete Array class for variable-size binary data
516class ARROW_EXPORT BinaryArray : public FlatArray {
517 public:
518 using TypeClass = BinaryType;
519
520 explicit BinaryArray(const std::shared_ptr<ArrayData>& data);
521
522 BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
523 const std::shared_ptr<Buffer>& data,
524 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
525 int64_t null_count = 0, int64_t offset = 0);
526
527 /// Return the pointer to the given elements bytes
528 // XXX should GetValue(int64_t i) return a string_view?
529 const uint8_t* GetValue(int64_t i, int32_t* out_length) const {
530 // Account for base offset
531 i += data_->offset;
532 const int32_t pos = raw_value_offsets_[i];
533 *out_length = raw_value_offsets_[i + 1] - pos;
534 return raw_data_ + pos;
535 }
536
537 /// \brief Get binary value as a string_view
538 ///
539 /// \param i the value index
540 /// \return the view over the selected value
541 util::string_view GetView(int64_t i) const {
542 // Account for base offset
543 i += data_->offset;
544 const int32_t pos = raw_value_offsets_[i];
545 return util::string_view(reinterpret_cast<const char*>(raw_data_ + pos),
546 raw_value_offsets_[i + 1] - pos);
547 }
548
549 /// \brief Get binary value as a std::string
550 ///
551 /// \param i the value index
552 /// \return the value copied into a std::string
553 std::string GetString(int64_t i) const { return std::string(GetView(i)); }
554
555 /// Note that this buffer does not account for any slice offset
556 std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
557
558 /// Note that this buffer does not account for any slice offset
559 std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; }
560
561 const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; }
562
563 // Neither of these functions will perform boundschecking
564 int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; }
565 int32_t value_length(int64_t i) const {
566 i += data_->offset;
567 return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
568 }
569
570 protected:
571 // For subclasses
572 BinaryArray() : raw_value_offsets_(NULLPTR), raw_data_(NULLPTR) {}
573
574 /// Protected method for constructors
575 void SetData(const std::shared_ptr<ArrayData>& data);
576
577 // Constructor that allows sub-classes/builders to propagate there logical type up the
578 // class hierarchy.
579 BinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
580 const std::shared_ptr<Buffer>& value_offsets,
581 const std::shared_ptr<Buffer>& data,
582 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
583 int64_t null_count = 0, int64_t offset = 0);
584
585 const int32_t* raw_value_offsets_;
586 const uint8_t* raw_data_;
587};
588
/// Concrete Array class for variable-size string (utf-8) data
///
/// Adds no accessors of its own; the element accessors are inherited from
/// BinaryArray. Both constructors are defined elsewhere.
class ARROW_EXPORT StringArray : public BinaryArray {
 public:
  using TypeClass = StringType;

  /// Construct from generic ArrayData
  explicit StringArray(const std::shared_ptr<ArrayData>& data);

  /// Construct from individual buffers; the parameter list matches the public
  /// BinaryArray constructor
  StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
              const std::shared_ptr<Buffer>& data,
              const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
              int64_t null_count = 0, int64_t offset = 0);
};
601
602// ----------------------------------------------------------------------
603// Fixed width binary
604
605/// Concrete Array class for fixed-size binary data
606class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
607 public:
608 using TypeClass = FixedSizeBinaryType;
609
610 explicit FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data);
611
612 FixedSizeBinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
613 const std::shared_ptr<Buffer>& data,
614 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
615 int64_t null_count = 0, int64_t offset = 0);
616
617 const uint8_t* GetValue(int64_t i) const;
618 const uint8_t* Value(int64_t i) const { return GetValue(i); }
619
620 util::string_view GetView(int64_t i) const {
621 return util::string_view(reinterpret_cast<const char*>(GetValue(i)), byte_width());
622 }
623
624 std::string GetString(int64_t i) const { return std::string(GetView(i)); }
625
626 int32_t byte_width() const { return byte_width_; }
627
628 const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; }
629
630 protected:
631 inline void SetData(const std::shared_ptr<ArrayData>& data) {
632 this->PrimitiveArray::SetData(data);
633 byte_width_ =
634 internal::checked_cast<const FixedSizeBinaryType&>(*type()).byte_width();
635 }
636
637 int32_t byte_width_;
638};
639
640// ----------------------------------------------------------------------
641// Decimal128Array
642
/// Concrete Array class for 128-bit decimal data
///
/// Builds on FixedSizeBinaryArray and inherits its constructors through the
/// using-declaration below.
class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray {
 public:
  using TypeClass = Decimal128Type;

  using FixedSizeBinaryArray::FixedSizeBinaryArray;

  /// \brief Construct Decimal128Array from ArrayData instance
  explicit Decimal128Array(const std::shared_ptr<ArrayData>& data);

  /// Return a string for the value at index i (defined elsewhere).
  /// NOTE(review): presumably a decimal rendering that honors the type's
  /// scale -- confirm against the implementation.
  std::string FormatValue(int64_t i) const;
};

// Backward compatibility: older name for Decimal128Array
using DecimalArray = Decimal128Array;
658
659// ----------------------------------------------------------------------
660// Struct
661
/// Concrete Array class for struct data
///
/// All member functions are only declared here and defined elsewhere.
class ARROW_EXPORT StructArray : public Array {
 public:
  using TypeClass = StructType;

  /// Construct from generic ArrayData
  explicit StructArray(const std::shared_ptr<ArrayData>& data);

  /// Construct from a type, a length and one child array per field. Note that
  /// null_bitmap is taken by value here.
  StructArray(const std::shared_ptr<DataType>& type, int64_t length,
              const std::vector<std::shared_ptr<Array>>& children,
              std::shared_ptr<Buffer> null_bitmap = NULLPTR, int64_t null_count = 0,
              int64_t offset = 0);

  /// The array's type as a StructType pointer
  const StructType* struct_type() const;

  // Return a shared pointer in case the requestor desires to share ownership
  // with this array.  The returned array has its offset, length and null
  // count adjusted.
  std::shared_ptr<Array> field(int pos) const;

  /// Returns null if name not found
  std::shared_ptr<Array> GetFieldByName(const std::string& name) const;

  /// \brief Flatten this array as a vector of arrays, one for each field
  ///
  /// \param[in] pool The pool to allocate null bitmaps from, if necessary
  /// \param[out] out The resulting vector of arrays
  Status Flatten(MemoryPool* pool, ArrayVector* out) const;

 private:
  // For caching boxed child data. Declared mutable so that const member
  // functions can fill it (presumably field() does so lazily -- TODO confirm).
  mutable std::vector<std::shared_ptr<Array>> boxed_fields_;
};
694
695// ----------------------------------------------------------------------
696// Union
697
698/// Concrete Array class for union data
699class ARROW_EXPORT UnionArray : public Array {
700 public:
701 using TypeClass = UnionType;
702 using type_id_t = uint8_t;
703
704 explicit UnionArray(const std::shared_ptr<ArrayData>& data);
705
706 UnionArray(const std::shared_ptr<DataType>& type, int64_t length,
707 const std::vector<std::shared_ptr<Array>>& children,
708 const std::shared_ptr<Buffer>& type_ids,
709 const std::shared_ptr<Buffer>& value_offsets = NULLPTR,
710 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, int64_t null_count = 0,
711 int64_t offset = 0);
712
713 /// \brief Construct Dense UnionArray from types_ids, value_offsets and children
714 ///
715 /// This function does the bare minimum of validation of the offsets and
716 /// input types. The value_offsets are assumed to be well-formed.
717 ///
718 /// \param[in] type_ids An array of 8-bit signed integers, enumerated from
719 /// 0 corresponding to each type.
720 /// \param[in] value_offsets An array of signed int32 values indicating the
721 /// relative offset into the respective child array for the type in a given slot.
722 /// The respective offsets for each child value array must be in order / increasing.
723 /// \param[in] children Vector of children Arrays containing the data for each type.
724 /// \param[out] out Will have length equal to value_offsets.length()
725 static Status MakeDense(const Array& type_ids, const Array& value_offsets,
726 const std::vector<std::shared_ptr<Array>>& children,
727 std::shared_ptr<Array>* out);
728
729 /// \brief Construct Sparse UnionArray from type_ids and children
730 ///
731 /// This function does the bare minimum of validation of the offsets and
732 /// input types.
733 ///
734 /// \param[in] type_ids An array of 8-bit signed integers, enumerated from
735 /// 0 corresponding to each type.
736 /// \param[in] children Vector of children Arrays containing the data for each type.
737 /// \param[out] out Will have length equal to type_ids.length()
738 static Status MakeSparse(const Array& type_ids,
739 const std::vector<std::shared_ptr<Array>>& children,
740 std::shared_ptr<Array>* out);
741
742 /// Note that this buffer does not account for any slice offset
743 std::shared_ptr<Buffer> type_ids() const { return data_->buffers[1]; }
744
745 /// Note that this buffer does not account for any slice offset
746 std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[2]; }
747
748 int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; }
749
750 const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; }
751 const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; }
752
753 UnionMode::type mode() const {
754 return internal::checked_cast<const UnionType&>(*type()).mode();
755 }
756
757 // Return the given field as an individual array.
758 // For sparse unions, the returned array has its offset, length and null
759 // count adjusted.
760 // For dense unions, the returned array is unchanged.
761 std::shared_ptr<Array> child(int pos) const;
762
763 /// Only use this while the UnionArray is in scope
764 const Array* UnsafeChild(int pos) const;
765
766 protected:
767 void SetData(const std::shared_ptr<ArrayData>& data);
768
769 const type_id_t* raw_type_ids_;
770 const int32_t* raw_value_offsets_;
771
772 // For caching boxed child data
773 mutable std::vector<std::shared_ptr<Array>> boxed_fields_;
774};
775
776// ----------------------------------------------------------------------
777// DictionaryArray (categorical and dictionary-encoded in memory)
778
/// \brief Concrete Array class for dictionary data
///
/// A dictionary array contains an array of non-negative integers (the
/// "dictionary indices") along with a data type containing a "dictionary"
/// corresponding to the distinct values represented in the data.
///
/// For example, the array
///
///   ["foo", "bar", "foo", "bar", "foo", "bar"]
///
/// with dictionary ["bar", "foo"], would have dictionary array representation
///
///   indices: [1, 0, 1, 0, 1, 0]
///   dictionary: ["bar", "foo"]
///
/// The indices in principle may have any integer type (signed or unsigned),
/// though presently data in IPC exchanges must be signed int32.
class ARROW_EXPORT DictionaryArray : public Array {
 public:
  using TypeClass = DictionaryType;

  /// Construct from generic ArrayData
  explicit DictionaryArray(const std::shared_ptr<ArrayData>& data);

  /// Construct from a dictionary type and an array of indices
  DictionaryArray(const std::shared_ptr<DataType>& type,
                  const std::shared_ptr<Array>& indices);

  /// \brief Construct DictionaryArray from dictionary data type and indices array
  ///
  /// This function does the validation of the indices and input type. It checks if
  /// all indices are non-negative and smaller than the size of the dictionary
  ///
  /// \param[in] type a dictionary type
  /// \param[in] indices an array of non-negative signed
  /// integers smaller than the size of the dictionary
  /// \param[out] out the resulting DictionaryArray instance
  static Status FromArrays(const std::shared_ptr<DataType>& type,
                           const std::shared_ptr<Array>& indices,
                           std::shared_ptr<Array>* out);

  /// \brief Transpose this DictionaryArray
  ///
  /// This method constructs a new dictionary array with the given dictionary type,
  /// transposing indices using the transpose map.
  /// The type and the transpose map are typically computed using
  /// DictionaryType::Unify.
  ///
  /// \param[in] pool a pool to allocate the array data from
  /// \param[in] type a dictionary type
  /// \param[in] transpose_map a vector transposing this array's indices
  /// into the target array's indices
  /// \param[out] out the resulting DictionaryArray instance
  Status Transpose(MemoryPool* pool, const std::shared_ptr<DataType>& type,
                   const std::vector<int32_t>& transpose_map,
                   std::shared_ptr<Array>* out) const;
  // XXX Do we also want an unsafe in-place Transpose?

  /// The array of dictionary indices, and the dictionary of distinct values
  /// (both defined elsewhere)
  std::shared_ptr<Array> indices() const;
  std::shared_ptr<Array> dictionary() const;

  /// The cached DictionaryType pointer (dict_type_)
  const DictionaryType* dict_type() const { return dict_type_; }

 private:
  void SetData(const std::shared_ptr<ArrayData>& data);

  // Non-owning pointer to the array's type as a DictionaryType.
  // NOTE(review): presumably set by SetData from data->type -- confirm.
  const DictionaryType* dict_type_;
  std::shared_ptr<Array> indices_;
};
846
/// \brief Perform any validation checks to determine obvious inconsistencies
/// with the array's internal data
///
/// This can be an expensive check. Only declared here; the definition lives
/// elsewhere.
///
/// \param array an Array instance
/// \return Status
ARROW_EXPORT
Status ValidateArray(const Array& array);
856
857} // namespace arrow
858
859#endif // ARROW_ARRAY_H
860