1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef ARROW_ARRAY_H |
19 | #define ARROW_ARRAY_H |
20 | |
21 | #include <cstdint> |
22 | #include <iosfwd> |
23 | #include <memory> |
24 | #include <string> |
25 | #include <type_traits> |
26 | #include <utility> |
27 | #include <vector> |
28 | |
29 | #include "arrow/buffer.h" |
30 | #include "arrow/type.h" |
31 | #include "arrow/type_traits.h" |
32 | #include "arrow/util/bit-util.h" |
33 | #include "arrow/util/checked_cast.h" |
34 | #include "arrow/util/macros.h" |
35 | #include "arrow/util/string_view.h" |
36 | #include "arrow/util/visibility.h" |
37 | |
38 | namespace arrow { |
39 | |
// Forward declarations so that the declarations below can name these types
// before their full definitions are seen.
class Array;
class ArrayVisitor;

/// Shorthand for a vector of shared Buffer pointers
using BufferVector = std::vector<std::shared_ptr<Buffer>>;

// When slicing, we do not know the null count of the sliced range without
// doing some computation. To avoid doing this eagerly, we set the null count
// to -1 (any negative number will do). When Array::null_count is called the
// first time, the null count will be computed. See ARROW-33
constexpr int64_t kUnknownNullCount = -1;

// Forward declarations; in the declarations below MemoryPool is only used
// through pointers and Status only as a return type.
class MemoryPool;
class Status;
53 | |
54 | // ---------------------------------------------------------------------- |
55 | // Generic array data container |
56 | |
57 | /// \class ArrayData |
58 | /// \brief Mutable container for generic Arrow array data |
59 | /// |
60 | /// This data structure is a self-contained representation of the memory and |
61 | /// metadata inside an Arrow array data structure (called vectors in Java). The |
62 | /// classes arrow::Array and its subclasses provide strongly-typed accessors |
63 | /// with support for the visitor pattern and other affordances. |
64 | /// |
65 | /// This class is designed for easy internal data manipulation, analytical data |
66 | /// processing, and data transport to and from IPC messages. For example, we |
67 | /// could cast from int64 to float64 like so: |
68 | /// |
69 | /// Int64Array arr = GetMyData(); |
70 | /// auto new_data = arr.data()->ShallowCopy(); |
71 | /// new_data->type = arrow::float64(); |
72 | /// DoubleArray double_arr(new_data); |
73 | /// |
74 | /// This object is also useful in an analytics setting where memory may be |
75 | /// reused. For example, if we had a group of operations all returning doubles, |
76 | /// say: |
77 | /// |
78 | /// Log(Sqrt(Expr(arr)) |
79 | /// |
80 | /// Then the low-level implementations of each of these functions could have |
81 | /// the signatures |
82 | /// |
83 | /// void Log(const ArrayData& values, ArrayData* out); |
84 | /// |
85 | /// As another example a function may consume one or more memory buffers in an |
86 | /// input array and replace them with newly-allocated data, changing the output |
87 | /// data type as well. |
88 | struct ARROW_EXPORT ArrayData { |
89 | ArrayData() : length(0), null_count(0), offset(0) {} |
90 | |
91 | ArrayData(const std::shared_ptr<DataType>& type, int64_t length, |
92 | int64_t null_count = kUnknownNullCount, int64_t offset = 0) |
93 | : type(type), length(length), null_count(null_count), offset(offset) {} |
94 | |
95 | ArrayData(const std::shared_ptr<DataType>& type, int64_t length, |
96 | const std::vector<std::shared_ptr<Buffer>>& buffers, |
97 | int64_t null_count = kUnknownNullCount, int64_t offset = 0) |
98 | : ArrayData(type, length, null_count, offset) { |
99 | this->buffers = buffers; |
100 | } |
101 | |
102 | ArrayData(const std::shared_ptr<DataType>& type, int64_t length, |
103 | const std::vector<std::shared_ptr<Buffer>>& buffers, |
104 | const std::vector<std::shared_ptr<ArrayData>>& child_data, |
105 | int64_t null_count = kUnknownNullCount, int64_t offset = 0) |
106 | : ArrayData(type, length, null_count, offset) { |
107 | this->buffers = buffers; |
108 | this->child_data = child_data; |
109 | } |
110 | |
111 | ArrayData(const std::shared_ptr<DataType>& type, int64_t length, |
112 | std::vector<std::shared_ptr<Buffer>>&& buffers, |
113 | int64_t null_count = kUnknownNullCount, int64_t offset = 0) |
114 | : ArrayData(type, length, null_count, offset) { |
115 | this->buffers = std::move(buffers); |
116 | } |
117 | |
118 | static std::shared_ptr<ArrayData> Make(const std::shared_ptr<DataType>& type, |
119 | int64_t length, |
120 | std::vector<std::shared_ptr<Buffer>>&& buffers, |
121 | int64_t null_count = kUnknownNullCount, |
122 | int64_t offset = 0); |
123 | |
124 | static std::shared_ptr<ArrayData> Make( |
125 | const std::shared_ptr<DataType>& type, int64_t length, |
126 | const std::vector<std::shared_ptr<Buffer>>& buffers, |
127 | int64_t null_count = kUnknownNullCount, int64_t offset = 0); |
128 | |
129 | static std::shared_ptr<ArrayData> Make( |
130 | const std::shared_ptr<DataType>& type, int64_t length, |
131 | const std::vector<std::shared_ptr<Buffer>>& buffers, |
132 | const std::vector<std::shared_ptr<ArrayData>>& child_data, |
133 | int64_t null_count = kUnknownNullCount, int64_t offset = 0); |
134 | |
135 | static std::shared_ptr<ArrayData> Make(const std::shared_ptr<DataType>& type, |
136 | int64_t length, |
137 | int64_t null_count = kUnknownNullCount, |
138 | int64_t offset = 0); |
139 | |
140 | // Move constructor |
141 | ArrayData(ArrayData&& other) noexcept |
142 | : type(std::move(other.type)), |
143 | length(other.length), |
144 | null_count(other.null_count), |
145 | offset(other.offset), |
146 | buffers(std::move(other.buffers)), |
147 | child_data(std::move(other.child_data)) {} |
148 | |
149 | ArrayData(const ArrayData& other) noexcept |
150 | : type(other.type), |
151 | length(other.length), |
152 | null_count(other.null_count), |
153 | offset(other.offset), |
154 | buffers(other.buffers), |
155 | child_data(other.child_data) {} |
156 | |
157 | // Move assignment |
158 | ArrayData& operator=(ArrayData&& other) { |
159 | type = std::move(other.type); |
160 | length = other.length; |
161 | null_count = other.null_count; |
162 | offset = other.offset; |
163 | buffers = std::move(other.buffers); |
164 | child_data = std::move(other.child_data); |
165 | return *this; |
166 | } |
167 | |
168 | std::shared_ptr<ArrayData> Copy() const { return std::make_shared<ArrayData>(*this); } |
169 | |
170 | // Access a buffer's data as a typed C pointer |
171 | template <typename T> |
172 | inline const T* GetValues(int i, int64_t absolute_offset) const { |
173 | if (buffers[i]) { |
174 | return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset; |
175 | } else { |
176 | return NULLPTR; |
177 | } |
178 | } |
179 | |
180 | template <typename T> |
181 | inline const T* GetValues(int i) const { |
182 | return GetValues<T>(i, offset); |
183 | } |
184 | |
185 | // Access a buffer's data as a typed C pointer |
186 | template <typename T> |
187 | inline T* GetMutableValues(int i, int64_t absolute_offset) { |
188 | if (buffers[i]) { |
189 | return reinterpret_cast<T*>(buffers[i]->mutable_data()) + absolute_offset; |
190 | } else { |
191 | return NULLPTR; |
192 | } |
193 | } |
194 | |
195 | template <typename T> |
196 | inline T* GetMutableValues(int i) { |
197 | return GetMutableValues<T>(i, offset); |
198 | } |
199 | |
200 | std::shared_ptr<DataType> type; |
201 | int64_t length; |
202 | int64_t null_count; |
203 | // The logical start point into the physical buffers (in values, not bytes). |
204 | // Note that, for child data, this must be *added* to the child data's own offset. |
205 | int64_t offset; |
206 | std::vector<std::shared_ptr<Buffer>> buffers; |
207 | std::vector<std::shared_ptr<ArrayData>> child_data; |
208 | }; |
209 | |
/// \brief Create a strongly-typed Array instance from generic ArrayData
///
/// NOTE(review): the concrete Array subclass is presumably chosen from
/// data->type; confirm against the implementation.
///
/// \param[in] data the array contents
/// \return the resulting Array instance
ARROW_EXPORT
std::shared_ptr<Array> MakeArray(const std::shared_ptr<ArrayData>& data);
215 | |
216 | // ---------------------------------------------------------------------- |
217 | // User array accessor types |
218 | |
219 | /// \brief Array base type |
220 | /// Immutable data array with some logical type and some length. |
221 | /// |
222 | /// Any memory is owned by the respective Buffer instance (or its parents). |
223 | /// |
224 | /// The base class is only required to have a null bitmap buffer if the null |
225 | /// count is greater than 0 |
226 | /// |
227 | /// If known, the null count can be provided in the base Array constructor. If |
228 | /// the null count is not known, pass -1 to indicate that the null count is to |
229 | /// be computed on the first call to null_count() |
230 | class ARROW_EXPORT Array { |
231 | public: |
232 | virtual ~Array() = default; |
233 | |
234 | /// \brief Return true if value at index is null. Does not boundscheck |
235 | bool IsNull(int64_t i) const { |
236 | return null_bitmap_data_ != NULLPTR && |
237 | !BitUtil::GetBit(null_bitmap_data_, i + data_->offset); |
238 | } |
239 | |
240 | /// \brief Return true if value at index is valid (not null). Does not |
241 | /// boundscheck |
242 | bool IsValid(int64_t i) const { |
243 | return null_bitmap_data_ == NULLPTR || |
244 | BitUtil::GetBit(null_bitmap_data_, i + data_->offset); |
245 | } |
246 | |
247 | /// Size in the number of elements this array contains. |
248 | int64_t length() const { return data_->length; } |
249 | |
250 | /// A relative position into another array's data, to enable zero-copy |
251 | /// slicing. This value defaults to zero |
252 | int64_t offset() const { return data_->offset; } |
253 | |
254 | /// The number of null entries in the array. If the null count was not known |
255 | /// at time of construction (and set to a negative value), then the null |
256 | /// count will be computed and cached on the first invocation of this |
257 | /// function |
258 | int64_t null_count() const; |
259 | |
260 | std::shared_ptr<DataType> type() const { return data_->type; } |
261 | Type::type type_id() const { return data_->type->id(); } |
262 | |
263 | /// Buffer for the null bitmap. |
264 | /// |
265 | /// Note that for `null_count == 0`, this can be null. |
266 | /// This buffer does not account for any slice offset |
267 | std::shared_ptr<Buffer> null_bitmap() const { return data_->buffers[0]; } |
268 | |
269 | /// Raw pointer to the null bitmap. |
270 | /// |
271 | /// Note that for `null_count == 0`, this can be null. |
272 | /// This buffer does not account for any slice offset |
273 | const uint8_t* null_bitmap_data() const { return null_bitmap_data_; } |
274 | |
275 | bool Equals(const Array& arr) const; |
276 | bool Equals(const std::shared_ptr<Array>& arr) const; |
277 | |
278 | bool ApproxEquals(const std::shared_ptr<Array>& arr) const; |
279 | bool ApproxEquals(const Array& arr) const; |
280 | |
281 | /// Compare if the range of slots specified are equal for the given array and |
282 | /// this array. end_idx exclusive. This methods does not bounds check. |
283 | bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx, |
284 | const std::shared_ptr<Array>& other) const; |
285 | |
286 | bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx, |
287 | int64_t other_start_idx) const; |
288 | |
289 | Status Accept(ArrayVisitor* visitor) const; |
290 | |
291 | /// Construct a zero-copy slice of the array with the indicated offset and |
292 | /// length |
293 | /// |
294 | /// \param[in] offset the position of the first element in the constructed |
295 | /// slice |
296 | /// \param[in] length the length of the slice. If there are not enough |
297 | /// elements in the array, the length will be adjusted accordingly |
298 | /// |
299 | /// \return a new object wrapped in std::shared_ptr<Array> |
300 | std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const; |
301 | |
302 | /// Slice from offset until end of the array |
303 | std::shared_ptr<Array> Slice(int64_t offset) const; |
304 | |
305 | std::shared_ptr<ArrayData> data() const { return data_; } |
306 | |
307 | int num_fields() const { return static_cast<int>(data_->child_data.size()); } |
308 | |
309 | /// \return PrettyPrint representation of array suitable for debugging |
310 | std::string ToString() const; |
311 | |
312 | protected: |
313 | Array() : null_bitmap_data_(NULLPTR) {} |
314 | |
315 | std::shared_ptr<ArrayData> data_; |
316 | const uint8_t* null_bitmap_data_; |
317 | |
318 | /// Protected method for constructors |
319 | inline void SetData(const std::shared_ptr<ArrayData>& data) { |
320 | if (data->buffers.size() > 0 && data->buffers[0]) { |
321 | null_bitmap_data_ = data->buffers[0]->data(); |
322 | } else { |
323 | null_bitmap_data_ = NULLPTR; |
324 | } |
325 | data_ = data; |
326 | } |
327 | |
328 | private: |
329 | ARROW_DISALLOW_COPY_AND_ASSIGN(Array); |
330 | }; |
331 | |
/// Shorthand for a vector of shared Array pointers
using ArrayVector = std::vector<std::shared_ptr<Array>>;

namespace internal {

/// Given a number of ArrayVectors, treat each ArrayVector as the
/// chunks of a chunked array.  Then rechunk each ArrayVector such that
/// all ArrayVectors are chunked identically.  It is mandatory that
/// all ArrayVectors contain the same total number of elements.
///
/// \return the rechunked ArrayVectors (presumably one per input ArrayVector,
/// in the same order -- confirm against the implementation)
ARROW_EXPORT
std::vector<ArrayVector> RechunkArraysConsistently(const std::vector<ArrayVector>&);

}  // namespace internal
344 | |
345 | static inline std::ostream& operator<<(std::ostream& os, const Array& x) { |
346 | os << x.ToString(); |
347 | return os; |
348 | } |
349 | |
/// Base class for non-nested arrays
///
/// Adds no members of its own; it only serves as a common base for the
/// array classes below that derive from it.
class ARROW_EXPORT FlatArray : public Array {
 protected:
  using Array::Array;
};
355 | |
356 | /// Degenerate null type Array |
357 | class ARROW_EXPORT NullArray : public FlatArray { |
358 | public: |
359 | using TypeClass = NullType; |
360 | |
361 | explicit NullArray(const std::shared_ptr<ArrayData>& data) { SetData(data); } |
362 | explicit NullArray(int64_t length); |
363 | |
364 | private: |
365 | inline void SetData(const std::shared_ptr<ArrayData>& data) { |
366 | null_bitmap_data_ = NULLPTR; |
367 | data->null_count = data->length; |
368 | data_ = data; |
369 | } |
370 | }; |
371 | |
372 | /// Base class for arrays of fixed-size logical types |
373 | class ARROW_EXPORT PrimitiveArray : public FlatArray { |
374 | public: |
375 | PrimitiveArray(const std::shared_ptr<DataType>& type, int64_t length, |
376 | const std::shared_ptr<Buffer>& data, |
377 | const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, |
378 | int64_t null_count = 0, int64_t offset = 0); |
379 | |
380 | /// Does not account for any slice offset |
381 | std::shared_ptr<Buffer> values() const { return data_->buffers[1]; } |
382 | |
383 | protected: |
384 | PrimitiveArray() : raw_values_(NULLPTR) {} |
385 | |
386 | inline void SetData(const std::shared_ptr<ArrayData>& data) { |
387 | auto values = data->buffers[1]; |
388 | this->Array::SetData(data); |
389 | raw_values_ = values == NULLPTR ? NULLPTR : values->data(); |
390 | } |
391 | |
392 | explicit inline PrimitiveArray(const std::shared_ptr<ArrayData>& data) { |
393 | SetData(data); |
394 | } |
395 | |
396 | const uint8_t* raw_values_; |
397 | }; |
398 | |
399 | /// Concrete Array class for numeric data. |
400 | template <typename TYPE> |
401 | class NumericArray : public PrimitiveArray { |
402 | public: |
403 | using TypeClass = TYPE; |
404 | using value_type = typename TypeClass::c_type; |
405 | |
406 | explicit NumericArray(const std::shared_ptr<ArrayData>& data) : PrimitiveArray(data) {} |
407 | |
408 | // Only enable this constructor without a type argument for types without additional |
409 | // metadata |
410 | template <typename T1 = TYPE> |
411 | NumericArray( |
412 | typename std::enable_if<TypeTraits<T1>::is_parameter_free, int64_t>::type length, |
413 | const std::shared_ptr<Buffer>& data, |
414 | const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, int64_t null_count = 0, |
415 | int64_t offset = 0) |
416 | : PrimitiveArray(TypeTraits<T1>::type_singleton(), length, data, null_bitmap, |
417 | null_count, offset) {} |
418 | |
419 | const value_type* raw_values() const { |
420 | return reinterpret_cast<const value_type*>(raw_values_) + data_->offset; |
421 | } |
422 | |
423 | value_type Value(int64_t i) const { return raw_values()[i]; } |
424 | |
425 | // For API compatibility with BinaryArray etc. |
426 | value_type GetView(int64_t i) const { return Value(i); } |
427 | |
428 | protected: |
429 | using PrimitiveArray::PrimitiveArray; |
430 | }; |
431 | |
432 | /// Concrete Array class for boolean data |
433 | class ARROW_EXPORT BooleanArray : public PrimitiveArray { |
434 | public: |
435 | using TypeClass = BooleanType; |
436 | |
437 | explicit BooleanArray(const std::shared_ptr<ArrayData>& data); |
438 | |
439 | BooleanArray(int64_t length, const std::shared_ptr<Buffer>& data, |
440 | const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, |
441 | int64_t null_count = 0, int64_t offset = 0); |
442 | |
443 | bool Value(int64_t i) const { |
444 | return BitUtil::GetBit(reinterpret_cast<const uint8_t*>(raw_values_), |
445 | i + data_->offset); |
446 | } |
447 | |
448 | bool GetView(int64_t i) const { return Value(i); } |
449 | |
450 | protected: |
451 | using PrimitiveArray::PrimitiveArray; |
452 | }; |
453 | |
454 | // ---------------------------------------------------------------------- |
455 | // ListArray |
456 | |
457 | /// Concrete Array class for list data |
458 | class ARROW_EXPORT ListArray : public Array { |
459 | public: |
460 | using TypeClass = ListType; |
461 | |
462 | explicit ListArray(const std::shared_ptr<ArrayData>& data); |
463 | |
464 | ListArray(const std::shared_ptr<DataType>& type, int64_t length, |
465 | const std::shared_ptr<Buffer>& value_offsets, |
466 | const std::shared_ptr<Array>& values, |
467 | const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, int64_t null_count = 0, |
468 | int64_t offset = 0); |
469 | |
470 | /// \brief Construct ListArray from array of offsets and child value array |
471 | /// |
472 | /// This function does the bare minimum of validation of the offsets and |
473 | /// input types, and will allocate a new offsets array if necessary (i.e. if |
474 | /// the offsets contain any nulls). If the offsets do not have nulls, they |
475 | /// are assumed to be well-formed |
476 | /// |
477 | /// \param[in] offsets Array containing n + 1 offsets encoding length and |
478 | /// size. Must be of int32 type |
479 | /// \param[in] values Array containing |
480 | /// \param[in] pool MemoryPool in case new offsets array needs to be |
481 | /// allocated because of null values |
482 | /// \param[out] out Will have length equal to offsets.length() - 1 |
483 | static Status FromArrays(const Array& offsets, const Array& values, MemoryPool* pool, |
484 | std::shared_ptr<Array>* out); |
485 | |
486 | /// \brief Return array object containing the list's values |
487 | std::shared_ptr<Array> values() const; |
488 | |
489 | /// Note that this buffer does not account for any slice offset |
490 | std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; } |
491 | |
492 | std::shared_ptr<DataType> value_type() const; |
493 | |
494 | /// Return pointer to raw value offsets accounting for any slice offset |
495 | const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } |
496 | |
497 | // Neither of these functions will perform boundschecking |
498 | int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } |
499 | int32_t value_length(int64_t i) const { |
500 | i += data_->offset; |
501 | return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; |
502 | } |
503 | |
504 | protected: |
505 | void SetData(const std::shared_ptr<ArrayData>& data); |
506 | const int32_t* raw_value_offsets_; |
507 | |
508 | private: |
509 | std::shared_ptr<Array> values_; |
510 | }; |
511 | |
512 | // ---------------------------------------------------------------------- |
513 | // Binary and String |
514 | |
515 | /// Concrete Array class for variable-size binary data |
516 | class ARROW_EXPORT BinaryArray : public FlatArray { |
517 | public: |
518 | using TypeClass = BinaryType; |
519 | |
520 | explicit BinaryArray(const std::shared_ptr<ArrayData>& data); |
521 | |
522 | BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets, |
523 | const std::shared_ptr<Buffer>& data, |
524 | const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, |
525 | int64_t null_count = 0, int64_t offset = 0); |
526 | |
527 | /// Return the pointer to the given elements bytes |
528 | // XXX should GetValue(int64_t i) return a string_view? |
529 | const uint8_t* GetValue(int64_t i, int32_t* out_length) const { |
530 | // Account for base offset |
531 | i += data_->offset; |
532 | const int32_t pos = raw_value_offsets_[i]; |
533 | *out_length = raw_value_offsets_[i + 1] - pos; |
534 | return raw_data_ + pos; |
535 | } |
536 | |
537 | /// \brief Get binary value as a string_view |
538 | /// |
539 | /// \param i the value index |
540 | /// \return the view over the selected value |
541 | util::string_view GetView(int64_t i) const { |
542 | // Account for base offset |
543 | i += data_->offset; |
544 | const int32_t pos = raw_value_offsets_[i]; |
545 | return util::string_view(reinterpret_cast<const char*>(raw_data_ + pos), |
546 | raw_value_offsets_[i + 1] - pos); |
547 | } |
548 | |
549 | /// \brief Get binary value as a std::string |
550 | /// |
551 | /// \param i the value index |
552 | /// \return the value copied into a std::string |
553 | std::string GetString(int64_t i) const { return std::string(GetView(i)); } |
554 | |
555 | /// Note that this buffer does not account for any slice offset |
556 | std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; } |
557 | |
558 | /// Note that this buffer does not account for any slice offset |
559 | std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; } |
560 | |
561 | const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } |
562 | |
563 | // Neither of these functions will perform boundschecking |
564 | int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } |
565 | int32_t value_length(int64_t i) const { |
566 | i += data_->offset; |
567 | return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; |
568 | } |
569 | |
570 | protected: |
571 | // For subclasses |
572 | BinaryArray() : raw_value_offsets_(NULLPTR), raw_data_(NULLPTR) {} |
573 | |
574 | /// Protected method for constructors |
575 | void SetData(const std::shared_ptr<ArrayData>& data); |
576 | |
577 | // Constructor that allows sub-classes/builders to propagate there logical type up the |
578 | // class hierarchy. |
579 | BinaryArray(const std::shared_ptr<DataType>& type, int64_t length, |
580 | const std::shared_ptr<Buffer>& value_offsets, |
581 | const std::shared_ptr<Buffer>& data, |
582 | const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, |
583 | int64_t null_count = 0, int64_t offset = 0); |
584 | |
585 | const int32_t* raw_value_offsets_; |
586 | const uint8_t* raw_data_; |
587 | }; |
588 | |
/// Concrete Array class for variable-size string (utf-8) data
///
/// Derives from BinaryArray and declares no accessors of its own; only the
/// TypeClass alias and the constructors differ.
class ARROW_EXPORT StringArray : public BinaryArray {
 public:
  using TypeClass = StringType;

  /// Construct from generic array data
  explicit StringArray(const std::shared_ptr<ArrayData>& data);

  /// Construct from individual buffers; the parameters mirror those of the
  /// public BinaryArray constructor
  StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
              const std::shared_ptr<Buffer>& data,
              const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
              int64_t null_count = 0, int64_t offset = 0);
};
601 | |
602 | // ---------------------------------------------------------------------- |
603 | // Fixed width binary |
604 | |
605 | /// Concrete Array class for fixed-size binary data |
606 | class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { |
607 | public: |
608 | using TypeClass = FixedSizeBinaryType; |
609 | |
610 | explicit FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data); |
611 | |
612 | FixedSizeBinaryArray(const std::shared_ptr<DataType>& type, int64_t length, |
613 | const std::shared_ptr<Buffer>& data, |
614 | const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, |
615 | int64_t null_count = 0, int64_t offset = 0); |
616 | |
617 | const uint8_t* GetValue(int64_t i) const; |
618 | const uint8_t* Value(int64_t i) const { return GetValue(i); } |
619 | |
620 | util::string_view GetView(int64_t i) const { |
621 | return util::string_view(reinterpret_cast<const char*>(GetValue(i)), byte_width()); |
622 | } |
623 | |
624 | std::string GetString(int64_t i) const { return std::string(GetView(i)); } |
625 | |
626 | int32_t byte_width() const { return byte_width_; } |
627 | |
628 | const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; } |
629 | |
630 | protected: |
631 | inline void SetData(const std::shared_ptr<ArrayData>& data) { |
632 | this->PrimitiveArray::SetData(data); |
633 | byte_width_ = |
634 | internal::checked_cast<const FixedSizeBinaryType&>(*type()).byte_width(); |
635 | } |
636 | |
637 | int32_t byte_width_; |
638 | }; |
639 | |
640 | // ---------------------------------------------------------------------- |
641 | // Decimal128Array |
642 | |
/// Concrete Array class for 128-bit decimal data
///
/// Derives from FixedSizeBinaryArray and inherits its constructors.
class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray {
 public:
  using TypeClass = Decimal128Type;

  using FixedSizeBinaryArray::FixedSizeBinaryArray;

  /// \brief Construct Decimal128Array from ArrayData instance
  explicit Decimal128Array(const std::shared_ptr<ArrayData>& data);

  /// \brief Return the value at index i formatted as a string
  ///
  /// NOTE(review): the exact text format is defined by the out-of-line
  /// implementation; confirm there before relying on it.
  std::string FormatValue(int64_t i) const;
};

// Backward compatibility
using DecimalArray = Decimal128Array;
658 | |
659 | // ---------------------------------------------------------------------- |
660 | // Struct |
661 | |
/// Concrete Array class for struct data
///
/// The fields are exposed as child arrays, retrievable by position or by
/// name.
class ARROW_EXPORT StructArray : public Array {
 public:
  using TypeClass = StructType;

  /// Construct from generic array data
  explicit StructArray(const std::shared_ptr<ArrayData>& data);

  /// Construct from a struct type and one child array per field
  StructArray(const std::shared_ptr<DataType>& type, int64_t length,
              const std::vector<std::shared_ptr<Array>>& children,
              std::shared_ptr<Buffer> null_bitmap = NULLPTR, int64_t null_count = 0,
              int64_t offset = 0);

  /// The type of this array as a StructType pointer
  const StructType* struct_type() const;

  // Return a shared pointer in case the requestor desires to share ownership
  // with this array.  The returned array has its offset, length and null
  // count adjusted.
  std::shared_ptr<Array> field(int pos) const;

  /// Returns null if name not found
  std::shared_ptr<Array> GetFieldByName(const std::string& name) const;

  /// \brief Flatten this array as a vector of arrays, one for each field
  ///
  /// \param[in] pool The pool to allocate null bitmaps from, if necessary
  /// \param[out] out The resulting vector of arrays
  Status Flatten(MemoryPool* pool, ArrayVector* out) const;

 private:
  // For caching boxed child data. Declared mutable so that const member
  // functions can update it.
  mutable std::vector<std::shared_ptr<Array>> boxed_fields_;
};
694 | |
695 | // ---------------------------------------------------------------------- |
696 | // Union |
697 | |
698 | /// Concrete Array class for union data |
699 | class ARROW_EXPORT UnionArray : public Array { |
700 | public: |
701 | using TypeClass = UnionType; |
702 | using type_id_t = uint8_t; |
703 | |
704 | explicit UnionArray(const std::shared_ptr<ArrayData>& data); |
705 | |
706 | UnionArray(const std::shared_ptr<DataType>& type, int64_t length, |
707 | const std::vector<std::shared_ptr<Array>>& children, |
708 | const std::shared_ptr<Buffer>& type_ids, |
709 | const std::shared_ptr<Buffer>& value_offsets = NULLPTR, |
710 | const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, int64_t null_count = 0, |
711 | int64_t offset = 0); |
712 | |
713 | /// \brief Construct Dense UnionArray from types_ids, value_offsets and children |
714 | /// |
715 | /// This function does the bare minimum of validation of the offsets and |
716 | /// input types. The value_offsets are assumed to be well-formed. |
717 | /// |
718 | /// \param[in] type_ids An array of 8-bit signed integers, enumerated from |
719 | /// 0 corresponding to each type. |
720 | /// \param[in] value_offsets An array of signed int32 values indicating the |
721 | /// relative offset into the respective child array for the type in a given slot. |
722 | /// The respective offsets for each child value array must be in order / increasing. |
723 | /// \param[in] children Vector of children Arrays containing the data for each type. |
724 | /// \param[out] out Will have length equal to value_offsets.length() |
725 | static Status MakeDense(const Array& type_ids, const Array& value_offsets, |
726 | const std::vector<std::shared_ptr<Array>>& children, |
727 | std::shared_ptr<Array>* out); |
728 | |
729 | /// \brief Construct Sparse UnionArray from type_ids and children |
730 | /// |
731 | /// This function does the bare minimum of validation of the offsets and |
732 | /// input types. |
733 | /// |
734 | /// \param[in] type_ids An array of 8-bit signed integers, enumerated from |
735 | /// 0 corresponding to each type. |
736 | /// \param[in] children Vector of children Arrays containing the data for each type. |
737 | /// \param[out] out Will have length equal to type_ids.length() |
738 | static Status MakeSparse(const Array& type_ids, |
739 | const std::vector<std::shared_ptr<Array>>& children, |
740 | std::shared_ptr<Array>* out); |
741 | |
742 | /// Note that this buffer does not account for any slice offset |
743 | std::shared_ptr<Buffer> type_ids() const { return data_->buffers[1]; } |
744 | |
745 | /// Note that this buffer does not account for any slice offset |
746 | std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[2]; } |
747 | |
748 | int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } |
749 | |
750 | const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; } |
751 | const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } |
752 | |
753 | UnionMode::type mode() const { |
754 | return internal::checked_cast<const UnionType&>(*type()).mode(); |
755 | } |
756 | |
757 | // Return the given field as an individual array. |
758 | // For sparse unions, the returned array has its offset, length and null |
759 | // count adjusted. |
760 | // For dense unions, the returned array is unchanged. |
761 | std::shared_ptr<Array> child(int pos) const; |
762 | |
763 | /// Only use this while the UnionArray is in scope |
764 | const Array* UnsafeChild(int pos) const; |
765 | |
766 | protected: |
767 | void SetData(const std::shared_ptr<ArrayData>& data); |
768 | |
769 | const type_id_t* raw_type_ids_; |
770 | const int32_t* raw_value_offsets_; |
771 | |
772 | // For caching boxed child data |
773 | mutable std::vector<std::shared_ptr<Array>> boxed_fields_; |
774 | }; |
775 | |
776 | // ---------------------------------------------------------------------- |
777 | // DictionaryArray (categorical and dictionary-encoded in memory) |
778 | |
/// \brief Concrete Array class for dictionary data
///
/// A dictionary array contains an array of non-negative integers (the
/// "dictionary indices") along with a data type containing a "dictionary"
/// corresponding to the distinct values represented in the data.
///
/// For example, the array
///
///   ["foo", "bar", "foo", "bar", "foo", "bar"]
///
/// with dictionary ["bar", "foo"], would have dictionary array representation
///
///   indices: [1, 0, 1, 0, 1, 0]
///   dictionary: ["bar", "foo"]
///
/// The indices in principle may have any integer type (signed or unsigned),
/// though presently data in IPC exchanges must be signed int32.
class ARROW_EXPORT DictionaryArray : public Array {
 public:
  using TypeClass = DictionaryType;

  /// Construct from generic array data
  explicit DictionaryArray(const std::shared_ptr<ArrayData>& data);

  /// Construct from a dictionary type and an array of indices. See
  /// FromArrays() for a variant documented as validating the indices.
  DictionaryArray(const std::shared_ptr<DataType>& type,
                  const std::shared_ptr<Array>& indices);

  /// \brief Construct DictionaryArray from dictionary data type and indices array
  ///
  /// This function does the validation of the indices and input type. It checks if
  /// all indices are non-negative and smaller than the size of the dictionary
  ///
  /// \param[in] type a dictionary type
  /// \param[in] indices an array of non-negative signed
  /// integers smaller than the size of the dictionary
  /// \param[out] out the resulting DictionaryArray instance
  static Status FromArrays(const std::shared_ptr<DataType>& type,
                           const std::shared_ptr<Array>& indices,
                           std::shared_ptr<Array>* out);

  /// \brief Transpose this DictionaryArray
  ///
  /// This method constructs a new dictionary array with the given dictionary type,
  /// transposing indices using the transpose map.
  /// The type and the transpose map are typically computed using
  /// DictionaryType::Unify.
  ///
  /// \param[in] pool a pool to allocate the array data from
  /// \param[in] type a dictionary type
  /// \param[in] transpose_map a vector transposing this array's indices
  /// into the target array's indices
  /// \param[out] out the resulting DictionaryArray instance
  Status Transpose(MemoryPool* pool, const std::shared_ptr<DataType>& type,
                   const std::vector<int32_t>& transpose_map,
                   std::shared_ptr<Array>* out) const;
  // XXX Do we also want an unsafe in-place Transpose?

  /// The dictionary indices as an array
  std::shared_ptr<Array> indices() const;

  /// The dictionary values as an array
  std::shared_ptr<Array> dictionary() const;

  /// The cached DictionaryType pointer (non-owning)
  const DictionaryType* dict_type() const { return dict_type_; }

 private:
  void SetData(const std::shared_ptr<ArrayData>& data);

  // NOTE(review): presumably points at the DictionaryType held by data_->type
  // and is set in SetData(); confirm against the implementation.
  const DictionaryType* dict_type_;
  std::shared_ptr<Array> indices_;
};
846 | |
/// \brief Perform any validation checks to determine obvious inconsistencies
/// with the array's internal data
///
/// This can be an expensive check.
///
/// \param array an Array instance
/// \return Status (presumably OK when no inconsistency is found and an error
/// otherwise -- confirm against the implementation)
ARROW_EXPORT
Status ValidateArray(const Array& array);
856 | |
857 | } // namespace arrow |
858 | |
859 | #endif // ARROW_ARRAY_H |
860 | |