1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #pragma once |
19 | |
20 | #include <algorithm> // IWYU pragma: keep |
21 | #include <array> |
22 | #include <cstddef> |
23 | #include <cstdint> |
24 | #include <cstring> |
25 | #include <iterator> |
26 | #include <limits> |
27 | #include <memory> |
28 | #include <string> |
29 | #include <type_traits> |
30 | #include <vector> |
31 | |
32 | #include "arrow/buffer-builder.h" |
33 | #include "arrow/memory_pool.h" |
34 | #include "arrow/status.h" |
35 | #include "arrow/type.h" |
36 | #include "arrow/type_traits.h" |
37 | #include "arrow/util/bit-util.h" |
38 | #include "arrow/util/macros.h" |
39 | #include "arrow/util/string_view.h" |
40 | #include "arrow/util/type_traits.h" |
41 | #include "arrow/util/visibility.h" |
42 | |
43 | namespace arrow { |
44 | |
45 | class Array; |
46 | struct ArrayData; |
47 | |
48 | constexpr int64_t kMinBuilderCapacity = 1 << 5; |
49 | constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1; |
50 | |
51 | /// Base class for all data array builders. |
52 | /// |
53 | /// This class provides a facilities for incrementally building the null bitmap |
54 | /// (see Append methods) and as a side effect the current number of slots and |
55 | /// the null count. |
56 | /// |
57 | /// \note Users are expected to use builders as one of the concrete types below. |
58 | /// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use. |
59 | class ARROW_EXPORT ArrayBuilder { |
60 | public: |
61 | explicit ArrayBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool) |
62 | : type_(type), pool_(pool), null_bitmap_builder_(pool) {} |
63 | |
64 | virtual ~ArrayBuilder() = default; |
65 | |
66 | /// For nested types. Since the objects are owned by this class instance, we |
67 | /// skip shared pointers and just return a raw pointer |
68 | ArrayBuilder* child(int i) { return children_[i].get(); } |
69 | |
70 | int num_children() const { return static_cast<int>(children_.size()); } |
71 | |
72 | int64_t length() const { return length_; } |
73 | int64_t null_count() const { return null_count_; } |
74 | int64_t capacity() const { return capacity_; } |
75 | |
76 | /// \brief Ensure that enough memory has been allocated to fit the indicated |
77 | /// number of total elements in the builder, including any that have already |
78 | /// been appended. Does not account for reallocations that may be due to |
79 | /// variable size data, like binary values. To make space for incremental |
80 | /// appends, use Reserve instead. |
81 | /// |
82 | /// \param[in] capacity the minimum number of total array values to |
83 | /// accommodate. Must be greater than the current capacity. |
84 | /// \return Status |
85 | virtual Status Resize(int64_t capacity); |
86 | |
87 | /// \brief Ensure that there is enough space allocated to add the indicated |
88 | /// number of elements without any further calls to Resize. The memory |
89 | /// allocated is rounded up to the next highest power of 2 similar to memory |
90 | /// allocations in STL containers like std::vector |
91 | /// \param[in] additional_capacity the number of additional array values |
92 | /// \return Status |
93 | Status Reserve(int64_t additional_capacity) { |
94 | auto min_capacity = length() + additional_capacity; |
95 | if (min_capacity <= capacity()) return Status::OK(); |
96 | |
97 | // leave growth factor up to BufferBuilder |
98 | auto new_capacity = BufferBuilder::GrowByFactor(min_capacity); |
99 | return Resize(new_capacity); |
100 | } |
101 | |
102 | /// Reset the builder. |
103 | virtual void Reset(); |
104 | |
105 | /// For cases where raw data was memcpy'd into the internal buffers, allows us |
106 | /// to advance the length of the builder. It is your responsibility to use |
107 | /// this function responsibly. |
108 | Status Advance(int64_t elements); |
109 | |
110 | /// \brief Return result of builder as an internal generic ArrayData |
111 | /// object. Resets builder except for dictionary builder |
112 | /// |
113 | /// \param[out] out the finalized ArrayData object |
114 | /// \return Status |
115 | virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0; |
116 | |
117 | /// \brief Return result of builder as an Array object. |
118 | /// |
119 | /// The builder is reset except for DictionaryBuilder. |
120 | /// |
121 | /// \param[out] out the finalized Array object |
122 | /// \return Status |
123 | Status Finish(std::shared_ptr<Array>* out); |
124 | |
125 | std::shared_ptr<DataType> type() const { return type_; } |
126 | |
127 | protected: |
128 | /// Append to null bitmap |
129 | Status AppendToBitmap(bool is_valid); |
130 | |
131 | /// Vector append. Treat each zero byte as a null. If valid_bytes is null |
132 | /// assume all of length bits are valid. |
133 | Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length); |
134 | |
135 | /// Set the next length bits to not null (i.e. valid). |
136 | Status SetNotNull(int64_t length); |
137 | |
138 | // Unsafe operations (don't check capacity/don't resize) |
139 | |
140 | void UnsafeAppendNull() { UnsafeAppendToBitmap(false); } |
141 | |
142 | // Append to null bitmap, update the length |
143 | void UnsafeAppendToBitmap(bool is_valid) { |
144 | null_bitmap_builder_.UnsafeAppend(is_valid); |
145 | ++length_; |
146 | if (!is_valid) ++null_count_; |
147 | } |
148 | |
149 | // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null |
150 | // assume all of length bits are valid. |
151 | void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) { |
152 | if (valid_bytes == NULLPTR) { |
153 | return UnsafeSetNotNull(length); |
154 | } |
155 | null_bitmap_builder_.UnsafeAppend(valid_bytes, length); |
156 | length_ += length; |
157 | null_count_ = null_bitmap_builder_.false_count(); |
158 | } |
159 | |
160 | void UnsafeAppendToBitmap(const std::vector<bool>& is_valid); |
161 | |
162 | // Set the next length bits to not null (i.e. valid). |
163 | void UnsafeSetNotNull(int64_t length); |
164 | |
165 | static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer); |
166 | |
167 | static Status CheckCapacity(int64_t new_capacity, int64_t old_capacity) { |
168 | if (new_capacity < 0) { |
169 | return Status::Invalid("Resize capacity must be positive" ); |
170 | } |
171 | if (new_capacity < old_capacity) { |
172 | return Status::Invalid("Resize cannot downsize" ); |
173 | } |
174 | return Status::OK(); |
175 | } |
176 | |
177 | std::shared_ptr<DataType> type_; |
178 | MemoryPool* pool_; |
179 | |
180 | TypedBufferBuilder<bool> null_bitmap_builder_; |
181 | int64_t null_count_ = 0; |
182 | |
183 | // Array length, so far. Also, the index of the next element to be added |
184 | int64_t length_ = 0; |
185 | int64_t capacity_ = 0; |
186 | |
187 | // Child value array builders. These are owned by this class |
188 | std::vector<std::shared_ptr<ArrayBuilder>> children_; |
189 | |
190 | private: |
191 | ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); |
192 | }; |
193 | |
194 | } // namespace arrow |
195 | |