1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #pragma once |
19 | |
#include <array>
#include <cstdint>
#include <limits>
#include <memory>
#include <sstream>
#include <string>
#include <vector>
25 | |
26 | #include "arrow/array.h" |
27 | #include "arrow/array/builder_base.h" |
28 | #include "arrow/buffer-builder.h" |
29 | #include "arrow/status.h" |
30 | #include "arrow/type_traits.h" |
31 | #include "arrow/util/macros.h" |
32 | #include "arrow/util/string_view.h" |
33 | |
34 | namespace arrow { |
35 | |
/// Upper bound, in bytes, on the value data that BinaryBuilder::AppendNextOffset()
/// accepts before it reports an overflow: one less than the largest int32_t.
constexpr int64_t kBinaryMemoryLimit =
    static_cast<int64_t>(std::numeric_limits<int32_t>::max()) - 1;
37 | |
38 | // ---------------------------------------------------------------------- |
39 | // Binary and String |
40 | |
41 | /// \class BinaryBuilder |
42 | /// \brief Builder class for variable-length binary data |
43 | class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { |
44 | public: |
45 | explicit BinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); |
46 | |
47 | BinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool); |
48 | |
49 | Status Append(const uint8_t* value, int32_t length) { |
50 | ARROW_RETURN_NOT_OK(Reserve(1)); |
51 | ARROW_RETURN_NOT_OK(AppendNextOffset()); |
52 | ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length)); |
53 | |
54 | UnsafeAppendToBitmap(true); |
55 | return Status::OK(); |
56 | } |
57 | |
58 | Status AppendNull() { |
59 | ARROW_RETURN_NOT_OK(AppendNextOffset()); |
60 | ARROW_RETURN_NOT_OK(Reserve(1)); |
61 | UnsafeAppendToBitmap(false); |
62 | return Status::OK(); |
63 | } |
64 | |
65 | Status Append(const char* value, int32_t length) { |
66 | return Append(reinterpret_cast<const uint8_t*>(value), length); |
67 | } |
68 | |
69 | Status Append(util::string_view value) { |
70 | return Append(value.data(), static_cast<int32_t>(value.size())); |
71 | } |
72 | |
73 | /// \brief Append without checking capacity |
74 | /// |
75 | /// Offsets and data should have been presized using Reserve() and |
76 | /// ReserveData(), respectively. |
77 | void UnsafeAppend(const uint8_t* value, int32_t length) { |
78 | UnsafeAppendNextOffset(); |
79 | value_data_builder_.UnsafeAppend(value, length); |
80 | UnsafeAppendToBitmap(true); |
81 | } |
82 | |
83 | void UnsafeAppend(const char* value, int32_t length) { |
84 | UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length); |
85 | } |
86 | |
87 | void UnsafeAppend(const std::string& value) { |
88 | UnsafeAppend(value.c_str(), static_cast<int32_t>(value.size())); |
89 | } |
90 | |
91 | void UnsafeAppendNull() { |
92 | const int64_t num_bytes = value_data_builder_.length(); |
93 | offsets_builder_.UnsafeAppend(static_cast<int32_t>(num_bytes)); |
94 | UnsafeAppendToBitmap(false); |
95 | } |
96 | |
97 | void Reset() override; |
98 | Status Resize(int64_t capacity) override; |
99 | |
100 | /// \brief Ensures there is enough allocated capacity to append the indicated |
101 | /// number of bytes to the value data buffer without additional allocations |
102 | Status ReserveData(int64_t elements); |
103 | |
104 | Status FinishInternal(std::shared_ptr<ArrayData>* out) override; |
105 | |
106 | /// \return size of values buffer so far |
107 | int64_t value_data_length() const { return value_data_builder_.length(); } |
108 | /// \return capacity of values buffer |
109 | int64_t value_data_capacity() const { return value_data_builder_.capacity(); } |
110 | |
111 | /// Temporary access to a value. |
112 | /// |
113 | /// This pointer becomes invalid on the next modifying operation. |
114 | const uint8_t* GetValue(int64_t i, int32_t* out_length) const; |
115 | |
116 | /// Temporary access to a value. |
117 | /// |
118 | /// This view becomes invalid on the next modifying operation. |
119 | util::string_view GetView(int64_t i) const; |
120 | |
121 | protected: |
122 | TypedBufferBuilder<int32_t> offsets_builder_; |
123 | TypedBufferBuilder<uint8_t> value_data_builder_; |
124 | |
125 | Status AppendOverflow(int64_t num_bytes); |
126 | |
127 | Status AppendNextOffset() { |
128 | const int64_t num_bytes = value_data_builder_.length(); |
129 | if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) { |
130 | return AppendOverflow(num_bytes); |
131 | } |
132 | return offsets_builder_.Append(static_cast<int32_t>(num_bytes)); |
133 | } |
134 | |
135 | void UnsafeAppendNextOffset() { |
136 | const int64_t num_bytes = value_data_builder_.length(); |
137 | offsets_builder_.UnsafeAppend(static_cast<int32_t>(num_bytes)); |
138 | } |
139 | }; |
140 | |
141 | /// \class StringBuilder |
142 | /// \brief Builder class for UTF8 strings |
143 | class ARROW_EXPORT StringBuilder : public BinaryBuilder { |
144 | public: |
145 | using BinaryBuilder::BinaryBuilder; |
146 | explicit StringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); |
147 | |
148 | using BinaryBuilder::Append; |
149 | using BinaryBuilder::Reset; |
150 | using BinaryBuilder::UnsafeAppend; |
151 | |
152 | /// \brief Append a sequence of strings in one shot. |
153 | /// |
154 | /// \param[in] values a vector of strings |
155 | /// \param[in] valid_bytes an optional sequence of bytes where non-zero |
156 | /// indicates a valid (non-null) value |
157 | /// \return Status |
158 | Status AppendValues(const std::vector<std::string>& values, |
159 | const uint8_t* valid_bytes = NULLPTR); |
160 | |
161 | /// \brief Append a sequence of nul-terminated strings in one shot. |
162 | /// If one of the values is NULL, it is processed as a null |
163 | /// value even if the corresponding valid_bytes entry is 1. |
164 | /// |
165 | /// \param[in] values a contiguous C array of nul-terminated char * |
166 | /// \param[in] length the number of values to append |
167 | /// \param[in] valid_bytes an optional sequence of bytes where non-zero |
168 | /// indicates a valid (non-null) value |
169 | /// \return Status |
170 | Status AppendValues(const char** values, int64_t length, |
171 | const uint8_t* valid_bytes = NULLPTR); |
172 | }; |
173 | |
174 | // ---------------------------------------------------------------------- |
175 | // FixedSizeBinaryBuilder |
176 | |
177 | class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { |
178 | public: |
179 | FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type, |
180 | MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); |
181 | |
182 | Status Append(const uint8_t* value) { |
183 | ARROW_RETURN_NOT_OK(Reserve(1)); |
184 | UnsafeAppendToBitmap(true); |
185 | return byte_builder_.Append(value, byte_width_); |
186 | } |
187 | |
188 | Status Append(const char* value) { |
189 | return Append(reinterpret_cast<const uint8_t*>(value)); |
190 | } |
191 | |
192 | Status Append(const util::string_view& view) { |
193 | #ifndef NDEBUG |
194 | CheckValueSize(static_cast<int64_t>(view.size())); |
195 | #endif |
196 | return Append(reinterpret_cast<const uint8_t*>(view.data())); |
197 | } |
198 | |
199 | Status Append(const std::string& s) { |
200 | #ifndef NDEBUG |
201 | CheckValueSize(static_cast<int64_t>(s.size())); |
202 | #endif |
203 | return Append(reinterpret_cast<const uint8_t*>(s.data())); |
204 | } |
205 | |
206 | template <size_t NBYTES> |
207 | Status Append(const std::array<uint8_t, NBYTES>& value) { |
208 | ARROW_RETURN_NOT_OK(Reserve(1)); |
209 | UnsafeAppendToBitmap(true); |
210 | return byte_builder_.Append(value); |
211 | } |
212 | |
213 | Status AppendValues(const uint8_t* data, int64_t length, |
214 | const uint8_t* valid_bytes = NULLPTR); |
215 | Status AppendNull(); |
216 | |
217 | void Reset() override; |
218 | Status Resize(int64_t capacity) override; |
219 | Status FinishInternal(std::shared_ptr<ArrayData>* out) override; |
220 | |
221 | /// \return size of values buffer so far |
222 | int64_t value_data_length() const { return byte_builder_.length(); } |
223 | |
224 | int32_t byte_width() const { return byte_width_; } |
225 | |
226 | /// Temporary access to a value. |
227 | /// |
228 | /// This pointer becomes invalid on the next modifying operation. |
229 | const uint8_t* GetValue(int64_t i) const; |
230 | |
231 | /// Temporary access to a value. |
232 | /// |
233 | /// This view becomes invalid on the next modifying operation. |
234 | util::string_view GetView(int64_t i) const; |
235 | |
236 | protected: |
237 | int32_t byte_width_; |
238 | BufferBuilder byte_builder_; |
239 | |
240 | #ifndef NDEBUG |
241 | void CheckValueSize(int64_t size); |
242 | #endif |
243 | }; |
244 | |
245 | // ---------------------------------------------------------------------- |
// Chunked builders: build a sequence of BinaryArray or StringArray chunks, each
// limited to a caller-chosen maximum data size (up to the 2GB upper limit)
248 | |
249 | namespace internal { |
250 | |
251 | class ARROW_EXPORT ChunkedBinaryBuilder { |
252 | public: |
253 | ChunkedBinaryBuilder(int32_t max_chunk_size, |
254 | MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); |
255 | |
256 | virtual ~ChunkedBinaryBuilder() = default; |
257 | |
258 | Status Append(const uint8_t* value, int32_t length) { |
259 | if (ARROW_PREDICT_FALSE(length + chunk_data_size_ > max_chunk_size_)) { |
260 | // Move onto next chunk, unless the builder length is currently 0, which |
261 | // means that max_chunk_size_ is less than the item length |
262 | if (builder_->length() > 0) { |
263 | ARROW_RETURN_NOT_OK(NextChunk()); |
264 | } |
265 | // else fall through |
266 | } |
267 | |
268 | chunk_data_size_ += length; |
269 | return builder_->Append(value, length); |
270 | } |
271 | |
272 | Status Append(const util::string_view& value) { |
273 | return Append(reinterpret_cast<const uint8_t*>(value.data()), |
274 | static_cast<int32_t>(value.size())); |
275 | } |
276 | |
277 | Status AppendNull() { |
278 | if (ARROW_PREDICT_FALSE(builder_->length() == std::numeric_limits<int32_t>::max())) { |
279 | ARROW_RETURN_NOT_OK(NextChunk()); |
280 | } |
281 | return builder_->AppendNull(); |
282 | } |
283 | |
284 | virtual Status Finish(ArrayVector* out); |
285 | |
286 | protected: |
287 | Status NextChunk(); |
288 | |
289 | int32_t max_chunk_size_; |
290 | int32_t chunk_data_size_; |
291 | |
292 | std::unique_ptr<BinaryBuilder> builder_; |
293 | std::vector<std::shared_ptr<Array>> chunks_; |
294 | }; |
295 | |
296 | class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder { |
297 | public: |
298 | using ChunkedBinaryBuilder::ChunkedBinaryBuilder; |
299 | |
300 | Status Finish(ArrayVector* out) override; |
301 | }; |
302 | |
303 | } // namespace internal |
304 | |
305 | } // namespace arrow |
306 | |