1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#pragma once
19
#include <array>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <memory>
#include <sstream>
#include <string>
#include <vector>

#include "arrow/array.h"
#include "arrow/array/builder_base.h"
#include "arrow/buffer-builder.h"
#include "arrow/status.h"
#include "arrow/type_traits.h"
#include "arrow/util/macros.h"
#include "arrow/util/string_view.h"
33
34namespace arrow {
35
/// Upper bound on the number of value bytes a binary builder may have
/// accumulated when a new offset is recorded: BinaryBuilder::AppendNextOffset()
/// takes its overflow path once the value data length exceeds this limit.
constexpr int64_t kBinaryMemoryLimit =
    static_cast<int64_t>(std::numeric_limits<int32_t>::max()) - 1;
37
38// ----------------------------------------------------------------------
39// Binary and String
40
41/// \class BinaryBuilder
42/// \brief Builder class for variable-length binary data
43class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
44 public:
45 explicit BinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
46
47 BinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
48
49 Status Append(const uint8_t* value, int32_t length) {
50 ARROW_RETURN_NOT_OK(Reserve(1));
51 ARROW_RETURN_NOT_OK(AppendNextOffset());
52 ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
53
54 UnsafeAppendToBitmap(true);
55 return Status::OK();
56 }
57
58 Status AppendNull() {
59 ARROW_RETURN_NOT_OK(AppendNextOffset());
60 ARROW_RETURN_NOT_OK(Reserve(1));
61 UnsafeAppendToBitmap(false);
62 return Status::OK();
63 }
64
65 Status Append(const char* value, int32_t length) {
66 return Append(reinterpret_cast<const uint8_t*>(value), length);
67 }
68
69 Status Append(util::string_view value) {
70 return Append(value.data(), static_cast<int32_t>(value.size()));
71 }
72
73 /// \brief Append without checking capacity
74 ///
75 /// Offsets and data should have been presized using Reserve() and
76 /// ReserveData(), respectively.
77 void UnsafeAppend(const uint8_t* value, int32_t length) {
78 UnsafeAppendNextOffset();
79 value_data_builder_.UnsafeAppend(value, length);
80 UnsafeAppendToBitmap(true);
81 }
82
83 void UnsafeAppend(const char* value, int32_t length) {
84 UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
85 }
86
87 void UnsafeAppend(const std::string& value) {
88 UnsafeAppend(value.c_str(), static_cast<int32_t>(value.size()));
89 }
90
91 void UnsafeAppendNull() {
92 const int64_t num_bytes = value_data_builder_.length();
93 offsets_builder_.UnsafeAppend(static_cast<int32_t>(num_bytes));
94 UnsafeAppendToBitmap(false);
95 }
96
97 void Reset() override;
98 Status Resize(int64_t capacity) override;
99
100 /// \brief Ensures there is enough allocated capacity to append the indicated
101 /// number of bytes to the value data buffer without additional allocations
102 Status ReserveData(int64_t elements);
103
104 Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
105
106 /// \return size of values buffer so far
107 int64_t value_data_length() const { return value_data_builder_.length(); }
108 /// \return capacity of values buffer
109 int64_t value_data_capacity() const { return value_data_builder_.capacity(); }
110
111 /// Temporary access to a value.
112 ///
113 /// This pointer becomes invalid on the next modifying operation.
114 const uint8_t* GetValue(int64_t i, int32_t* out_length) const;
115
116 /// Temporary access to a value.
117 ///
118 /// This view becomes invalid on the next modifying operation.
119 util::string_view GetView(int64_t i) const;
120
121 protected:
122 TypedBufferBuilder<int32_t> offsets_builder_;
123 TypedBufferBuilder<uint8_t> value_data_builder_;
124
125 Status AppendOverflow(int64_t num_bytes);
126
127 Status AppendNextOffset() {
128 const int64_t num_bytes = value_data_builder_.length();
129 if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) {
130 return AppendOverflow(num_bytes);
131 }
132 return offsets_builder_.Append(static_cast<int32_t>(num_bytes));
133 }
134
135 void UnsafeAppendNextOffset() {
136 const int64_t num_bytes = value_data_builder_.length();
137 offsets_builder_.UnsafeAppend(static_cast<int32_t>(num_bytes));
138 }
139};
140
141/// \class StringBuilder
142/// \brief Builder class for UTF8 strings
143class ARROW_EXPORT StringBuilder : public BinaryBuilder {
144 public:
145 using BinaryBuilder::BinaryBuilder;
146 explicit StringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
147
148 using BinaryBuilder::Append;
149 using BinaryBuilder::Reset;
150 using BinaryBuilder::UnsafeAppend;
151
152 /// \brief Append a sequence of strings in one shot.
153 ///
154 /// \param[in] values a vector of strings
155 /// \param[in] valid_bytes an optional sequence of bytes where non-zero
156 /// indicates a valid (non-null) value
157 /// \return Status
158 Status AppendValues(const std::vector<std::string>& values,
159 const uint8_t* valid_bytes = NULLPTR);
160
161 /// \brief Append a sequence of nul-terminated strings in one shot.
162 /// If one of the values is NULL, it is processed as a null
163 /// value even if the corresponding valid_bytes entry is 1.
164 ///
165 /// \param[in] values a contiguous C array of nul-terminated char *
166 /// \param[in] length the number of values to append
167 /// \param[in] valid_bytes an optional sequence of bytes where non-zero
168 /// indicates a valid (non-null) value
169 /// \return Status
170 Status AppendValues(const char** values, int64_t length,
171 const uint8_t* valid_bytes = NULLPTR);
172};
173
174// ----------------------------------------------------------------------
175// FixedSizeBinaryBuilder
176
177class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
178 public:
179 FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
180 MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
181
182 Status Append(const uint8_t* value) {
183 ARROW_RETURN_NOT_OK(Reserve(1));
184 UnsafeAppendToBitmap(true);
185 return byte_builder_.Append(value, byte_width_);
186 }
187
188 Status Append(const char* value) {
189 return Append(reinterpret_cast<const uint8_t*>(value));
190 }
191
192 Status Append(const util::string_view& view) {
193#ifndef NDEBUG
194 CheckValueSize(static_cast<int64_t>(view.size()));
195#endif
196 return Append(reinterpret_cast<const uint8_t*>(view.data()));
197 }
198
199 Status Append(const std::string& s) {
200#ifndef NDEBUG
201 CheckValueSize(static_cast<int64_t>(s.size()));
202#endif
203 return Append(reinterpret_cast<const uint8_t*>(s.data()));
204 }
205
206 template <size_t NBYTES>
207 Status Append(const std::array<uint8_t, NBYTES>& value) {
208 ARROW_RETURN_NOT_OK(Reserve(1));
209 UnsafeAppendToBitmap(true);
210 return byte_builder_.Append(value);
211 }
212
213 Status AppendValues(const uint8_t* data, int64_t length,
214 const uint8_t* valid_bytes = NULLPTR);
215 Status AppendNull();
216
217 void Reset() override;
218 Status Resize(int64_t capacity) override;
219 Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
220
221 /// \return size of values buffer so far
222 int64_t value_data_length() const { return byte_builder_.length(); }
223
224 int32_t byte_width() const { return byte_width_; }
225
226 /// Temporary access to a value.
227 ///
228 /// This pointer becomes invalid on the next modifying operation.
229 const uint8_t* GetValue(int64_t i) const;
230
231 /// Temporary access to a value.
232 ///
233 /// This view becomes invalid on the next modifying operation.
234 util::string_view GetView(int64_t i) const;
235
236 protected:
237 int32_t byte_width_;
238 BufferBuilder byte_builder_;
239
240#ifndef NDEBUG
241 void CheckValueSize(int64_t size);
242#endif
243};
244
245// ----------------------------------------------------------------------
246// Chunked builders: build a sequence of BinaryArray or StringArray that are
247// limited to a particular size (to the upper limit of 2GB)
248
249namespace internal {
250
251class ARROW_EXPORT ChunkedBinaryBuilder {
252 public:
253 ChunkedBinaryBuilder(int32_t max_chunk_size,
254 MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
255
256 virtual ~ChunkedBinaryBuilder() = default;
257
258 Status Append(const uint8_t* value, int32_t length) {
259 if (ARROW_PREDICT_FALSE(length + chunk_data_size_ > max_chunk_size_)) {
260 // Move onto next chunk, unless the builder length is currently 0, which
261 // means that max_chunk_size_ is less than the item length
262 if (builder_->length() > 0) {
263 ARROW_RETURN_NOT_OK(NextChunk());
264 }
265 // else fall through
266 }
267
268 chunk_data_size_ += length;
269 return builder_->Append(value, length);
270 }
271
272 Status Append(const util::string_view& value) {
273 return Append(reinterpret_cast<const uint8_t*>(value.data()),
274 static_cast<int32_t>(value.size()));
275 }
276
277 Status AppendNull() {
278 if (ARROW_PREDICT_FALSE(builder_->length() == std::numeric_limits<int32_t>::max())) {
279 ARROW_RETURN_NOT_OK(NextChunk());
280 }
281 return builder_->AppendNull();
282 }
283
284 virtual Status Finish(ArrayVector* out);
285
286 protected:
287 Status NextChunk();
288
289 int32_t max_chunk_size_;
290 int32_t chunk_data_size_;
291
292 std::unique_ptr<BinaryBuilder> builder_;
293 std::vector<std::shared_ptr<Array>> chunks_;
294};
295
296class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder {
297 public:
298 using ChunkedBinaryBuilder::ChunkedBinaryBuilder;
299
300 Status Finish(ArrayVector* out) override;
301};
302
303} // namespace internal
304
305} // namespace arrow
306