// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef ARROW_BUFFER_BUILDER_H
#define ARROW_BUFFER_BUILDER_H

#include <algorithm>
#include <array>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>

#include "arrow/buffer.h"
#include "arrow/status.h"
#include "arrow/util/bit-util.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {

// ----------------------------------------------------------------------
// Buffer builder classes
39/// \class BufferBuilder
40/// \brief A class for incrementally building a contiguous chunk of in-memory
41/// data
42class ARROW_EXPORT BufferBuilder {
43 public:
44 explicit BufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
45 : pool_(pool), data_(NULLPTR), capacity_(0), size_(0) {}
46
47 /// \brief Resize the buffer to the nearest multiple of 64 bytes
48 ///
49 /// \param new_capacity the new capacity of the of the builder. Will be
50 /// rounded up to a multiple of 64 bytes for padding \param shrink_to_fit if
51 /// new capacity is smaller than the existing size, reallocate internal
52 /// buffer. Set to false to avoid reallocations when shrinking the builder.
53 /// \return Status
54 Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
55 // Resize(0) is a no-op
56 if (new_capacity == 0) {
57 return Status::OK();
58 }
59 int64_t old_capacity = capacity_;
60
61 if (buffer_ == NULLPTR) {
62 ARROW_RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_capacity, &buffer_));
63 } else {
64 ARROW_RETURN_NOT_OK(buffer_->Resize(new_capacity, shrink_to_fit));
65 }
66 capacity_ = buffer_->capacity();
67 data_ = buffer_->mutable_data();
68 if (capacity_ > old_capacity) {
69 memset(data_ + old_capacity, 0, capacity_ - old_capacity);
70 }
71 return Status::OK();
72 }
73
74 /// \brief Ensure that builder can accommodate the additional number of bytes
75 /// without the need to perform allocations
76 ///
77 /// \param[in] additional_bytes number of additional bytes to make space for
78 /// \param[in] grow_by_factor if true, round up allocations using the
79 /// strategy in BufferBuilder::GrowByFactor
80 /// \return Status
81 Status Reserve(const int64_t additional_bytes, bool grow_by_factor = false) {
82 auto min_capacity = size_ + additional_bytes;
83 if (min_capacity <= capacity_) return Status::OK();
84 if (grow_by_factor) {
85 min_capacity = GrowByFactor(min_capacity);
86 }
87 return Resize(min_capacity, false);
88 }
89
90 /// \brief Return a capacity expanded by a growth factor of 2
91 static int64_t GrowByFactor(const int64_t min_capacity) {
92 // If the capacity was not already a multiple of 2, do so here
93 // TODO(emkornfield) doubling isn't great default allocation practice
94 // see https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md
95 // for discussion
96 return BitUtil::NextPower2(min_capacity);
97 }
98
99 /// \brief Append the given data to the buffer
100 ///
101 /// The buffer is automatically expanded if necessary.
102 Status Append(const void* data, const int64_t length) {
103 if (ARROW_PREDICT_FALSE(size_ + length > capacity_)) {
104 ARROW_RETURN_NOT_OK(Resize(GrowByFactor(size_ + length), false));
105 }
106 UnsafeAppend(data, length);
107 return Status::OK();
108 }
109
110 /// \brief Append copies of a value to the buffer
111 ///
112 /// The buffer is automatically expanded if necessary.
113 Status Append(const int64_t num_copies, uint8_t value) {
114 ARROW_RETURN_NOT_OK(Reserve(num_copies, true));
115 UnsafeAppend(num_copies, value);
116 return Status::OK();
117 }
118
119 /// \brief Append the given data to the buffer
120 ///
121 /// The buffer is automatically expanded if necessary.
122 template <size_t NBYTES>
123 Status Append(const std::array<uint8_t, NBYTES>& data) {
124 constexpr auto nbytes = static_cast<int64_t>(NBYTES);
125 ARROW_RETURN_NOT_OK(Reserve(NBYTES, true));
126 std::copy(data.cbegin(), data.cend(), data_ + size_);
127 size_ += nbytes;
128 return Status::OK();
129 }
130
131 // Advance pointer and zero out memory
132 Status Advance(const int64_t length) { return Append(length, 0); }
133
134 // Unsafe methods don't check existing size
135 void UnsafeAppend(const void* data, const int64_t length) {
136 memcpy(data_ + size_, data, static_cast<size_t>(length));
137 size_ += length;
138 }
139
140 void UnsafeAppend(const int64_t num_copies, uint8_t value) {
141 memset(data_ + size_, value, static_cast<size_t>(num_copies));
142 size_ += num_copies;
143 }
144
145 /// \brief Return result of builder as a Buffer object.
146 ///
147 /// The builder is reset and can be reused afterwards.
148 ///
149 /// \param[out] out the finalized Buffer object
150 /// \param shrink_to_fit if the buffer size is smaller than its capacity,
151 /// reallocate to fit more tightly in memory. Set to false to avoid
152 /// a reallocation, at the expense of potentially more memory consumption.
153 /// \return Status
154 Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
155 ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit));
156 *out = buffer_;
157 Reset();
158 return Status::OK();
159 }
160
161 void Reset() {
162 buffer_ = NULLPTR;
163 capacity_ = size_ = 0;
164 }
165
166 int64_t capacity() const { return capacity_; }
167 int64_t length() const { return size_; }
168 const uint8_t* data() const { return data_; }
169 uint8_t* mutable_data() { return data_; }
170
171 private:
172 std::shared_ptr<ResizableBuffer> buffer_;
173 MemoryPool* pool_;
174 uint8_t* data_;
175 int64_t capacity_;
176 int64_t size_;
177};
// Primary template; only the specializations below (arithmetic T, bool) are
// defined.
template <typename T, typename Enable = void>
class TypedBufferBuilder;
181
182/// \brief A BufferBuilder for building a buffer of arithmetic elements
183template <typename T>
184class TypedBufferBuilder<T, typename std::enable_if<std::is_arithmetic<T>::value>::type> {
185 public:
186 explicit TypedBufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
187 : bytes_builder_(pool) {}
188
189 Status Append(T value) {
190 return bytes_builder_.Append(reinterpret_cast<uint8_t*>(&value), sizeof(T));
191 }
192
193 Status Append(const T* values, int64_t num_elements) {
194 return bytes_builder_.Append(reinterpret_cast<const uint8_t*>(values),
195 num_elements * sizeof(T));
196 }
197
198 Status Append(const int64_t num_copies, T value) {
199 ARROW_RETURN_NOT_OK(Resize(GrowByFactor(num_copies + length()), false));
200 UnsafeAppend(num_copies, value);
201 return Status::OK();
202 }
203
204 void UnsafeAppend(T value) {
205 bytes_builder_.UnsafeAppend(reinterpret_cast<uint8_t*>(&value), sizeof(T));
206 }
207
208 void UnsafeAppend(const T* values, int64_t num_elements) {
209 bytes_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values),
210 num_elements * sizeof(T));
211 }
212
213 void UnsafeAppend(const int64_t num_copies, T value) {
214 auto data = mutable_data() + length();
215 bytes_builder_.UnsafeAppend(num_copies * sizeof(T), 0);
216 for (const auto end = data + num_copies; data != end; ++data) {
217 *data = value;
218 }
219 }
220
221 Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
222 return bytes_builder_.Resize(new_capacity * sizeof(T), shrink_to_fit);
223 }
224
225 Status Reserve(const int64_t additional_elements) {
226 return bytes_builder_.Reserve(additional_elements * sizeof(T));
227 }
228
229 Status Advance(const int64_t length) {
230 return bytes_builder_.Advance(length * sizeof(T));
231 }
232
233 Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
234 return bytes_builder_.Finish(out, shrink_to_fit);
235 }
236
237 void Reset() { bytes_builder_.Reset(); }
238
239 int64_t length() const { return bytes_builder_.length() / sizeof(T); }
240 int64_t capacity() const { return bytes_builder_.capacity() / sizeof(T); }
241 const T* data() const { return reinterpret_cast<const T*>(bytes_builder_.data()); }
242 T* mutable_data() { return reinterpret_cast<T*>(bytes_builder_.mutable_data()); }
243
244 private:
245 BufferBuilder bytes_builder_;
246};
247
248/// \brief A BufferBuilder for building a buffer containing a bitmap
249template <>
250class TypedBufferBuilder<bool> {
251 public:
252 explicit TypedBufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
253 : bytes_builder_(pool) {}
254
255 Status Append(bool value) {
256 ARROW_RETURN_NOT_OK(ResizeWithGrowthFactor(bit_length_ + 1));
257 UnsafeAppend(value);
258 return Status::OK();
259 }
260
261 Status Append(const uint8_t* valid_bytes, int64_t num_elements) {
262 ARROW_RETURN_NOT_OK(ResizeWithGrowthFactor(bit_length_ + num_elements));
263 UnsafeAppend(valid_bytes, num_elements);
264 return Status::OK();
265 }
266
267 Status Append(const int64_t num_copies, bool value) {
268 ARROW_RETURN_NOT_OK(ResizeWithGrowthFactor(bit_length_ + num_copies));
269 UnsafeAppend(num_copies, value);
270 return Status::OK();
271 }
272
273 void UnsafeAppend(bool value) {
274 BitUtil::SetBitTo(mutable_data(), bit_length_, value);
275 if (!value) {
276 ++false_count_;
277 }
278 ++bit_length_;
279 }
280
281 void UnsafeAppend(const uint8_t* bytes, int64_t num_elements) {
282 if (num_elements == 0) return;
283 int64_t i = 0;
284 internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] {
285 bool value = bytes[i++];
286 if (!value) ++false_count_;
287 return value;
288 });
289 bit_length_ += num_elements;
290 }
291
292 void UnsafeAppend(const int64_t num_copies, bool value) {
293 BitUtil::SetBitsTo(mutable_data(), bit_length_, num_copies, value);
294 if (!value) {
295 false_count_ += num_copies;
296 }
297 bit_length_ += num_copies;
298 }
299
300 Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
301 const int64_t old_byte_capacity = bytes_builder_.capacity();
302 const int64_t new_byte_capacity = BitUtil::BytesForBits(new_capacity);
303 ARROW_RETURN_NOT_OK(bytes_builder_.Resize(new_byte_capacity, shrink_to_fit));
304 if (new_byte_capacity > old_byte_capacity) {
305 memset(mutable_data() + old_byte_capacity, 0,
306 static_cast<size_t>(new_byte_capacity - old_byte_capacity));
307 }
308 return Status::OK();
309 }
310
311 Status Reserve(const int64_t additional_elements) {
312 return Resize(bit_length_ + additional_elements, false);
313 }
314
315 Status Advance(const int64_t length) {
316 bit_length_ += length;
317 false_count_ += length;
318 return ResizeWithGrowthFactor(bit_length_);
319 }
320
321 Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
322 bit_length_ = false_count_ = 0;
323 return bytes_builder_.Finish(out, shrink_to_fit);
324 }
325
326 void Reset() {
327 bytes_builder_.Reset();
328 bit_length_ = false_count_ = 0;
329 }
330
331 int64_t length() const { return bit_length_; }
332 int64_t capacity() const { return bytes_builder_.capacity() * 8; }
333 const uint8_t* data() const { return bytes_builder_.data(); }
334 uint8_t* mutable_data() { return bytes_builder_.mutable_data(); }
335 int64_t false_count() const { return false_count_; }
336
337 private:
338 Status ResizeWithGrowthFactor(const int64_t min_capacity) {
339 return Resize(BufferBuilder::GrowByFactor(min_capacity), false);
340 }
341 BufferBuilder bytes_builder_;
342 int64_t bit_length_ = 0;
343 int64_t false_count_ = 0;
344};

}  // namespace arrow

#endif  // ARROW_BUFFER_BUILDER_H