1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef ARROW_BUFFER_BUILDER_H |
19 | #define ARROW_BUFFER_BUILDER_H |
20 | |
21 | #include <algorithm> |
22 | #include <array> |
23 | #include <cstdint> |
24 | #include <cstring> |
25 | #include <memory> |
26 | #include <string> |
27 | |
28 | #include "arrow/buffer.h" |
29 | #include "arrow/status.h" |
30 | #include "arrow/util/bit-util.h" |
31 | #include "arrow/util/macros.h" |
32 | #include "arrow/util/visibility.h" |
33 | |
34 | namespace arrow { |
35 | |
36 | // ---------------------------------------------------------------------- |
37 | // Buffer builder classes |
38 | |
39 | /// \class BufferBuilder |
40 | /// \brief A class for incrementally building a contiguous chunk of in-memory |
41 | /// data |
42 | class ARROW_EXPORT BufferBuilder { |
43 | public: |
44 | explicit BufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) |
45 | : pool_(pool), data_(NULLPTR), capacity_(0), size_(0) {} |
46 | |
47 | /// \brief Resize the buffer to the nearest multiple of 64 bytes |
48 | /// |
49 | /// \param new_capacity the new capacity of the of the builder. Will be |
50 | /// rounded up to a multiple of 64 bytes for padding \param shrink_to_fit if |
51 | /// new capacity is smaller than the existing size, reallocate internal |
52 | /// buffer. Set to false to avoid reallocations when shrinking the builder. |
53 | /// \return Status |
54 | Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) { |
55 | // Resize(0) is a no-op |
56 | if (new_capacity == 0) { |
57 | return Status::OK(); |
58 | } |
59 | int64_t old_capacity = capacity_; |
60 | |
61 | if (buffer_ == NULLPTR) { |
62 | ARROW_RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_capacity, &buffer_)); |
63 | } else { |
64 | ARROW_RETURN_NOT_OK(buffer_->Resize(new_capacity, shrink_to_fit)); |
65 | } |
66 | capacity_ = buffer_->capacity(); |
67 | data_ = buffer_->mutable_data(); |
68 | if (capacity_ > old_capacity) { |
69 | memset(data_ + old_capacity, 0, capacity_ - old_capacity); |
70 | } |
71 | return Status::OK(); |
72 | } |
73 | |
74 | /// \brief Ensure that builder can accommodate the additional number of bytes |
75 | /// without the need to perform allocations |
76 | /// |
77 | /// \param[in] additional_bytes number of additional bytes to make space for |
78 | /// \param[in] grow_by_factor if true, round up allocations using the |
79 | /// strategy in BufferBuilder::GrowByFactor |
80 | /// \return Status |
81 | Status Reserve(const int64_t additional_bytes, bool grow_by_factor = false) { |
82 | auto min_capacity = size_ + additional_bytes; |
83 | if (min_capacity <= capacity_) return Status::OK(); |
84 | if (grow_by_factor) { |
85 | min_capacity = GrowByFactor(min_capacity); |
86 | } |
87 | return Resize(min_capacity, false); |
88 | } |
89 | |
90 | /// \brief Return a capacity expanded by a growth factor of 2 |
91 | static int64_t GrowByFactor(const int64_t min_capacity) { |
92 | // If the capacity was not already a multiple of 2, do so here |
93 | // TODO(emkornfield) doubling isn't great default allocation practice |
94 | // see https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md |
95 | // for discussion |
96 | return BitUtil::NextPower2(min_capacity); |
97 | } |
98 | |
99 | /// \brief Append the given data to the buffer |
100 | /// |
101 | /// The buffer is automatically expanded if necessary. |
102 | Status Append(const void* data, const int64_t length) { |
103 | if (ARROW_PREDICT_FALSE(size_ + length > capacity_)) { |
104 | ARROW_RETURN_NOT_OK(Resize(GrowByFactor(size_ + length), false)); |
105 | } |
106 | UnsafeAppend(data, length); |
107 | return Status::OK(); |
108 | } |
109 | |
110 | /// \brief Append copies of a value to the buffer |
111 | /// |
112 | /// The buffer is automatically expanded if necessary. |
113 | Status Append(const int64_t num_copies, uint8_t value) { |
114 | ARROW_RETURN_NOT_OK(Reserve(num_copies, true)); |
115 | UnsafeAppend(num_copies, value); |
116 | return Status::OK(); |
117 | } |
118 | |
119 | /// \brief Append the given data to the buffer |
120 | /// |
121 | /// The buffer is automatically expanded if necessary. |
122 | template <size_t NBYTES> |
123 | Status Append(const std::array<uint8_t, NBYTES>& data) { |
124 | constexpr auto nbytes = static_cast<int64_t>(NBYTES); |
125 | ARROW_RETURN_NOT_OK(Reserve(NBYTES, true)); |
126 | std::copy(data.cbegin(), data.cend(), data_ + size_); |
127 | size_ += nbytes; |
128 | return Status::OK(); |
129 | } |
130 | |
131 | // Advance pointer and zero out memory |
132 | Status Advance(const int64_t length) { return Append(length, 0); } |
133 | |
134 | // Unsafe methods don't check existing size |
135 | void UnsafeAppend(const void* data, const int64_t length) { |
136 | memcpy(data_ + size_, data, static_cast<size_t>(length)); |
137 | size_ += length; |
138 | } |
139 | |
140 | void UnsafeAppend(const int64_t num_copies, uint8_t value) { |
141 | memset(data_ + size_, value, static_cast<size_t>(num_copies)); |
142 | size_ += num_copies; |
143 | } |
144 | |
145 | /// \brief Return result of builder as a Buffer object. |
146 | /// |
147 | /// The builder is reset and can be reused afterwards. |
148 | /// |
149 | /// \param[out] out the finalized Buffer object |
150 | /// \param shrink_to_fit if the buffer size is smaller than its capacity, |
151 | /// reallocate to fit more tightly in memory. Set to false to avoid |
152 | /// a reallocation, at the expense of potentially more memory consumption. |
153 | /// \return Status |
154 | Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) { |
155 | ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit)); |
156 | *out = buffer_; |
157 | Reset(); |
158 | return Status::OK(); |
159 | } |
160 | |
161 | void Reset() { |
162 | buffer_ = NULLPTR; |
163 | capacity_ = size_ = 0; |
164 | } |
165 | |
166 | int64_t capacity() const { return capacity_; } |
167 | int64_t length() const { return size_; } |
168 | const uint8_t* data() const { return data_; } |
169 | uint8_t* mutable_data() { return data_; } |
170 | |
171 | private: |
172 | std::shared_ptr<ResizableBuffer> buffer_; |
173 | MemoryPool* pool_; |
174 | uint8_t* data_; |
175 | int64_t capacity_; |
176 | int64_t size_; |
177 | }; |
178 | |
179 | template <typename T, typename Enable = void> |
180 | class TypedBufferBuilder; |
181 | |
182 | /// \brief A BufferBuilder for building a buffer of arithmetic elements |
183 | template <typename T> |
184 | class TypedBufferBuilder<T, typename std::enable_if<std::is_arithmetic<T>::value>::type> { |
185 | public: |
186 | explicit TypedBufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) |
187 | : bytes_builder_(pool) {} |
188 | |
189 | Status Append(T value) { |
190 | return bytes_builder_.Append(reinterpret_cast<uint8_t*>(&value), sizeof(T)); |
191 | } |
192 | |
193 | Status Append(const T* values, int64_t num_elements) { |
194 | return bytes_builder_.Append(reinterpret_cast<const uint8_t*>(values), |
195 | num_elements * sizeof(T)); |
196 | } |
197 | |
198 | Status Append(const int64_t num_copies, T value) { |
199 | ARROW_RETURN_NOT_OK(Resize(GrowByFactor(num_copies + length()), false)); |
200 | UnsafeAppend(num_copies, value); |
201 | return Status::OK(); |
202 | } |
203 | |
204 | void UnsafeAppend(T value) { |
205 | bytes_builder_.UnsafeAppend(reinterpret_cast<uint8_t*>(&value), sizeof(T)); |
206 | } |
207 | |
208 | void UnsafeAppend(const T* values, int64_t num_elements) { |
209 | bytes_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values), |
210 | num_elements * sizeof(T)); |
211 | } |
212 | |
213 | void UnsafeAppend(const int64_t num_copies, T value) { |
214 | auto data = mutable_data() + length(); |
215 | bytes_builder_.UnsafeAppend(num_copies * sizeof(T), 0); |
216 | for (const auto end = data + num_copies; data != end; ++data) { |
217 | *data = value; |
218 | } |
219 | } |
220 | |
221 | Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) { |
222 | return bytes_builder_.Resize(new_capacity * sizeof(T), shrink_to_fit); |
223 | } |
224 | |
225 | Status Reserve(const int64_t additional_elements) { |
226 | return bytes_builder_.Reserve(additional_elements * sizeof(T)); |
227 | } |
228 | |
229 | Status Advance(const int64_t length) { |
230 | return bytes_builder_.Advance(length * sizeof(T)); |
231 | } |
232 | |
233 | Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) { |
234 | return bytes_builder_.Finish(out, shrink_to_fit); |
235 | } |
236 | |
237 | void Reset() { bytes_builder_.Reset(); } |
238 | |
239 | int64_t length() const { return bytes_builder_.length() / sizeof(T); } |
240 | int64_t capacity() const { return bytes_builder_.capacity() / sizeof(T); } |
241 | const T* data() const { return reinterpret_cast<const T*>(bytes_builder_.data()); } |
242 | T* mutable_data() { return reinterpret_cast<T*>(bytes_builder_.mutable_data()); } |
243 | |
244 | private: |
245 | BufferBuilder bytes_builder_; |
246 | }; |
247 | |
248 | /// \brief A BufferBuilder for building a buffer containing a bitmap |
249 | template <> |
250 | class TypedBufferBuilder<bool> { |
251 | public: |
252 | explicit TypedBufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) |
253 | : bytes_builder_(pool) {} |
254 | |
255 | Status Append(bool value) { |
256 | ARROW_RETURN_NOT_OK(ResizeWithGrowthFactor(bit_length_ + 1)); |
257 | UnsafeAppend(value); |
258 | return Status::OK(); |
259 | } |
260 | |
261 | Status Append(const uint8_t* valid_bytes, int64_t num_elements) { |
262 | ARROW_RETURN_NOT_OK(ResizeWithGrowthFactor(bit_length_ + num_elements)); |
263 | UnsafeAppend(valid_bytes, num_elements); |
264 | return Status::OK(); |
265 | } |
266 | |
267 | Status Append(const int64_t num_copies, bool value) { |
268 | ARROW_RETURN_NOT_OK(ResizeWithGrowthFactor(bit_length_ + num_copies)); |
269 | UnsafeAppend(num_copies, value); |
270 | return Status::OK(); |
271 | } |
272 | |
273 | void UnsafeAppend(bool value) { |
274 | BitUtil::SetBitTo(mutable_data(), bit_length_, value); |
275 | if (!value) { |
276 | ++false_count_; |
277 | } |
278 | ++bit_length_; |
279 | } |
280 | |
281 | void UnsafeAppend(const uint8_t* bytes, int64_t num_elements) { |
282 | if (num_elements == 0) return; |
283 | int64_t i = 0; |
284 | internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] { |
285 | bool value = bytes[i++]; |
286 | if (!value) ++false_count_; |
287 | return value; |
288 | }); |
289 | bit_length_ += num_elements; |
290 | } |
291 | |
292 | void UnsafeAppend(const int64_t num_copies, bool value) { |
293 | BitUtil::SetBitsTo(mutable_data(), bit_length_, num_copies, value); |
294 | if (!value) { |
295 | false_count_ += num_copies; |
296 | } |
297 | bit_length_ += num_copies; |
298 | } |
299 | |
300 | Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) { |
301 | const int64_t old_byte_capacity = bytes_builder_.capacity(); |
302 | const int64_t new_byte_capacity = BitUtil::BytesForBits(new_capacity); |
303 | ARROW_RETURN_NOT_OK(bytes_builder_.Resize(new_byte_capacity, shrink_to_fit)); |
304 | if (new_byte_capacity > old_byte_capacity) { |
305 | memset(mutable_data() + old_byte_capacity, 0, |
306 | static_cast<size_t>(new_byte_capacity - old_byte_capacity)); |
307 | } |
308 | return Status::OK(); |
309 | } |
310 | |
311 | Status Reserve(const int64_t additional_elements) { |
312 | return Resize(bit_length_ + additional_elements, false); |
313 | } |
314 | |
315 | Status Advance(const int64_t length) { |
316 | bit_length_ += length; |
317 | false_count_ += length; |
318 | return ResizeWithGrowthFactor(bit_length_); |
319 | } |
320 | |
321 | Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) { |
322 | bit_length_ = false_count_ = 0; |
323 | return bytes_builder_.Finish(out, shrink_to_fit); |
324 | } |
325 | |
326 | void Reset() { |
327 | bytes_builder_.Reset(); |
328 | bit_length_ = false_count_ = 0; |
329 | } |
330 | |
331 | int64_t length() const { return bit_length_; } |
332 | int64_t capacity() const { return bytes_builder_.capacity() * 8; } |
333 | const uint8_t* data() const { return bytes_builder_.data(); } |
334 | uint8_t* mutable_data() { return bytes_builder_.mutable_data(); } |
335 | int64_t false_count() const { return false_count_; } |
336 | |
337 | private: |
338 | Status ResizeWithGrowthFactor(const int64_t min_capacity) { |
339 | return Resize(BufferBuilder::GrowByFactor(min_capacity), false); |
340 | } |
341 | BufferBuilder bytes_builder_; |
342 | int64_t bit_length_ = 0; |
343 | int64_t false_count_ = 0; |
344 | }; |
345 | |
346 | } // namespace arrow |
347 | |
348 | #endif // ARROW_BUFFER_BUILDER_H |
349 | |