1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include "arrow/array/builder_binary.h"
19
20#include <algorithm>
21#include <cstddef>
22#include <cstdint>
23#include <cstring>
24#include <numeric>
25#include <sstream>
26#include <string>
27#include <utility>
28#include <vector>
29
30#include "arrow/array.h"
31#include "arrow/buffer.h"
32#include "arrow/status.h"
33#include "arrow/type.h"
34#include "arrow/type_traits.h"
35#include "arrow/util/bit-util.h"
36#include "arrow/util/checked_cast.h"
37#include "arrow/util/decimal.h"
38#include "arrow/util/logging.h"
39
40namespace arrow {
41
42using internal::checked_cast;
43
44// ----------------------------------------------------------------------
45// String and binary
46
47BinaryBuilder::BinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
48 : ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool) {}
49
50BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(binary(), pool) {}
51
52Status BinaryBuilder::Resize(int64_t capacity) {
53 DCHECK_LE(capacity, kListMaximumElements);
54 RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
55
56 // one more then requested for offsets
57 RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
58 return ArrayBuilder::Resize(capacity);
59}
60
61Status BinaryBuilder::ReserveData(int64_t elements) {
62 const int64_t size = value_data_length() + elements;
63 ARROW_RETURN_IF(
64 size > kBinaryMemoryLimit,
65 Status::CapacityError("Cannot reserve capacity larger than 2^31 - 1 for binary"));
66
67 return (size > value_data_capacity()) ? value_data_builder_.Reserve(elements)
68 : Status::OK();
69}
70
71Status BinaryBuilder::AppendOverflow(int64_t num_bytes) {
72 return Status::CapacityError("BinaryArray cannot contain more than ",
73 kBinaryMemoryLimit, " bytes, have ", num_bytes);
74}
75
76Status BinaryBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
77 // Write final offset (values length)
78 RETURN_NOT_OK(AppendNextOffset());
79
80 // These buffers' padding zeroed by BufferBuilder
81 std::shared_ptr<Buffer> offsets, value_data, null_bitmap;
82 RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
83 RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
84 RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
85
86 *out =
87 ArrayData::Make(type_, length_, {null_bitmap, offsets, value_data}, null_count_, 0);
88 Reset();
89 return Status::OK();
90}
91
92void BinaryBuilder::Reset() {
93 ArrayBuilder::Reset();
94 offsets_builder_.Reset();
95 value_data_builder_.Reset();
96}
97
98const uint8_t* BinaryBuilder::GetValue(int64_t i, int32_t* out_length) const {
99 const int32_t* offsets = offsets_builder_.data();
100 int32_t offset = offsets[i];
101 if (i == (length_ - 1)) {
102 *out_length = static_cast<int32_t>(value_data_builder_.length()) - offset;
103 } else {
104 *out_length = offsets[i + 1] - offset;
105 }
106 return value_data_builder_.data() + offset;
107}
108
109util::string_view BinaryBuilder::GetView(int64_t i) const {
110 const int32_t* offsets = offsets_builder_.data();
111 int32_t offset = offsets[i];
112 int32_t value_length;
113 if (i == (length_ - 1)) {
114 value_length = static_cast<int32_t>(value_data_builder_.length()) - offset;
115 } else {
116 value_length = offsets[i + 1] - offset;
117 }
118 return util::string_view(
119 reinterpret_cast<const char*>(value_data_builder_.data() + offset), value_length);
120}
121
122StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(utf8(), pool) {}
123
124Status StringBuilder::AppendValues(const std::vector<std::string>& values,
125 const uint8_t* valid_bytes) {
126 std::size_t total_length = std::accumulate(
127 values.begin(), values.end(), 0ULL,
128 [](uint64_t sum, const std::string& str) { return sum + str.size(); });
129 RETURN_NOT_OK(Reserve(values.size()));
130 RETURN_NOT_OK(value_data_builder_.Reserve(total_length));
131 RETURN_NOT_OK(offsets_builder_.Reserve(values.size()));
132
133 if (valid_bytes) {
134 for (std::size_t i = 0; i < values.size(); ++i) {
135 UnsafeAppendNextOffset();
136 if (valid_bytes[i]) {
137 value_data_builder_.UnsafeAppend(
138 reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
139 }
140 }
141 } else {
142 for (std::size_t i = 0; i < values.size(); ++i) {
143 UnsafeAppendNextOffset();
144 value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i].data()),
145 values[i].size());
146 }
147 }
148
149 UnsafeAppendToBitmap(valid_bytes, values.size());
150 return Status::OK();
151}
152
153Status StringBuilder::AppendValues(const char** values, int64_t length,
154 const uint8_t* valid_bytes) {
155 std::size_t total_length = 0;
156 std::vector<std::size_t> value_lengths(length);
157 bool have_null_value = false;
158 for (int64_t i = 0; i < length; ++i) {
159 if (values[i]) {
160 auto value_length = strlen(values[i]);
161 value_lengths[i] = value_length;
162 total_length += value_length;
163 } else {
164 have_null_value = true;
165 }
166 }
167 RETURN_NOT_OK(Reserve(length));
168 RETURN_NOT_OK(value_data_builder_.Reserve(total_length));
169 RETURN_NOT_OK(offsets_builder_.Reserve(length));
170
171 if (valid_bytes) {
172 int64_t valid_bytes_offset = 0;
173 for (int64_t i = 0; i < length; ++i) {
174 UnsafeAppendNextOffset();
175 if (valid_bytes[i]) {
176 if (values[i]) {
177 value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
178 value_lengths[i]);
179 } else {
180 UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, i - valid_bytes_offset);
181 UnsafeAppendToBitmap(false);
182 valid_bytes_offset = i + 1;
183 }
184 }
185 }
186 UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset);
187 } else {
188 if (have_null_value) {
189 std::vector<uint8_t> valid_vector(length, 0);
190 for (int64_t i = 0; i < length; ++i) {
191 UnsafeAppendNextOffset();
192 if (values[i]) {
193 value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
194 value_lengths[i]);
195 valid_vector[i] = 1;
196 }
197 }
198 UnsafeAppendToBitmap(valid_vector.data(), length);
199 } else {
200 for (int64_t i = 0; i < length; ++i) {
201 UnsafeAppendNextOffset();
202 value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
203 value_lengths[i]);
204 }
205 UnsafeAppendToBitmap(nullptr, length);
206 }
207 }
208 return Status::OK();
209}
210
211// ----------------------------------------------------------------------
212// Fixed width binary
213
214FixedSizeBinaryBuilder::FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
215 MemoryPool* pool)
216 : ArrayBuilder(type, pool),
217 byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()),
218 byte_builder_(pool) {}
219
220#ifndef NDEBUG
221void FixedSizeBinaryBuilder::CheckValueSize(int64_t size) {
222 DCHECK_EQ(size, byte_width_) << "Appending wrong size to FixedSizeBinaryBuilder";
223}
224#endif
225
226Status FixedSizeBinaryBuilder::AppendValues(const uint8_t* data, int64_t length,
227 const uint8_t* valid_bytes) {
228 RETURN_NOT_OK(Reserve(length));
229 UnsafeAppendToBitmap(valid_bytes, length);
230 return byte_builder_.Append(data, length * byte_width_);
231}
232
233Status FixedSizeBinaryBuilder::AppendNull() {
234 RETURN_NOT_OK(Reserve(1));
235 UnsafeAppendToBitmap(false);
236 return byte_builder_.Advance(byte_width_);
237}
238
239void FixedSizeBinaryBuilder::Reset() {
240 ArrayBuilder::Reset();
241 byte_builder_.Reset();
242}
243
244Status FixedSizeBinaryBuilder::Resize(int64_t capacity) {
245 RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
246 RETURN_NOT_OK(byte_builder_.Resize(capacity * byte_width_));
247 return ArrayBuilder::Resize(capacity);
248}
249
250Status FixedSizeBinaryBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
251 std::shared_ptr<Buffer> data;
252 RETURN_NOT_OK(byte_builder_.Finish(&data));
253
254 std::shared_ptr<Buffer> null_bitmap;
255 RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
256 *out = ArrayData::Make(type_, length_, {null_bitmap, data}, null_count_);
257
258 capacity_ = length_ = null_count_ = 0;
259 return Status::OK();
260}
261
262const uint8_t* FixedSizeBinaryBuilder::GetValue(int64_t i) const {
263 const uint8_t* data_ptr = byte_builder_.data();
264 return data_ptr + i * byte_width_;
265}
266
267util::string_view FixedSizeBinaryBuilder::GetView(int64_t i) const {
268 const uint8_t* data_ptr = byte_builder_.data();
269 return util::string_view(reinterpret_cast<const char*>(data_ptr + i * byte_width_),
270 byte_width_);
271}
272
273// ----------------------------------------------------------------------
274// ChunkedArray builders
275
276namespace internal {
277
278ChunkedBinaryBuilder::ChunkedBinaryBuilder(int32_t max_chunk_size, MemoryPool* pool)
279 : max_chunk_size_(max_chunk_size),
280 chunk_data_size_(0),
281 builder_(new BinaryBuilder(pool)) {}
282
283Status ChunkedBinaryBuilder::Finish(ArrayVector* out) {
284 if (builder_->length() > 0 || chunks_.size() == 0) {
285 std::shared_ptr<Array> chunk;
286 RETURN_NOT_OK(builder_->Finish(&chunk));
287 chunks_.emplace_back(std::move(chunk));
288 }
289 *out = std::move(chunks_);
290 return Status::OK();
291}
292
293Status ChunkedBinaryBuilder::NextChunk() {
294 std::shared_ptr<Array> chunk;
295 RETURN_NOT_OK(builder_->Finish(&chunk));
296 chunks_.emplace_back(std::move(chunk));
297
298 chunk_data_size_ = 0;
299 return Status::OK();
300}
301
302Status ChunkedStringBuilder::Finish(ArrayVector* out) {
303 RETURN_NOT_OK(ChunkedBinaryBuilder::Finish(out));
304
305 // Change data type to string/utf8
306 for (size_t i = 0; i < out->size(); ++i) {
307 std::shared_ptr<ArrayData> data = (*out)[i]->data();
308 data->type = ::arrow::utf8();
309 (*out)[i] = std::make_shared<StringArray>(data);
310 }
311 return Status::OK();
312}
313
314} // namespace internal
315
316} // namespace arrow
317