1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include "arrow/array/builder_binary.h" |
19 | |
20 | #include <algorithm> |
21 | #include <cstddef> |
22 | #include <cstdint> |
23 | #include <cstring> |
24 | #include <numeric> |
25 | #include <sstream> |
26 | #include <string> |
27 | #include <utility> |
28 | #include <vector> |
29 | |
30 | #include "arrow/array.h" |
31 | #include "arrow/buffer.h" |
32 | #include "arrow/status.h" |
33 | #include "arrow/type.h" |
34 | #include "arrow/type_traits.h" |
35 | #include "arrow/util/bit-util.h" |
36 | #include "arrow/util/checked_cast.h" |
37 | #include "arrow/util/decimal.h" |
38 | #include "arrow/util/logging.h" |
39 | |
40 | namespace arrow { |
41 | |
42 | using internal::checked_cast; |
43 | |
44 | // ---------------------------------------------------------------------- |
45 | // String and binary |
46 | |
47 | BinaryBuilder::BinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool) |
48 | : ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool) {} |
49 | |
50 | BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(binary(), pool) {} |
51 | |
52 | Status BinaryBuilder::Resize(int64_t capacity) { |
53 | DCHECK_LE(capacity, kListMaximumElements); |
54 | RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); |
55 | |
56 | // one more then requested for offsets |
57 | RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1)); |
58 | return ArrayBuilder::Resize(capacity); |
59 | } |
60 | |
61 | Status BinaryBuilder::ReserveData(int64_t elements) { |
62 | const int64_t size = value_data_length() + elements; |
63 | ARROW_RETURN_IF( |
64 | size > kBinaryMemoryLimit, |
65 | Status::CapacityError("Cannot reserve capacity larger than 2^31 - 1 for binary" )); |
66 | |
67 | return (size > value_data_capacity()) ? value_data_builder_.Reserve(elements) |
68 | : Status::OK(); |
69 | } |
70 | |
71 | Status BinaryBuilder::AppendOverflow(int64_t num_bytes) { |
72 | return Status::CapacityError("BinaryArray cannot contain more than " , |
73 | kBinaryMemoryLimit, " bytes, have " , num_bytes); |
74 | } |
75 | |
76 | Status BinaryBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) { |
77 | // Write final offset (values length) |
78 | RETURN_NOT_OK(AppendNextOffset()); |
79 | |
80 | // These buffers' padding zeroed by BufferBuilder |
81 | std::shared_ptr<Buffer> offsets, value_data, null_bitmap; |
82 | RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); |
83 | RETURN_NOT_OK(value_data_builder_.Finish(&value_data)); |
84 | RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); |
85 | |
86 | *out = |
87 | ArrayData::Make(type_, length_, {null_bitmap, offsets, value_data}, null_count_, 0); |
88 | Reset(); |
89 | return Status::OK(); |
90 | } |
91 | |
92 | void BinaryBuilder::Reset() { |
93 | ArrayBuilder::Reset(); |
94 | offsets_builder_.Reset(); |
95 | value_data_builder_.Reset(); |
96 | } |
97 | |
98 | const uint8_t* BinaryBuilder::GetValue(int64_t i, int32_t* out_length) const { |
99 | const int32_t* offsets = offsets_builder_.data(); |
100 | int32_t offset = offsets[i]; |
101 | if (i == (length_ - 1)) { |
102 | *out_length = static_cast<int32_t>(value_data_builder_.length()) - offset; |
103 | } else { |
104 | *out_length = offsets[i + 1] - offset; |
105 | } |
106 | return value_data_builder_.data() + offset; |
107 | } |
108 | |
109 | util::string_view BinaryBuilder::GetView(int64_t i) const { |
110 | const int32_t* offsets = offsets_builder_.data(); |
111 | int32_t offset = offsets[i]; |
112 | int32_t value_length; |
113 | if (i == (length_ - 1)) { |
114 | value_length = static_cast<int32_t>(value_data_builder_.length()) - offset; |
115 | } else { |
116 | value_length = offsets[i + 1] - offset; |
117 | } |
118 | return util::string_view( |
119 | reinterpret_cast<const char*>(value_data_builder_.data() + offset), value_length); |
120 | } |
121 | |
122 | StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(utf8(), pool) {} |
123 | |
124 | Status StringBuilder::AppendValues(const std::vector<std::string>& values, |
125 | const uint8_t* valid_bytes) { |
126 | std::size_t total_length = std::accumulate( |
127 | values.begin(), values.end(), 0ULL, |
128 | [](uint64_t sum, const std::string& str) { return sum + str.size(); }); |
129 | RETURN_NOT_OK(Reserve(values.size())); |
130 | RETURN_NOT_OK(value_data_builder_.Reserve(total_length)); |
131 | RETURN_NOT_OK(offsets_builder_.Reserve(values.size())); |
132 | |
133 | if (valid_bytes) { |
134 | for (std::size_t i = 0; i < values.size(); ++i) { |
135 | UnsafeAppendNextOffset(); |
136 | if (valid_bytes[i]) { |
137 | value_data_builder_.UnsafeAppend( |
138 | reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size()); |
139 | } |
140 | } |
141 | } else { |
142 | for (std::size_t i = 0; i < values.size(); ++i) { |
143 | UnsafeAppendNextOffset(); |
144 | value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i].data()), |
145 | values[i].size()); |
146 | } |
147 | } |
148 | |
149 | UnsafeAppendToBitmap(valid_bytes, values.size()); |
150 | return Status::OK(); |
151 | } |
152 | |
153 | Status StringBuilder::AppendValues(const char** values, int64_t length, |
154 | const uint8_t* valid_bytes) { |
155 | std::size_t total_length = 0; |
156 | std::vector<std::size_t> value_lengths(length); |
157 | bool have_null_value = false; |
158 | for (int64_t i = 0; i < length; ++i) { |
159 | if (values[i]) { |
160 | auto value_length = strlen(values[i]); |
161 | value_lengths[i] = value_length; |
162 | total_length += value_length; |
163 | } else { |
164 | have_null_value = true; |
165 | } |
166 | } |
167 | RETURN_NOT_OK(Reserve(length)); |
168 | RETURN_NOT_OK(value_data_builder_.Reserve(total_length)); |
169 | RETURN_NOT_OK(offsets_builder_.Reserve(length)); |
170 | |
171 | if (valid_bytes) { |
172 | int64_t valid_bytes_offset = 0; |
173 | for (int64_t i = 0; i < length; ++i) { |
174 | UnsafeAppendNextOffset(); |
175 | if (valid_bytes[i]) { |
176 | if (values[i]) { |
177 | value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]), |
178 | value_lengths[i]); |
179 | } else { |
180 | UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, i - valid_bytes_offset); |
181 | UnsafeAppendToBitmap(false); |
182 | valid_bytes_offset = i + 1; |
183 | } |
184 | } |
185 | } |
186 | UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset); |
187 | } else { |
188 | if (have_null_value) { |
189 | std::vector<uint8_t> valid_vector(length, 0); |
190 | for (int64_t i = 0; i < length; ++i) { |
191 | UnsafeAppendNextOffset(); |
192 | if (values[i]) { |
193 | value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]), |
194 | value_lengths[i]); |
195 | valid_vector[i] = 1; |
196 | } |
197 | } |
198 | UnsafeAppendToBitmap(valid_vector.data(), length); |
199 | } else { |
200 | for (int64_t i = 0; i < length; ++i) { |
201 | UnsafeAppendNextOffset(); |
202 | value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]), |
203 | value_lengths[i]); |
204 | } |
205 | UnsafeAppendToBitmap(nullptr, length); |
206 | } |
207 | } |
208 | return Status::OK(); |
209 | } |
210 | |
211 | // ---------------------------------------------------------------------- |
212 | // Fixed width binary |
213 | |
214 | FixedSizeBinaryBuilder::FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type, |
215 | MemoryPool* pool) |
216 | : ArrayBuilder(type, pool), |
217 | byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()), |
218 | byte_builder_(pool) {} |
219 | |
220 | #ifndef NDEBUG |
221 | void FixedSizeBinaryBuilder::CheckValueSize(int64_t size) { |
222 | DCHECK_EQ(size, byte_width_) << "Appending wrong size to FixedSizeBinaryBuilder" ; |
223 | } |
224 | #endif |
225 | |
226 | Status FixedSizeBinaryBuilder::AppendValues(const uint8_t* data, int64_t length, |
227 | const uint8_t* valid_bytes) { |
228 | RETURN_NOT_OK(Reserve(length)); |
229 | UnsafeAppendToBitmap(valid_bytes, length); |
230 | return byte_builder_.Append(data, length * byte_width_); |
231 | } |
232 | |
233 | Status FixedSizeBinaryBuilder::AppendNull() { |
234 | RETURN_NOT_OK(Reserve(1)); |
235 | UnsafeAppendToBitmap(false); |
236 | return byte_builder_.Advance(byte_width_); |
237 | } |
238 | |
239 | void FixedSizeBinaryBuilder::Reset() { |
240 | ArrayBuilder::Reset(); |
241 | byte_builder_.Reset(); |
242 | } |
243 | |
244 | Status FixedSizeBinaryBuilder::Resize(int64_t capacity) { |
245 | RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); |
246 | RETURN_NOT_OK(byte_builder_.Resize(capacity * byte_width_)); |
247 | return ArrayBuilder::Resize(capacity); |
248 | } |
249 | |
250 | Status FixedSizeBinaryBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) { |
251 | std::shared_ptr<Buffer> data; |
252 | RETURN_NOT_OK(byte_builder_.Finish(&data)); |
253 | |
254 | std::shared_ptr<Buffer> null_bitmap; |
255 | RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); |
256 | *out = ArrayData::Make(type_, length_, {null_bitmap, data}, null_count_); |
257 | |
258 | capacity_ = length_ = null_count_ = 0; |
259 | return Status::OK(); |
260 | } |
261 | |
262 | const uint8_t* FixedSizeBinaryBuilder::GetValue(int64_t i) const { |
263 | const uint8_t* data_ptr = byte_builder_.data(); |
264 | return data_ptr + i * byte_width_; |
265 | } |
266 | |
267 | util::string_view FixedSizeBinaryBuilder::GetView(int64_t i) const { |
268 | const uint8_t* data_ptr = byte_builder_.data(); |
269 | return util::string_view(reinterpret_cast<const char*>(data_ptr + i * byte_width_), |
270 | byte_width_); |
271 | } |
272 | |
273 | // ---------------------------------------------------------------------- |
274 | // ChunkedArray builders |
275 | |
276 | namespace internal { |
277 | |
278 | ChunkedBinaryBuilder::ChunkedBinaryBuilder(int32_t max_chunk_size, MemoryPool* pool) |
279 | : max_chunk_size_(max_chunk_size), |
280 | chunk_data_size_(0), |
281 | builder_(new BinaryBuilder(pool)) {} |
282 | |
283 | Status ChunkedBinaryBuilder::Finish(ArrayVector* out) { |
284 | if (builder_->length() > 0 || chunks_.size() == 0) { |
285 | std::shared_ptr<Array> chunk; |
286 | RETURN_NOT_OK(builder_->Finish(&chunk)); |
287 | chunks_.emplace_back(std::move(chunk)); |
288 | } |
289 | *out = std::move(chunks_); |
290 | return Status::OK(); |
291 | } |
292 | |
293 | Status ChunkedBinaryBuilder::NextChunk() { |
294 | std::shared_ptr<Array> chunk; |
295 | RETURN_NOT_OK(builder_->Finish(&chunk)); |
296 | chunks_.emplace_back(std::move(chunk)); |
297 | |
298 | chunk_data_size_ = 0; |
299 | return Status::OK(); |
300 | } |
301 | |
302 | Status ChunkedStringBuilder::Finish(ArrayVector* out) { |
303 | RETURN_NOT_OK(ChunkedBinaryBuilder::Finish(out)); |
304 | |
305 | // Change data type to string/utf8 |
306 | for (size_t i = 0; i < out->size(); ++i) { |
307 | std::shared_ptr<ArrayData> data = (*out)[i]->data(); |
308 | data->type = ::arrow::utf8(); |
309 | (*out)[i] = std::make_shared<StringArray>(data); |
310 | } |
311 | return Status::OK(); |
312 | } |
313 | |
314 | } // namespace internal |
315 | |
316 | } // namespace arrow |
317 | |