1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include <cstdint> |
19 | #include <cstring> |
20 | #include <memory> |
21 | #include <string> |
22 | #include <vector> |
23 | |
24 | #include <gtest/gtest.h> |
25 | |
26 | #include "arrow/array.h" |
27 | #include "arrow/buffer.h" |
28 | #include "arrow/builder.h" |
29 | #include "arrow/memory_pool.h" |
30 | #include "arrow/status.h" |
31 | #include "arrow/test-common.h" |
32 | #include "arrow/test-util.h" |
33 | #include "arrow/type.h" |
34 | #include "arrow/type_traits.h" |
35 | #include "arrow/util/bit-util.h" |
36 | #include "arrow/util/checked_cast.h" |
37 | |
38 | namespace arrow { |
39 | |
40 | using std::string; |
41 | using std::vector; |
42 | |
43 | using internal::checked_cast; |
44 | |
45 | // ---------------------------------------------------------------------- |
46 | // String / Binary tests |
47 | |
48 | class TestStringArray : public ::testing::Test { |
49 | public: |
50 | void SetUp() { |
51 | chars_ = {'a', 'b', 'b', 'c', 'c', 'c'}; |
52 | offsets_ = {0, 1, 1, 1, 3, 6}; |
53 | valid_bytes_ = {1, 1, 0, 1, 1}; |
54 | expected_ = {"a" , "" , "" , "bb" , "ccc" }; |
55 | |
56 | MakeArray(); |
57 | } |
58 | |
59 | void MakeArray() { |
60 | length_ = static_cast<int64_t>(offsets_.size()) - 1; |
61 | value_buf_ = Buffer::Wrap(chars_); |
62 | offsets_buf_ = Buffer::Wrap(offsets_); |
63 | ASSERT_OK(BitUtil::BytesToBits(valid_bytes_, default_memory_pool(), &null_bitmap_)); |
64 | null_count_ = CountNulls(valid_bytes_); |
65 | |
66 | strings_ = std::make_shared<StringArray>(length_, offsets_buf_, value_buf_, |
67 | null_bitmap_, null_count_); |
68 | } |
69 | |
70 | protected: |
71 | vector<int32_t> offsets_; |
72 | vector<char> chars_; |
73 | vector<uint8_t> valid_bytes_; |
74 | |
75 | vector<string> expected_; |
76 | |
77 | std::shared_ptr<Buffer> value_buf_; |
78 | std::shared_ptr<Buffer> offsets_buf_; |
79 | std::shared_ptr<Buffer> null_bitmap_; |
80 | |
81 | int64_t null_count_; |
82 | int64_t length_; |
83 | |
84 | std::shared_ptr<StringArray> strings_; |
85 | }; |
86 | |
87 | TEST_F(TestStringArray, TestArrayBasics) { |
88 | ASSERT_EQ(length_, strings_->length()); |
89 | ASSERT_EQ(1, strings_->null_count()); |
90 | ASSERT_OK(ValidateArray(*strings_)); |
91 | } |
92 | |
93 | TEST_F(TestStringArray, TestType) { |
94 | std::shared_ptr<DataType> type = strings_->type(); |
95 | |
96 | ASSERT_EQ(Type::STRING, type->id()); |
97 | ASSERT_EQ(Type::STRING, strings_->type_id()); |
98 | } |
99 | |
100 | TEST_F(TestStringArray, TestListFunctions) { |
101 | int pos = 0; |
102 | for (size_t i = 0; i < expected_.size(); ++i) { |
103 | ASSERT_EQ(pos, strings_->value_offset(i)); |
104 | ASSERT_EQ(static_cast<int>(expected_[i].size()), strings_->value_length(i)); |
105 | pos += static_cast<int>(expected_[i].size()); |
106 | } |
107 | } |
108 | |
109 | TEST_F(TestStringArray, TestDestructor) { |
110 | auto arr = std::make_shared<StringArray>(length_, offsets_buf_, value_buf_, |
111 | null_bitmap_, null_count_); |
112 | } |
113 | |
114 | TEST_F(TestStringArray, TestGetString) { |
115 | for (size_t i = 0; i < expected_.size(); ++i) { |
116 | if (valid_bytes_[i] == 0) { |
117 | ASSERT_TRUE(strings_->IsNull(i)); |
118 | } else { |
119 | ASSERT_EQ(expected_[i], strings_->GetString(i)); |
120 | } |
121 | } |
122 | } |
123 | |
124 | TEST_F(TestStringArray, TestEmptyStringComparison) { |
125 | offsets_ = {0, 0, 0, 0, 0, 0}; |
126 | offsets_buf_ = Buffer::Wrap(offsets_); |
127 | length_ = static_cast<int64_t>(offsets_.size() - 1); |
128 | |
129 | auto strings_a = std::make_shared<StringArray>(length_, offsets_buf_, nullptr, |
130 | null_bitmap_, null_count_); |
131 | auto strings_b = std::make_shared<StringArray>(length_, offsets_buf_, nullptr, |
132 | null_bitmap_, null_count_); |
133 | ASSERT_TRUE(strings_a->Equals(strings_b)); |
134 | } |
135 | |
136 | TEST_F(TestStringArray, CompareNullByteSlots) { |
137 | StringBuilder builder; |
138 | StringBuilder builder2; |
139 | StringBuilder builder3; |
140 | |
141 | ASSERT_OK(builder.Append("foo" )); |
142 | ASSERT_OK(builder2.Append("foo" )); |
143 | ASSERT_OK(builder3.Append("foo" )); |
144 | |
145 | ASSERT_OK(builder.Append("bar" )); |
146 | ASSERT_OK(builder2.AppendNull()); |
147 | |
148 | // same length, but different |
149 | ASSERT_OK(builder3.Append("xyz" )); |
150 | |
151 | ASSERT_OK(builder.Append("baz" )); |
152 | ASSERT_OK(builder2.Append("baz" )); |
153 | ASSERT_OK(builder3.Append("baz" )); |
154 | |
155 | std::shared_ptr<Array> array, array2, array3; |
156 | FinishAndCheckPadding(&builder, &array); |
157 | ASSERT_OK(builder2.Finish(&array2)); |
158 | ASSERT_OK(builder3.Finish(&array3)); |
159 | |
160 | const auto& a1 = checked_cast<const StringArray&>(*array); |
161 | const auto& a2 = checked_cast<const StringArray&>(*array2); |
162 | const auto& a3 = checked_cast<const StringArray&>(*array3); |
163 | |
164 | // The validity bitmaps are the same, the data is different, but the unequal |
165 | // portion is masked out |
166 | StringArray equal_array(3, a1.value_offsets(), a1.value_data(), a2.null_bitmap(), 1); |
167 | StringArray equal_array2(3, a3.value_offsets(), a3.value_data(), a2.null_bitmap(), 1); |
168 | |
169 | ASSERT_TRUE(equal_array.Equals(equal_array2)); |
170 | ASSERT_TRUE(a2.RangeEquals(equal_array2, 0, 3, 0)); |
171 | |
172 | ASSERT_TRUE(equal_array.Array::Slice(1)->Equals(equal_array2.Array::Slice(1))); |
173 | ASSERT_TRUE( |
174 | equal_array.Array::Slice(1)->RangeEquals(0, 2, 0, equal_array2.Array::Slice(1))); |
175 | } |
176 | |
177 | TEST_F(TestStringArray, TestSliceGetString) { |
178 | StringBuilder builder; |
179 | |
180 | ASSERT_OK(builder.Append("a" )); |
181 | ASSERT_OK(builder.Append("b" )); |
182 | ASSERT_OK(builder.Append("c" )); |
183 | |
184 | std::shared_ptr<Array> array; |
185 | ASSERT_OK(builder.Finish(&array)); |
186 | auto s = array->Slice(1, 10); |
187 | auto arr = std::dynamic_pointer_cast<StringArray>(s); |
188 | ASSERT_EQ(arr->GetString(0), "b" ); |
189 | } |
190 | |
191 | // ---------------------------------------------------------------------- |
192 | // String builder tests |
193 | |
194 | class TestStringBuilder : public TestBuilder { |
195 | public: |
196 | void SetUp() { |
197 | TestBuilder::SetUp(); |
198 | builder_.reset(new StringBuilder(pool_)); |
199 | } |
200 | |
201 | void Done() { |
202 | std::shared_ptr<Array> out; |
203 | FinishAndCheckPadding(builder_.get(), &out); |
204 | |
205 | result_ = std::dynamic_pointer_cast<StringArray>(out); |
206 | ASSERT_OK(ValidateArray(*result_)); |
207 | } |
208 | |
209 | protected: |
210 | std::unique_ptr<StringBuilder> builder_; |
211 | std::shared_ptr<StringArray> result_; |
212 | }; |
213 | |
214 | TEST_F(TestStringBuilder, TestScalarAppend) { |
215 | vector<string> strings = {"" , "bb" , "a" , "" , "ccc" }; |
216 | vector<uint8_t> is_null = {0, 0, 0, 1, 0}; |
217 | |
218 | int N = static_cast<int>(strings.size()); |
219 | int reps = 1000; |
220 | |
221 | for (int j = 0; j < reps; ++j) { |
222 | for (int i = 0; i < N; ++i) { |
223 | if (is_null[i]) { |
224 | ASSERT_OK(builder_->AppendNull()); |
225 | } else { |
226 | ASSERT_OK(builder_->Append(strings[i])); |
227 | } |
228 | } |
229 | } |
230 | Done(); |
231 | |
232 | ASSERT_EQ(reps * N, result_->length()); |
233 | ASSERT_EQ(reps, result_->null_count()); |
234 | ASSERT_EQ(reps * 6, result_->value_data()->size()); |
235 | |
236 | int32_t length; |
237 | int32_t pos = 0; |
238 | for (int i = 0; i < N * reps; ++i) { |
239 | if (is_null[i % N]) { |
240 | ASSERT_TRUE(result_->IsNull(i)); |
241 | } else { |
242 | ASSERT_FALSE(result_->IsNull(i)); |
243 | result_->GetValue(i, &length); |
244 | ASSERT_EQ(pos, result_->value_offset(i)); |
245 | ASSERT_EQ(static_cast<int>(strings[i % N].size()), length); |
246 | ASSERT_EQ(strings[i % N], result_->GetString(i)); |
247 | |
248 | pos += length; |
249 | } |
250 | } |
251 | } |
252 | |
253 | TEST_F(TestStringBuilder, TestAppendVector) { |
254 | vector<string> strings = {"" , "bb" , "a" , "" , "ccc" }; |
255 | vector<uint8_t> valid_bytes = {1, 1, 1, 0, 1}; |
256 | |
257 | int N = static_cast<int>(strings.size()); |
258 | int reps = 1000; |
259 | |
260 | for (int j = 0; j < reps; ++j) { |
261 | ASSERT_OK(builder_->AppendValues(strings, valid_bytes.data())); |
262 | } |
263 | Done(); |
264 | |
265 | ASSERT_EQ(reps * N, result_->length()); |
266 | ASSERT_EQ(reps, result_->null_count()); |
267 | ASSERT_EQ(reps * 6, result_->value_data()->size()); |
268 | |
269 | int32_t length; |
270 | int32_t pos = 0; |
271 | for (int i = 0; i < N * reps; ++i) { |
272 | if (valid_bytes[i % N]) { |
273 | ASSERT_FALSE(result_->IsNull(i)); |
274 | result_->GetValue(i, &length); |
275 | ASSERT_EQ(pos, result_->value_offset(i)); |
276 | ASSERT_EQ(static_cast<int>(strings[i % N].size()), length); |
277 | ASSERT_EQ(strings[i % N], result_->GetString(i)); |
278 | |
279 | pos += length; |
280 | } else { |
281 | ASSERT_TRUE(result_->IsNull(i)); |
282 | } |
283 | } |
284 | } |
285 | |
286 | TEST_F(TestStringBuilder, TestAppendCStringsWithValidBytes) { |
287 | const char* strings[] = {nullptr, "aaa" , nullptr, "ignored" , "" }; |
288 | vector<uint8_t> valid_bytes = {1, 1, 1, 0, 1}; |
289 | |
290 | int N = static_cast<int>(sizeof(strings) / sizeof(strings[0])); |
291 | int reps = 1000; |
292 | |
293 | for (int j = 0; j < reps; ++j) { |
294 | ASSERT_OK(builder_->AppendValues(strings, N, valid_bytes.data())); |
295 | } |
296 | Done(); |
297 | |
298 | ASSERT_EQ(reps * N, result_->length()); |
299 | ASSERT_EQ(reps * 3, result_->null_count()); |
300 | ASSERT_EQ(reps * 3, result_->value_data()->size()); |
301 | |
302 | int32_t length; |
303 | int32_t pos = 0; |
304 | for (int i = 0; i < N * reps; ++i) { |
305 | auto string = strings[i % N]; |
306 | if (string && valid_bytes[i % N]) { |
307 | ASSERT_FALSE(result_->IsNull(i)); |
308 | result_->GetValue(i, &length); |
309 | ASSERT_EQ(pos, result_->value_offset(i)); |
310 | ASSERT_EQ(static_cast<int32_t>(strlen(string)), length); |
311 | ASSERT_EQ(strings[i % N], result_->GetString(i)); |
312 | |
313 | pos += length; |
314 | } else { |
315 | ASSERT_TRUE(result_->IsNull(i)); |
316 | } |
317 | } |
318 | } |
319 | |
320 | TEST_F(TestStringBuilder, TestAppendCStringsWithoutValidBytes) { |
321 | const char* strings[] = {"" , "bb" , "a" , nullptr, "ccc" }; |
322 | |
323 | int N = static_cast<int>(sizeof(strings) / sizeof(strings[0])); |
324 | int reps = 1000; |
325 | |
326 | for (int j = 0; j < reps; ++j) { |
327 | ASSERT_OK(builder_->AppendValues(strings, N)); |
328 | } |
329 | Done(); |
330 | |
331 | ASSERT_EQ(reps * N, result_->length()); |
332 | ASSERT_EQ(reps, result_->null_count()); |
333 | ASSERT_EQ(reps * 6, result_->value_data()->size()); |
334 | |
335 | int32_t length; |
336 | int32_t pos = 0; |
337 | for (int i = 0; i < N * reps; ++i) { |
338 | if (strings[i % N]) { |
339 | ASSERT_FALSE(result_->IsNull(i)); |
340 | result_->GetValue(i, &length); |
341 | ASSERT_EQ(pos, result_->value_offset(i)); |
342 | ASSERT_EQ(static_cast<int32_t>(strlen(strings[i % N])), length); |
343 | ASSERT_EQ(strings[i % N], result_->GetString(i)); |
344 | |
345 | pos += length; |
346 | } else { |
347 | ASSERT_TRUE(result_->IsNull(i)); |
348 | } |
349 | } |
350 | } |
351 | |
352 | TEST_F(TestStringBuilder, TestZeroLength) { |
353 | // All buffers are null |
354 | Done(); |
355 | } |
356 | |
357 | // Binary container type |
// TODO(emkornfield) there should be some way to refactor these tests to avoid
// duplicating code with String
360 | class TestBinaryArray : public ::testing::Test { |
361 | public: |
362 | void SetUp() { |
363 | chars_ = {'a', 'b', 'b', 'c', 'c', 'c'}; |
364 | offsets_ = {0, 1, 1, 1, 3, 6}; |
365 | valid_bytes_ = {1, 1, 0, 1, 1}; |
366 | expected_ = {"a" , "" , "" , "bb" , "ccc" }; |
367 | |
368 | MakeArray(); |
369 | } |
370 | |
371 | void MakeArray() { |
372 | length_ = static_cast<int64_t>(offsets_.size() - 1); |
373 | value_buf_ = Buffer::Wrap(chars_); |
374 | offsets_buf_ = Buffer::Wrap(offsets_); |
375 | |
376 | ASSERT_OK(BitUtil::BytesToBits(valid_bytes_, default_memory_pool(), &null_bitmap_)); |
377 | null_count_ = CountNulls(valid_bytes_); |
378 | |
379 | strings_ = std::make_shared<BinaryArray>(length_, offsets_buf_, value_buf_, |
380 | null_bitmap_, null_count_); |
381 | } |
382 | |
383 | protected: |
384 | vector<int32_t> offsets_; |
385 | vector<char> chars_; |
386 | vector<uint8_t> valid_bytes_; |
387 | |
388 | vector<string> expected_; |
389 | |
390 | std::shared_ptr<Buffer> value_buf_; |
391 | std::shared_ptr<Buffer> offsets_buf_; |
392 | std::shared_ptr<Buffer> null_bitmap_; |
393 | |
394 | int64_t null_count_; |
395 | int64_t length_; |
396 | |
397 | std::shared_ptr<BinaryArray> strings_; |
398 | }; |
399 | |
400 | TEST_F(TestBinaryArray, TestArrayBasics) { |
401 | ASSERT_EQ(length_, strings_->length()); |
402 | ASSERT_EQ(1, strings_->null_count()); |
403 | ASSERT_OK(ValidateArray(*strings_)); |
404 | } |
405 | |
406 | TEST_F(TestBinaryArray, TestType) { |
407 | std::shared_ptr<DataType> type = strings_->type(); |
408 | |
409 | ASSERT_EQ(Type::BINARY, type->id()); |
410 | ASSERT_EQ(Type::BINARY, strings_->type_id()); |
411 | } |
412 | |
413 | TEST_F(TestBinaryArray, TestListFunctions) { |
414 | size_t pos = 0; |
415 | for (size_t i = 0; i < expected_.size(); ++i) { |
416 | ASSERT_EQ(pos, strings_->value_offset(i)); |
417 | ASSERT_EQ(static_cast<int>(expected_[i].size()), strings_->value_length(i)); |
418 | pos += expected_[i].size(); |
419 | } |
420 | } |
421 | |
422 | TEST_F(TestBinaryArray, TestDestructor) { |
423 | auto arr = std::make_shared<BinaryArray>(length_, offsets_buf_, value_buf_, |
424 | null_bitmap_, null_count_); |
425 | } |
426 | |
427 | TEST_F(TestBinaryArray, TestGetValue) { |
428 | for (size_t i = 0; i < expected_.size(); ++i) { |
429 | if (valid_bytes_[i] == 0) { |
430 | ASSERT_TRUE(strings_->IsNull(i)); |
431 | } else { |
432 | ASSERT_FALSE(strings_->IsNull(i)); |
433 | ASSERT_EQ(strings_->GetString(i), expected_[i]); |
434 | } |
435 | } |
436 | } |
437 | |
438 | TEST_F(TestBinaryArray, TestNullValuesInitialized) { |
439 | for (size_t i = 0; i < expected_.size(); ++i) { |
440 | if (valid_bytes_[i] == 0) { |
441 | ASSERT_TRUE(strings_->IsNull(i)); |
442 | } else { |
443 | ASSERT_FALSE(strings_->IsNull(i)); |
444 | ASSERT_EQ(strings_->GetString(i), expected_[i]); |
445 | } |
446 | } |
447 | TestInitialized(*strings_); |
448 | } |
449 | |
450 | TEST_F(TestBinaryArray, TestPaddingZeroed) { AssertZeroPadded(*strings_); } |
451 | |
452 | TEST_F(TestBinaryArray, TestGetString) { |
453 | for (size_t i = 0; i < expected_.size(); ++i) { |
454 | if (valid_bytes_[i] == 0) { |
455 | ASSERT_TRUE(strings_->IsNull(i)); |
456 | } else { |
457 | std::string val = strings_->GetString(i); |
458 | ASSERT_EQ(0, std::memcmp(expected_[i].data(), val.c_str(), val.size())); |
459 | } |
460 | } |
461 | } |
462 | |
463 | TEST_F(TestBinaryArray, TestEqualsEmptyStrings) { |
464 | BinaryBuilder builder; |
465 | |
466 | string empty_string("" ); |
467 | for (int i = 0; i < 5; ++i) { |
468 | ASSERT_OK(builder.Append(empty_string)); |
469 | } |
470 | |
471 | std::shared_ptr<Array> left_arr; |
472 | FinishAndCheckPadding(&builder, &left_arr); |
473 | |
474 | const BinaryArray& left = checked_cast<const BinaryArray&>(*left_arr); |
475 | std::shared_ptr<Array> right = |
476 | std::make_shared<BinaryArray>(left.length(), left.value_offsets(), nullptr, |
477 | left.null_bitmap(), left.null_count()); |
478 | |
479 | ASSERT_TRUE(left.Equals(right)); |
480 | ASSERT_TRUE(left.RangeEquals(0, left.length(), 0, right)); |
481 | } |
482 | |
483 | class TestBinaryBuilder : public TestBuilder { |
484 | public: |
485 | void SetUp() { |
486 | TestBuilder::SetUp(); |
487 | builder_.reset(new BinaryBuilder(pool_)); |
488 | } |
489 | |
490 | void Done() { |
491 | std::shared_ptr<Array> out; |
492 | FinishAndCheckPadding(builder_.get(), &out); |
493 | |
494 | result_ = std::dynamic_pointer_cast<BinaryArray>(out); |
495 | ASSERT_OK(ValidateArray(*result_)); |
496 | } |
497 | |
498 | protected: |
499 | std::unique_ptr<BinaryBuilder> builder_; |
500 | std::shared_ptr<BinaryArray> result_; |
501 | }; |
502 | |
503 | TEST_F(TestBinaryBuilder, TestScalarAppend) { |
504 | vector<string> strings = {"" , "bb" , "a" , "" , "ccc" }; |
505 | vector<uint8_t> is_null = {0, 0, 0, 1, 0}; |
506 | |
507 | int N = static_cast<int>(strings.size()); |
508 | int reps = 10; |
509 | |
510 | for (int j = 0; j < reps; ++j) { |
511 | for (int i = 0; i < N; ++i) { |
512 | if (is_null[i]) { |
513 | ASSERT_OK(builder_->AppendNull()); |
514 | } else { |
515 | ASSERT_OK(builder_->Append(strings[i])); |
516 | } |
517 | } |
518 | } |
519 | Done(); |
520 | ASSERT_OK(ValidateArray(*result_)); |
521 | ASSERT_EQ(reps * N, result_->length()); |
522 | ASSERT_EQ(reps, result_->null_count()); |
523 | ASSERT_EQ(reps * 6, result_->value_data()->size()); |
524 | |
525 | int32_t length; |
526 | for (int i = 0; i < N * reps; ++i) { |
527 | if (is_null[i % N]) { |
528 | ASSERT_TRUE(result_->IsNull(i)); |
529 | } else { |
530 | ASSERT_FALSE(result_->IsNull(i)); |
531 | const uint8_t* vals = result_->GetValue(i, &length); |
532 | ASSERT_EQ(static_cast<int>(strings[i % N].size()), length); |
533 | ASSERT_EQ(0, std::memcmp(vals, strings[i % N].data(), length)); |
534 | } |
535 | } |
536 | } |
537 | |
538 | TEST_F(TestBinaryBuilder, TestScalarAppendUnsafe) { |
539 | vector<string> strings = {"" , "bb" , "a" , "" , "ccc" }; |
540 | vector<uint8_t> is_null = {0, 0, 0, 1, 0}; |
541 | |
542 | int N = static_cast<int>(strings.size()); |
543 | int reps = 13; |
544 | int total_length = 0; |
545 | for (auto&& s : strings) total_length += static_cast<int>(s.size()); |
546 | |
547 | ASSERT_OK(builder_->Reserve(N * reps)); |
548 | ASSERT_OK(builder_->ReserveData(total_length * reps)); |
549 | |
550 | for (int j = 0; j < reps; ++j) { |
551 | for (int i = 0; i < N; ++i) { |
552 | if (is_null[i]) { |
553 | builder_->UnsafeAppendNull(); |
554 | } else { |
555 | builder_->UnsafeAppend(strings[i]); |
556 | } |
557 | } |
558 | } |
559 | ASSERT_EQ(builder_->value_data_length(), total_length * reps); |
560 | Done(); |
561 | ASSERT_OK(ValidateArray(*result_)); |
562 | ASSERT_EQ(reps * N, result_->length()); |
563 | ASSERT_EQ(reps, result_->null_count()); |
564 | ASSERT_EQ(reps * total_length, result_->value_data()->size()); |
565 | |
566 | int32_t length; |
567 | for (int i = 0; i < N * reps; ++i) { |
568 | if (is_null[i % N]) { |
569 | ASSERT_TRUE(result_->IsNull(i)); |
570 | } else { |
571 | ASSERT_FALSE(result_->IsNull(i)); |
572 | const uint8_t* vals = result_->GetValue(i, &length); |
573 | ASSERT_EQ(static_cast<int>(strings[i % N].size()), length); |
574 | ASSERT_EQ(0, std::memcmp(vals, strings[i % N].data(), length)); |
575 | } |
576 | } |
577 | } |
578 | |
579 | TEST_F(TestBinaryBuilder, TestCapacityReserve) { |
580 | vector<string> strings = {"aaaaa" , "bbbbbbbbbb" , "ccccccccccccccc" , "dddddddddd" }; |
581 | int N = static_cast<int>(strings.size()); |
582 | int reps = 15; |
583 | int64_t length = 0; |
584 | int64_t capacity = 1000; |
585 | int64_t expected_capacity = BitUtil::RoundUpToMultipleOf64(capacity); |
586 | |
587 | ASSERT_OK(builder_->ReserveData(capacity)); |
588 | |
589 | ASSERT_EQ(length, builder_->value_data_length()); |
590 | ASSERT_EQ(expected_capacity, builder_->value_data_capacity()); |
591 | |
592 | for (int j = 0; j < reps; ++j) { |
593 | for (int i = 0; i < N; ++i) { |
594 | ASSERT_OK(builder_->Append(strings[i])); |
595 | length += static_cast<int>(strings[i].size()); |
596 | |
597 | ASSERT_EQ(length, builder_->value_data_length()); |
598 | ASSERT_EQ(expected_capacity, builder_->value_data_capacity()); |
599 | } |
600 | } |
601 | |
602 | int = 500; |
603 | expected_capacity = BitUtil::RoundUpToMultipleOf64(length + extra_capacity); |
604 | |
605 | ASSERT_OK(builder_->ReserveData(extra_capacity)); |
606 | |
607 | ASSERT_EQ(length, builder_->value_data_length()); |
608 | ASSERT_EQ(expected_capacity, builder_->value_data_capacity()); |
609 | |
610 | Done(); |
611 | |
612 | ASSERT_EQ(reps * N, result_->length()); |
613 | ASSERT_EQ(0, result_->null_count()); |
614 | ASSERT_EQ(reps * 40, result_->value_data()->size()); |
615 | |
616 | // Capacity is shrunk after `Finish` |
617 | ASSERT_EQ(640, result_->value_data()->capacity()); |
618 | } |
619 | |
620 | TEST_F(TestBinaryBuilder, TestZeroLength) { |
621 | // All buffers are null |
622 | Done(); |
623 | } |
624 | |
625 | // ---------------------------------------------------------------------- |
626 | // Slice tests |
627 | |
628 | template <typename TYPE> |
629 | void CheckSliceEquality() { |
630 | using Traits = TypeTraits<TYPE>; |
631 | using BuilderType = typename Traits::BuilderType; |
632 | |
633 | BuilderType builder; |
634 | |
635 | vector<string> strings = {"foo" , "" , "bar" , "baz" , "qux" , "" }; |
636 | vector<uint8_t> is_null = {0, 1, 0, 1, 0, 0}; |
637 | |
638 | int N = static_cast<int>(strings.size()); |
639 | int reps = 10; |
640 | |
641 | for (int j = 0; j < reps; ++j) { |
642 | for (int i = 0; i < N; ++i) { |
643 | if (is_null[i]) { |
644 | ASSERT_OK(builder.AppendNull()); |
645 | } else { |
646 | ASSERT_OK(builder.Append(strings[i])); |
647 | } |
648 | } |
649 | } |
650 | |
651 | std::shared_ptr<Array> array; |
652 | FinishAndCheckPadding(&builder, &array); |
653 | |
654 | std::shared_ptr<Array> slice, slice2; |
655 | |
656 | slice = array->Slice(5); |
657 | slice2 = array->Slice(5); |
658 | ASSERT_EQ(N * reps - 5, slice->length()); |
659 | |
660 | ASSERT_TRUE(slice->Equals(slice2)); |
661 | ASSERT_TRUE(array->RangeEquals(5, slice->length(), 0, slice)); |
662 | |
663 | // Chained slices |
664 | slice2 = array->Slice(2)->Slice(3); |
665 | ASSERT_TRUE(slice->Equals(slice2)); |
666 | |
667 | slice = array->Slice(5, 20); |
668 | slice2 = array->Slice(5, 20); |
669 | ASSERT_EQ(20, slice->length()); |
670 | |
671 | ASSERT_TRUE(slice->Equals(slice2)); |
672 | ASSERT_TRUE(array->RangeEquals(5, 25, 0, slice)); |
673 | } |
674 | |
675 | TEST_F(TestBinaryArray, TestSliceEquality) { CheckSliceEquality<BinaryType>(); } |
676 | |
677 | TEST_F(TestStringArray, TestSliceEquality) { CheckSliceEquality<BinaryType>(); } |
678 | |
679 | TEST_F(TestBinaryArray, LengthZeroCtor) { BinaryArray array(0, nullptr, nullptr); } |
680 | |
681 | // ---------------------------------------------------------------------- |
682 | // ChunkedBinaryBuilder tests |
683 | |
684 | class TestChunkedBinaryBuilder : public ::testing::Test { |
685 | public: |
686 | void SetUp() {} |
687 | |
688 | void Init(int32_t chunksize) { |
689 | builder_.reset(new internal::ChunkedBinaryBuilder(chunksize)); |
690 | } |
691 | |
692 | protected: |
693 | std::unique_ptr<internal::ChunkedBinaryBuilder> builder_; |
694 | }; |
695 | |
696 | TEST_F(TestChunkedBinaryBuilder, BasicOperation) { |
697 | const int32_t chunksize = 1000; |
698 | Init(chunksize); |
699 | |
700 | const int elem_size = 10; |
701 | uint8_t buf[elem_size]; |
702 | |
703 | BinaryBuilder unchunked_builder; |
704 | |
705 | const int iterations = 1000; |
706 | for (int i = 0; i < iterations; ++i) { |
707 | random_bytes(elem_size, i, buf); |
708 | |
709 | ASSERT_OK(unchunked_builder.Append(buf, elem_size)); |
710 | ASSERT_OK(builder_->Append(buf, elem_size)); |
711 | } |
712 | |
713 | std::shared_ptr<Array> unchunked; |
714 | ASSERT_OK(unchunked_builder.Finish(&unchunked)); |
715 | |
716 | ArrayVector chunks; |
717 | ASSERT_OK(builder_->Finish(&chunks)); |
718 | |
719 | // This assumes that everything is evenly divisible |
720 | ArrayVector expected_chunks; |
721 | const int elems_per_chunk = chunksize / elem_size; |
722 | for (int i = 0; i < iterations / elems_per_chunk; ++i) { |
723 | expected_chunks.emplace_back(unchunked->Slice(i * elems_per_chunk, elems_per_chunk)); |
724 | } |
725 | |
726 | ASSERT_EQ(expected_chunks.size(), chunks.size()); |
727 | for (size_t i = 0; i < chunks.size(); ++i) { |
728 | AssertArraysEqual(*expected_chunks[i], *chunks[i]); |
729 | } |
730 | } |
731 | |
732 | TEST_F(TestChunkedBinaryBuilder, NoData) { |
733 | Init(1000); |
734 | |
735 | ArrayVector chunks; |
736 | ASSERT_OK(builder_->Finish(&chunks)); |
737 | |
738 | ASSERT_EQ(1, chunks.size()); |
739 | ASSERT_EQ(0, chunks[0]->length()); |
740 | } |
741 | |
742 | TEST_F(TestChunkedBinaryBuilder, LargeElements) { |
743 | Init(100); |
744 | |
745 | const int bufsize = 101; |
746 | uint8_t buf[bufsize]; |
747 | |
748 | const int iterations = 100; |
749 | for (int i = 0; i < iterations; ++i) { |
750 | random_bytes(bufsize, i, buf); |
751 | ASSERT_OK(builder_->Append(buf, bufsize)); |
752 | } |
753 | |
754 | ArrayVector chunks; |
755 | ASSERT_OK(builder_->Finish(&chunks)); |
756 | ASSERT_EQ(iterations, static_cast<int>(chunks.size())); |
757 | |
758 | int64_t total_data_size = 0; |
759 | for (auto chunk : chunks) { |
760 | ASSERT_EQ(1, chunk->length()); |
761 | total_data_size += |
762 | static_cast<int64_t>(static_cast<const BinaryArray&>(*chunk).GetView(0).size()); |
763 | } |
764 | ASSERT_EQ(iterations * bufsize, total_data_size); |
765 | } |
766 | |
767 | TEST(TestChunkedStringBuilder, BasicOperation) { |
768 | const int chunksize = 100; |
769 | internal::ChunkedStringBuilder builder(chunksize); |
770 | |
771 | std::string value = "0123456789" ; |
772 | |
773 | const int iterations = 100; |
774 | for (int i = 0; i < iterations; ++i) { |
775 | ASSERT_OK(builder.Append(value)); |
776 | } |
777 | |
778 | ArrayVector chunks; |
779 | ASSERT_OK(builder.Finish(&chunks)); |
780 | |
781 | ASSERT_EQ(10, chunks.size()); |
782 | |
783 | // Type is correct |
784 | for (auto chunk : chunks) { |
785 | ASSERT_TRUE(chunk->type()->Equals(*::arrow::utf8())); |
786 | } |
787 | } |
788 | |
789 | } // namespace arrow |
790 | |