1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include <array>
19#include <cstdint>
20#include <memory>
21#include <ostream>
22#include <string>
23#include <vector>
24
25#include <gtest/gtest.h>
26
27#include "arrow/array.h"
28#include "arrow/builder.h"
29#include "arrow/memory_pool.h"
30#include "arrow/status.h"
31#include "arrow/test-common.h"
32#include "arrow/test-util.h"
33#include "arrow/type.h"
34#include "arrow/util/checked_cast.h"
35#include "arrow/util/decimal.h"
36
37namespace arrow {
38
39using std::string;
40using std::vector;
41
42using internal::checked_cast;
43
44// ----------------------------------------------------------------------
45// Dictionary tests
46
47template <typename Type>
48class TestDictionaryBuilder : public TestBuilder {};
49
50typedef ::testing::Types<Int8Type, UInt8Type, Int16Type, UInt16Type, Int32Type,
51 UInt32Type, Int64Type, UInt64Type, FloatType, DoubleType>
52 PrimitiveDictionaries;
53
54TYPED_TEST_CASE(TestDictionaryBuilder, PrimitiveDictionaries);
55
56TYPED_TEST(TestDictionaryBuilder, Basic) {
57 DictionaryBuilder<TypeParam> builder(default_memory_pool());
58 ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
59 ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
60 ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
61 ASSERT_OK(builder.AppendNull());
62
63 ASSERT_EQ(builder.length(), 4);
64 ASSERT_EQ(builder.null_count(), 1);
65
66 std::shared_ptr<Array> result;
67 ASSERT_OK(builder.Finish(&result));
68
69 // Build expected data
70 auto dict_array = ArrayFromJSON(std::make_shared<TypeParam>(), "[1, 2]");
71 auto dict_type = std::make_shared<DictionaryType>(int8(), dict_array);
72
73 auto int_array = ArrayFromJSON(int8(), "[0, 1, 0, null]");
74 DictionaryArray expected(dict_type, int_array);
75
76 ASSERT_TRUE(expected.Equals(result));
77}
78
79TYPED_TEST(TestDictionaryBuilder, ArrayConversion) {
80 auto type = std::make_shared<TypeParam>();
81
82 auto intermediate_result = ArrayFromJSON(type, "[1, 2, 1]");
83 DictionaryBuilder<TypeParam> dictionary_builder(default_memory_pool());
84 ASSERT_OK(dictionary_builder.AppendArray(*intermediate_result));
85 std::shared_ptr<Array> result;
86 ASSERT_OK(dictionary_builder.Finish(&result));
87
88 // Build expected data
89 auto dict_array = ArrayFromJSON(type, "[1, 2]");
90 auto dict_type = std::make_shared<DictionaryType>(int8(), dict_array);
91
92 auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]");
93 DictionaryArray expected(dict_type, int_array);
94
95 ASSERT_TRUE(expected.Equals(result));
96}
97
98TYPED_TEST(TestDictionaryBuilder, DoubleTableSize) {
99 using Scalar = typename TypeParam::c_type;
100 // Skip this test for (u)int8
101 if (sizeof(Scalar) > 1) {
102 // Build the dictionary Array
103 DictionaryBuilder<TypeParam> builder(default_memory_pool());
104 // Build expected data
105 NumericBuilder<TypeParam> dict_builder;
106 Int16Builder int_builder;
107
108 // Fill with 1024 different values
109 for (int64_t i = 0; i < 1024; i++) {
110 ASSERT_OK(builder.Append(static_cast<Scalar>(i)));
111 ASSERT_OK(dict_builder.Append(static_cast<Scalar>(i)));
112 ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
113 }
114 // Fill with an already existing value
115 for (int64_t i = 0; i < 1024; i++) {
116 ASSERT_OK(builder.Append(static_cast<Scalar>(1)));
117 ASSERT_OK(int_builder.Append(1));
118 }
119
120 // Finalize result
121 std::shared_ptr<Array> result;
122 FinishAndCheckPadding(&builder, &result);
123
124 // Finalize expected data
125 std::shared_ptr<Array> dict_array;
126 ASSERT_OK(dict_builder.Finish(&dict_array));
127 auto dtype = std::make_shared<DictionaryType>(int16(), dict_array);
128 std::shared_ptr<Array> int_array;
129 ASSERT_OK(int_builder.Finish(&int_array));
130
131 DictionaryArray expected(dtype, int_array);
132 ASSERT_TRUE(expected.Equals(result));
133 }
134}
135
136TYPED_TEST(TestDictionaryBuilder, DeltaDictionary) {
137 using c_type = typename TypeParam::c_type;
138 auto type = std::make_shared<TypeParam>();
139
140 DictionaryBuilder<TypeParam> builder(default_memory_pool());
141
142 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
143 ASSERT_OK(builder.Append(static_cast<c_type>(2)));
144 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
145 ASSERT_OK(builder.Append(static_cast<c_type>(2)));
146 std::shared_ptr<Array> result;
147 FinishAndCheckPadding(&builder, &result);
148
149 // Build expected data for the initial dictionary
150 auto dict_type1 = dictionary(int8(), ArrayFromJSON(type, "[1, 2]"));
151 DictionaryArray expected(dict_type1, ArrayFromJSON(int8(), "[0, 1, 0, 1]"));
152
153 ASSERT_TRUE(expected.Equals(result));
154
155 // extend the dictionary builder with new data
156 ASSERT_OK(builder.Append(static_cast<c_type>(2)));
157 ASSERT_OK(builder.Append(static_cast<c_type>(3)));
158 ASSERT_OK(builder.Append(static_cast<c_type>(3)));
159 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
160 ASSERT_OK(builder.Append(static_cast<c_type>(3)));
161
162 std::shared_ptr<Array> result_delta;
163 ASSERT_OK(builder.Finish(&result_delta));
164
165 // Build expected data for the delta dictionary
166 auto dict_type2 = dictionary(int8(), ArrayFromJSON(type, "[3]"));
167 DictionaryArray expected_delta(dict_type2, ArrayFromJSON(int8(), "[1, 2, 2, 0, 2]"));
168
169 ASSERT_TRUE(expected_delta.Equals(result_delta));
170}
171
172TYPED_TEST(TestDictionaryBuilder, DoubleDeltaDictionary) {
173 using c_type = typename TypeParam::c_type;
174 auto type = std::make_shared<TypeParam>();
175
176 DictionaryBuilder<TypeParam> builder(default_memory_pool());
177
178 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
179 ASSERT_OK(builder.Append(static_cast<c_type>(2)));
180 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
181 ASSERT_OK(builder.Append(static_cast<c_type>(2)));
182 std::shared_ptr<Array> result;
183 FinishAndCheckPadding(&builder, &result);
184
185 // Build expected data for the initial dictionary
186 auto dict_type1 = dictionary(int8(), ArrayFromJSON(type, "[1, 2]"));
187 DictionaryArray expected(dict_type1, ArrayFromJSON(int8(), "[0, 1, 0, 1]"));
188
189 ASSERT_TRUE(expected.Equals(result));
190
191 // extend the dictionary builder with new data
192 ASSERT_OK(builder.Append(static_cast<c_type>(2)));
193 ASSERT_OK(builder.Append(static_cast<c_type>(3)));
194 ASSERT_OK(builder.Append(static_cast<c_type>(3)));
195 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
196 ASSERT_OK(builder.Append(static_cast<c_type>(3)));
197
198 std::shared_ptr<Array> result_delta1;
199 ASSERT_OK(builder.Finish(&result_delta1));
200
201 // Build expected data for the delta dictionary
202 auto dict_type2 = dictionary(int8(), ArrayFromJSON(type, "[3]"));
203 DictionaryArray expected_delta1(dict_type2, ArrayFromJSON(int8(), "[1, 2, 2, 0, 2]"));
204
205 ASSERT_TRUE(expected_delta1.Equals(result_delta1));
206
207 // extend the dictionary builder with new data again
208 ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
209 ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
210 ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(3)));
211 ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(4)));
212 ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(5)));
213
214 std::shared_ptr<Array> result_delta2;
215 ASSERT_OK(builder.Finish(&result_delta2));
216
217 // Build expected data for the delta dictionary again
218 auto dict_type3 = dictionary(int8(), ArrayFromJSON(type, "[4, 5]"));
219 DictionaryArray expected_delta2(dict_type3, ArrayFromJSON(int8(), "[0, 1, 2, 3, 4]"));
220
221 ASSERT_TRUE(expected_delta2.Equals(result_delta2));
222}
223
224TEST(TestStringDictionaryBuilder, Basic) {
225 // Build the dictionary Array
226 StringDictionaryBuilder builder(default_memory_pool());
227 ASSERT_OK(builder.Append("test"));
228 ASSERT_OK(builder.Append("test2"));
229 ASSERT_OK(builder.Append("test"));
230
231 std::shared_ptr<Array> result;
232 ASSERT_OK(builder.Finish(&result));
233
234 // Build expected data
235 auto dtype = dictionary(int8(), ArrayFromJSON(utf8(), "[\"test\", \"test2\"]"));
236 auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]");
237 DictionaryArray expected(dtype, int_array);
238
239 ASSERT_TRUE(expected.Equals(result));
240}
241
242// ARROW-4367
243TEST(TestStringDictionaryBuilder, OnlyNull) {
244 // Build the dictionary Array
245 StringDictionaryBuilder builder(default_memory_pool());
246 ASSERT_OK(builder.AppendNull());
247
248 std::shared_ptr<Array> result;
249 ASSERT_OK(builder.Finish(&result));
250
251 // Build expected data
252 auto dtype = dictionary(int8(), ArrayFromJSON(utf8(), "[]"));
253 auto int_array = ArrayFromJSON(int8(), "[null]");
254 DictionaryArray expected(dtype, int_array);
255
256 ASSERT_TRUE(expected.Equals(result));
257}
258
259TEST(TestStringDictionaryBuilder, DoubleTableSize) {
260 // Build the dictionary Array
261 StringDictionaryBuilder builder(default_memory_pool());
262 // Build expected data
263 StringBuilder str_builder;
264 Int16Builder int_builder;
265
266 // Fill with 1024 different values
267 for (int64_t i = 0; i < 1024; i++) {
268 std::stringstream ss;
269 ss << "test" << i;
270 ASSERT_OK(builder.Append(ss.str()));
271 ASSERT_OK(str_builder.Append(ss.str()));
272 ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
273 }
274 // Fill with an already existing value
275 for (int64_t i = 0; i < 1024; i++) {
276 ASSERT_OK(builder.Append("test1"));
277 ASSERT_OK(int_builder.Append(1));
278 }
279
280 // Finalize result
281 std::shared_ptr<Array> result;
282 FinishAndCheckPadding(&builder, &result);
283
284 // Finalize expected data
285 std::shared_ptr<Array> str_array;
286 ASSERT_OK(str_builder.Finish(&str_array));
287 auto dtype = std::make_shared<DictionaryType>(int16(), str_array);
288 std::shared_ptr<Array> int_array;
289 ASSERT_OK(int_builder.Finish(&int_array));
290
291 DictionaryArray expected(dtype, int_array);
292 ASSERT_TRUE(expected.Equals(result));
293}
294
295TEST(TestStringDictionaryBuilder, DeltaDictionary) {
296 // Build the dictionary Array
297 StringDictionaryBuilder builder(default_memory_pool());
298 ASSERT_OK(builder.Append("test"));
299 ASSERT_OK(builder.Append("test2"));
300 ASSERT_OK(builder.Append("test"));
301
302 std::shared_ptr<Array> result;
303 ASSERT_OK(builder.Finish(&result));
304
305 // Build expected data
306 auto dtype = dictionary(int8(), ArrayFromJSON(utf8(), "[\"test\", \"test2\"]"));
307 auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]");
308 DictionaryArray expected(dtype, int_array);
309
310 ASSERT_TRUE(expected.Equals(result));
311
312 // build a delta dictionary
313 ASSERT_OK(builder.Append("test2"));
314 ASSERT_OK(builder.Append("test3"));
315 ASSERT_OK(builder.Append("test2"));
316
317 std::shared_ptr<Array> result_delta;
318 FinishAndCheckPadding(&builder, &result_delta);
319
320 // Build expected data
321 auto dtype2 = dictionary(int8(), ArrayFromJSON(utf8(), "[\"test3\"]"));
322 auto int_array2 = ArrayFromJSON(int8(), "[1, 2, 1]");
323 DictionaryArray expected_delta(dtype2, int_array2);
324
325 ASSERT_TRUE(expected_delta.Equals(result_delta));
326}
327
328TEST(TestStringDictionaryBuilder, BigDeltaDictionary) {
329 constexpr int16_t kTestLength = 2048;
330 // Build the dictionary Array
331 StringDictionaryBuilder builder(default_memory_pool());
332
333 StringBuilder str_builder1;
334 Int16Builder int_builder1;
335
336 for (int16_t idx = 0; idx < kTestLength; ++idx) {
337 std::stringstream sstream;
338 sstream << "test" << idx;
339 ASSERT_OK(builder.Append(sstream.str()));
340 ASSERT_OK(str_builder1.Append(sstream.str()));
341 ASSERT_OK(int_builder1.Append(idx));
342 }
343
344 std::shared_ptr<Array> result;
345 FinishAndCheckPadding(&builder, &result);
346
347 std::shared_ptr<Array> str_array1;
348 ASSERT_OK(str_builder1.Finish(&str_array1));
349 auto dtype1 = std::make_shared<DictionaryType>(int16(), str_array1);
350
351 std::shared_ptr<Array> int_array1;
352 ASSERT_OK(int_builder1.Finish(&int_array1));
353
354 DictionaryArray expected(dtype1, int_array1);
355 ASSERT_TRUE(expected.Equals(result));
356
357 // build delta 1
358 StringBuilder str_builder2;
359 Int16Builder int_builder2;
360
361 for (int16_t idx = 0; idx < kTestLength; ++idx) {
362 ASSERT_OK(builder.Append("test1"));
363 ASSERT_OK(int_builder2.Append(1));
364 }
365
366 for (int16_t idx = 0; idx < kTestLength; ++idx) {
367 ASSERT_OK(builder.Append("test_new_value1"));
368 ASSERT_OK(int_builder2.Append(kTestLength));
369 }
370 ASSERT_OK(str_builder2.Append("test_new_value1"));
371
372 std::shared_ptr<Array> result2;
373 ASSERT_OK(builder.Finish(&result2));
374
375 std::shared_ptr<Array> str_array2;
376 ASSERT_OK(str_builder2.Finish(&str_array2));
377 auto dtype2 = std::make_shared<DictionaryType>(int16(), str_array2);
378
379 std::shared_ptr<Array> int_array2;
380 ASSERT_OK(int_builder2.Finish(&int_array2));
381
382 DictionaryArray expected2(dtype2, int_array2);
383 ASSERT_TRUE(expected2.Equals(result2));
384
385 // build delta 2
386 StringBuilder str_builder3;
387 Int16Builder int_builder3;
388
389 for (int16_t idx = 0; idx < kTestLength; ++idx) {
390 ASSERT_OK(builder.Append("test2"));
391 ASSERT_OK(int_builder3.Append(2));
392 }
393
394 for (int16_t idx = 0; idx < kTestLength; ++idx) {
395 ASSERT_OK(builder.Append("test_new_value2"));
396 ASSERT_OK(int_builder3.Append(kTestLength + 1));
397 }
398 ASSERT_OK(str_builder3.Append("test_new_value2"));
399
400 std::shared_ptr<Array> result3;
401 ASSERT_OK(builder.Finish(&result3));
402
403 std::shared_ptr<Array> str_array3;
404 ASSERT_OK(str_builder3.Finish(&str_array3));
405 auto dtype3 = std::make_shared<DictionaryType>(int16(), str_array3);
406
407 std::shared_ptr<Array> int_array3;
408 ASSERT_OK(int_builder3.Finish(&int_array3));
409
410 DictionaryArray expected3(dtype3, int_array3);
411 ASSERT_TRUE(expected3.Equals(result3));
412}
413
414TEST(TestFixedSizeBinaryDictionaryBuilder, Basic) {
415 // Build the dictionary Array
416 DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
417 default_memory_pool());
418 std::vector<uint8_t> test{12, 12, 11, 12};
419 std::vector<uint8_t> test2{12, 12, 11, 11};
420 ASSERT_OK(builder.Append(test.data()));
421 ASSERT_OK(builder.Append(test2.data()));
422 ASSERT_OK(builder.Append(test.data()));
423
424 std::shared_ptr<Array> result;
425 FinishAndCheckPadding(&builder, &result);
426
427 // Build expected data
428 FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(4));
429 ASSERT_OK(fsb_builder.Append(test.data()));
430 ASSERT_OK(fsb_builder.Append(test2.data()));
431 std::shared_ptr<Array> fsb_array;
432 ASSERT_OK(fsb_builder.Finish(&fsb_array));
433 auto dtype = std::make_shared<DictionaryType>(int8(), fsb_array);
434
435 Int8Builder int_builder;
436 ASSERT_OK(int_builder.Append(0));
437 ASSERT_OK(int_builder.Append(1));
438 ASSERT_OK(int_builder.Append(0));
439 std::shared_ptr<Array> int_array;
440 ASSERT_OK(int_builder.Finish(&int_array));
441
442 DictionaryArray expected(dtype, int_array);
443 ASSERT_TRUE(expected.Equals(result));
444}
445
446TEST(TestFixedSizeBinaryDictionaryBuilder, DeltaDictionary) {
447 // Build the dictionary Array
448 DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
449 default_memory_pool());
450 std::vector<uint8_t> test{12, 12, 11, 12};
451 std::vector<uint8_t> test2{12, 12, 11, 11};
452 std::vector<uint8_t> test3{12, 12, 11, 10};
453
454 ASSERT_OK(builder.Append(test.data()));
455 ASSERT_OK(builder.Append(test2.data()));
456 ASSERT_OK(builder.Append(test.data()));
457
458 std::shared_ptr<Array> result1;
459 FinishAndCheckPadding(&builder, &result1);
460
461 // Build expected data
462 FixedSizeBinaryBuilder fsb_builder1(arrow::fixed_size_binary(4));
463 ASSERT_OK(fsb_builder1.Append(test.data()));
464 ASSERT_OK(fsb_builder1.Append(test2.data()));
465 std::shared_ptr<Array> fsb_array1;
466 ASSERT_OK(fsb_builder1.Finish(&fsb_array1));
467 auto dtype1 = std::make_shared<DictionaryType>(int8(), fsb_array1);
468
469 Int8Builder int_builder1;
470 ASSERT_OK(int_builder1.Append(0));
471 ASSERT_OK(int_builder1.Append(1));
472 ASSERT_OK(int_builder1.Append(0));
473 std::shared_ptr<Array> int_array1;
474 ASSERT_OK(int_builder1.Finish(&int_array1));
475
476 DictionaryArray expected1(dtype1, int_array1);
477 ASSERT_TRUE(expected1.Equals(result1));
478
479 // build delta dictionary
480 ASSERT_OK(builder.Append(test.data()));
481 ASSERT_OK(builder.Append(test2.data()));
482 ASSERT_OK(builder.Append(test3.data()));
483
484 std::shared_ptr<Array> result2;
485 FinishAndCheckPadding(&builder, &result2);
486
487 // Build expected data
488 FixedSizeBinaryBuilder fsb_builder2(arrow::fixed_size_binary(4));
489 ASSERT_OK(fsb_builder2.Append(test3.data()));
490 std::shared_ptr<Array> fsb_array2;
491 ASSERT_OK(fsb_builder2.Finish(&fsb_array2));
492 auto dtype2 = std::make_shared<DictionaryType>(int8(), fsb_array2);
493
494 Int8Builder int_builder2;
495 ASSERT_OK(int_builder2.Append(0));
496 ASSERT_OK(int_builder2.Append(1));
497 ASSERT_OK(int_builder2.Append(2));
498 std::shared_ptr<Array> int_array2;
499 ASSERT_OK(int_builder2.Finish(&int_array2));
500
501 DictionaryArray expected2(dtype2, int_array2);
502 ASSERT_TRUE(expected2.Equals(result2));
503}
504
505TEST(TestFixedSizeBinaryDictionaryBuilder, DoubleTableSize) {
506 // Build the dictionary Array
507 DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
508 default_memory_pool());
509 // Build expected data
510 FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(4));
511 Int16Builder int_builder;
512
513 // Fill with 1024 different values
514 for (int64_t i = 0; i < 1024; i++) {
515 std::vector<uint8_t> value{12, 12, static_cast<uint8_t>(i / 128),
516 static_cast<uint8_t>(i % 128)};
517 ASSERT_OK(builder.Append(value.data()));
518 ASSERT_OK(fsb_builder.Append(value.data()));
519 ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
520 }
521 // Fill with an already existing value
522 std::vector<uint8_t> known_value{12, 12, 0, 1};
523 for (int64_t i = 0; i < 1024; i++) {
524 ASSERT_OK(builder.Append(known_value.data()));
525 ASSERT_OK(int_builder.Append(1));
526 }
527
528 // Finalize result
529 std::shared_ptr<Array> result;
530 ASSERT_OK(builder.Finish(&result));
531
532 // Finalize expected data
533 std::shared_ptr<Array> fsb_array;
534 ASSERT_OK(fsb_builder.Finish(&fsb_array));
535 auto dtype = std::make_shared<DictionaryType>(int16(), fsb_array);
536 std::shared_ptr<Array> int_array;
537 ASSERT_OK(int_builder.Finish(&int_array));
538
539 DictionaryArray expected(dtype, int_array);
540 ASSERT_TRUE(expected.Equals(result));
541}
542
543TEST(TestFixedSizeBinaryDictionaryBuilder, InvalidTypeAppend) {
544 // Build the dictionary Array
545 DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
546 default_memory_pool());
547 // Build an array with different byte width
548 FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(5));
549 std::vector<uint8_t> value{100, 1, 1, 1, 1};
550 ASSERT_OK(fsb_builder.Append(value.data()));
551 std::shared_ptr<Array> fsb_array;
552 ASSERT_OK(fsb_builder.Finish(&fsb_array));
553
554 ASSERT_RAISES(Invalid, builder.AppendArray(*fsb_array));
555}
556
557TEST(TestDecimalDictionaryBuilder, Basic) {
558 // Build the dictionary Array
559 auto decimal_type = arrow::decimal(2, 0);
560 DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type, default_memory_pool());
561
562 // Test data
563 std::vector<Decimal128> test{12, 12, 11, 12};
564 for (const auto& value : test) {
565 ASSERT_OK(builder.Append(value.ToBytes().data()));
566 }
567
568 std::shared_ptr<Array> result;
569 ASSERT_OK(builder.Finish(&result));
570
571 // Build expected data
572 auto dtype = dictionary(int8(), ArrayFromJSON(decimal_type, "[\"12\", \"11\"]"));
573 DictionaryArray expected(dtype, ArrayFromJSON(int8(), "[0, 0, 1, 0]"));
574
575 ASSERT_TRUE(expected.Equals(result));
576}
577
578TEST(TestDecimalDictionaryBuilder, DoubleTableSize) {
579 const auto& decimal_type = arrow::decimal(21, 0);
580
581 // Build the dictionary Array
582 DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type, default_memory_pool());
583
584 // Build expected data
585 FixedSizeBinaryBuilder fsb_builder(decimal_type);
586 Int16Builder int_builder;
587
588 // Fill with 1024 different values
589 for (int64_t i = 0; i < 1024; i++) {
590 const uint8_t bytes[] = {0,
591 0,
592 0,
593 0,
594 0,
595 0,
596 0,
597 0,
598 0,
599 0,
600 0,
601 0,
602 12,
603 12,
604 static_cast<uint8_t>(i / 128),
605 static_cast<uint8_t>(i % 128)};
606 ASSERT_OK(builder.Append(bytes));
607 ASSERT_OK(fsb_builder.Append(bytes));
608 ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
609 }
610 // Fill with an already existing value
611 const uint8_t known_value[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 1};
612 for (int64_t i = 0; i < 1024; i++) {
613 ASSERT_OK(builder.Append(known_value));
614 ASSERT_OK(int_builder.Append(1));
615 }
616
617 // Finalize result
618 std::shared_ptr<Array> result;
619 ASSERT_OK(builder.Finish(&result));
620
621 // Finalize expected data
622 std::shared_ptr<Array> fsb_array;
623 ASSERT_OK(fsb_builder.Finish(&fsb_array));
624
625 auto dtype = std::make_shared<DictionaryType>(int16(), fsb_array);
626 std::shared_ptr<Array> int_array;
627 ASSERT_OK(int_builder.Finish(&int_array));
628
629 DictionaryArray expected(dtype, int_array);
630 ASSERT_TRUE(expected.Equals(result));
631}
632
633// ----------------------------------------------------------------------
634// DictionaryArray tests
635
636TEST(TestDictionary, Basics) {
637 vector<int32_t> values = {100, 1000, 10000, 100000};
638 std::shared_ptr<Array> dict;
639 ArrayFromVector<Int32Type, int32_t>(values, &dict);
640
641 std::shared_ptr<DictionaryType> type1 =
642 std::dynamic_pointer_cast<DictionaryType>(dictionary(int16(), dict));
643
644 auto type2 =
645 std::dynamic_pointer_cast<DictionaryType>(::arrow::dictionary(int16(), dict, true));
646
647 ASSERT_TRUE(int16()->Equals(type1->index_type()));
648 ASSERT_TRUE(type1->dictionary()->Equals(dict));
649
650 ASSERT_TRUE(int16()->Equals(type2->index_type()));
651 ASSERT_TRUE(type2->dictionary()->Equals(dict));
652
653 ASSERT_EQ("dictionary<values=int32, indices=int16, ordered=0>", type1->ToString());
654 ASSERT_EQ("dictionary<values=int32, indices=int16, ordered=1>", type2->ToString());
655}
656
657TEST(TestDictionary, Equals) {
658 vector<bool> is_valid = {true, true, false, true, true, true};
659 std::shared_ptr<Array> dict, dict2, indices, indices2, indices3;
660
661 dict = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]");
662 std::shared_ptr<DataType> dict_type = dictionary(int16(), dict);
663
664 dict2 = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\", \"qux\"]");
665 std::shared_ptr<DataType> dict2_type = dictionary(int16(), dict2);
666
667 vector<int16_t> indices_values = {1, 2, -1, 0, 2, 0};
668 ArrayFromVector<Int16Type, int16_t>(is_valid, indices_values, &indices);
669
670 vector<int16_t> indices2_values = {1, 2, 0, 0, 2, 0};
671 ArrayFromVector<Int16Type, int16_t>(is_valid, indices2_values, &indices2);
672
673 vector<int16_t> indices3_values = {1, 1, 0, 0, 2, 0};
674 ArrayFromVector<Int16Type, int16_t>(is_valid, indices3_values, &indices3);
675
676 auto array = std::make_shared<DictionaryArray>(dict_type, indices);
677 auto array2 = std::make_shared<DictionaryArray>(dict_type, indices2);
678 auto array3 = std::make_shared<DictionaryArray>(dict2_type, indices);
679 auto array4 = std::make_shared<DictionaryArray>(dict_type, indices3);
680
681 ASSERT_TRUE(array->Equals(array));
682
683 // Equal, because the unequal index is masked by null
684 ASSERT_TRUE(array->Equals(array2));
685
686 // Unequal dictionaries
687 ASSERT_FALSE(array->Equals(array3));
688
689 // Unequal indices
690 ASSERT_FALSE(array->Equals(array4));
691
692 // RangeEquals
693 ASSERT_TRUE(array->RangeEquals(3, 6, 3, array4));
694 ASSERT_FALSE(array->RangeEquals(1, 3, 1, array4));
695
696 // ARROW-33 Test slices
697 const int64_t size = array->length();
698
699 std::shared_ptr<Array> slice, slice2;
700 slice = array->Array::Slice(2);
701 slice2 = array->Array::Slice(2);
702 ASSERT_EQ(size - 2, slice->length());
703
704 ASSERT_TRUE(slice->Equals(slice2));
705 ASSERT_TRUE(array->RangeEquals(2, array->length(), 0, slice));
706
707 // Chained slices
708 slice2 = array->Array::Slice(1)->Array::Slice(1);
709 ASSERT_TRUE(slice->Equals(slice2));
710
711 slice = array->Slice(1, 3);
712 slice2 = array->Slice(1, 3);
713 ASSERT_EQ(3, slice->length());
714
715 ASSERT_TRUE(slice->Equals(slice2));
716 ASSERT_TRUE(array->RangeEquals(1, 4, 0, slice));
717}
718
719TEST(TestDictionary, Validate) {
720 auto dict = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]");
721 std::shared_ptr<DataType> dict_type = dictionary(int16(), dict);
722
723 auto indices = ArrayFromJSON(int16(), "[1, 2, null, 0, 2, 0]");
724 std::shared_ptr<Array> arr = std::make_shared<DictionaryArray>(dict_type, indices);
725
726 // Only checking index type for now
727 ASSERT_OK(ValidateArray(*arr));
728
729 // TODO(wesm) In ARROW-1199, there is now a DCHECK to compare the indices
730 // type with the dict_type. How can we test for this?
731
732 // std::shared_ptr<Array> indices2;
733 // vector<float> indices2_values = {1., 2., 0., 0., 2., 0.};
734 // ArrayFromVector<FloatType, float>(is_valid, indices2_values, &indices2);
735
736 // std::shared_ptr<Array> indices3;
737 // vector<int64_t> indices3_values = {1, 2, 0, 0, 2, 0};
738 // ArrayFromVector<Int64Type, int64_t>(is_valid, indices3_values, &indices3);
739 // std::shared_ptr<Array> arr2 = std::make_shared<DictionaryArray>(dict_type, indices2);
740 // std::shared_ptr<Array> arr3 = std::make_shared<DictionaryArray>(dict_type, indices3);
741 // ASSERT_OK(ValidateArray(*arr3));
742}
743
744TEST(TestDictionary, FromArray) {
745 auto dict = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]");
746 std::shared_ptr<DataType> dict_type = dictionary(int16(), dict);
747
748 auto indices1 = ArrayFromJSON(int16(), "[1, 2, 0, 0, 2, 0]");
749 auto indices2 = ArrayFromJSON(int16(), "[1, 2, 0, 3, 2, 0]");
750
751 // Invalid index is masked by null
752 std::shared_ptr<Array> indices3;
753 vector<bool> is_valid3 = {true, true, false, true, true, true};
754 vector<int16_t> indices_values3 = {1, 2, -1, 0, 2, 0};
755 ArrayFromVector<Int16Type, int16_t>(is_valid3, indices_values3, &indices3);
756
757 // Index out of bounds
758 auto indices4 = ArrayFromJSON(int16(), "[1, 2, null, 3, 2, 0]");
759
760 std::shared_ptr<Array> arr1, arr2, arr3, arr4;
761 ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices1, &arr1));
762 ASSERT_RAISES(Invalid, DictionaryArray::FromArrays(dict_type, indices2, &arr2));
763 ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices3, &arr3));
764 ASSERT_RAISES(Invalid, DictionaryArray::FromArrays(dict_type, indices4, &arr4));
765}
766
767TEST(TestDictionary, TransposeBasic) {
768 std::shared_ptr<Array> arr, out, expected;
769
770 auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]");
771 auto dict_type = dictionary(int16(), dict);
772 auto indices = ArrayFromJSON(int16(), "[1, 2, 0, 0]");
773 // ["B", "C", "A", "A"]
774 ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, &arr));
775
776 // Transpose to same index type
777 {
778 auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
779 auto out_dict_type = dictionary(int16(), out_dict);
780
781 const std::vector<int32_t> transpose_map{1, 3, 2};
782 ASSERT_OK(internal::checked_cast<const DictionaryArray&>(*arr).Transpose(
783 default_memory_pool(), out_dict_type, transpose_map, &out));
784
785 auto expected_indices = ArrayFromJSON(int16(), "[3, 2, 1, 1]");
786 ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected));
787 AssertArraysEqual(*out, *expected);
788 }
789
790 // Transpose to other type
791 {
792 auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
793 auto out_dict_type = dictionary(int8(), out_dict);
794
795 const std::vector<int32_t> transpose_map{1, 3, 2};
796 ASSERT_OK(internal::checked_cast<const DictionaryArray&>(*arr).Transpose(
797 default_memory_pool(), out_dict_type, transpose_map, &out));
798
799 auto expected_indices = ArrayFromJSON(int8(), "[3, 2, 1, 1]");
800 ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected));
801 AssertArraysEqual(*expected, *out);
802 }
803}
804
805TEST(TestDictionary, TransposeNulls) {
806 std::shared_ptr<Array> arr, out, expected;
807
808 auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]");
809 auto dict_type = dictionary(int16(), dict);
810 auto indices = ArrayFromJSON(int16(), "[1, 2, null, 0]");
811 // ["B", "C", null, "A"]
812 ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, &arr));
813
814 auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
815 auto out_dict_type = dictionary(int16(), out_dict);
816
817 const std::vector<int32_t> transpose_map{1, 3, 2};
818 ASSERT_OK(internal::checked_cast<const DictionaryArray&>(*arr).Transpose(
819 default_memory_pool(), out_dict_type, transpose_map, &out));
820
821 auto expected_indices = ArrayFromJSON(int16(), "[3, 2, null, 1]");
822 ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected));
823 AssertArraysEqual(*expected, *out);
824}
825
826} // namespace arrow
827