1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include <array> |
19 | #include <cstdint> |
20 | #include <memory> |
21 | #include <ostream> |
22 | #include <string> |
23 | #include <vector> |
24 | |
25 | #include <gtest/gtest.h> |
26 | |
27 | #include "arrow/array.h" |
28 | #include "arrow/builder.h" |
29 | #include "arrow/memory_pool.h" |
30 | #include "arrow/status.h" |
31 | #include "arrow/test-common.h" |
32 | #include "arrow/test-util.h" |
33 | #include "arrow/type.h" |
34 | #include "arrow/util/checked_cast.h" |
35 | #include "arrow/util/decimal.h" |
36 | |
37 | namespace arrow { |
38 | |
39 | using std::string; |
40 | using std::vector; |
41 | |
42 | using internal::checked_cast; |
43 | |
44 | // ---------------------------------------------------------------------- |
45 | // Dictionary tests |
46 | |
47 | template <typename Type> |
48 | class TestDictionaryBuilder : public TestBuilder {}; |
49 | |
50 | typedef ::testing::Types<Int8Type, UInt8Type, Int16Type, UInt16Type, Int32Type, |
51 | UInt32Type, Int64Type, UInt64Type, FloatType, DoubleType> |
52 | PrimitiveDictionaries; |
53 | |
54 | TYPED_TEST_CASE(TestDictionaryBuilder, PrimitiveDictionaries); |
55 | |
56 | TYPED_TEST(TestDictionaryBuilder, Basic) { |
57 | DictionaryBuilder<TypeParam> builder(default_memory_pool()); |
58 | ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1))); |
59 | ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2))); |
60 | ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1))); |
61 | ASSERT_OK(builder.AppendNull()); |
62 | |
63 | ASSERT_EQ(builder.length(), 4); |
64 | ASSERT_EQ(builder.null_count(), 1); |
65 | |
66 | std::shared_ptr<Array> result; |
67 | ASSERT_OK(builder.Finish(&result)); |
68 | |
69 | // Build expected data |
70 | auto dict_array = ArrayFromJSON(std::make_shared<TypeParam>(), "[1, 2]" ); |
71 | auto dict_type = std::make_shared<DictionaryType>(int8(), dict_array); |
72 | |
73 | auto int_array = ArrayFromJSON(int8(), "[0, 1, 0, null]" ); |
74 | DictionaryArray expected(dict_type, int_array); |
75 | |
76 | ASSERT_TRUE(expected.Equals(result)); |
77 | } |
78 | |
79 | TYPED_TEST(TestDictionaryBuilder, ArrayConversion) { |
80 | auto type = std::make_shared<TypeParam>(); |
81 | |
82 | auto intermediate_result = ArrayFromJSON(type, "[1, 2, 1]" ); |
83 | DictionaryBuilder<TypeParam> dictionary_builder(default_memory_pool()); |
84 | ASSERT_OK(dictionary_builder.AppendArray(*intermediate_result)); |
85 | std::shared_ptr<Array> result; |
86 | ASSERT_OK(dictionary_builder.Finish(&result)); |
87 | |
88 | // Build expected data |
89 | auto dict_array = ArrayFromJSON(type, "[1, 2]" ); |
90 | auto dict_type = std::make_shared<DictionaryType>(int8(), dict_array); |
91 | |
92 | auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]" ); |
93 | DictionaryArray expected(dict_type, int_array); |
94 | |
95 | ASSERT_TRUE(expected.Equals(result)); |
96 | } |
97 | |
98 | TYPED_TEST(TestDictionaryBuilder, DoubleTableSize) { |
99 | using Scalar = typename TypeParam::c_type; |
100 | // Skip this test for (u)int8 |
101 | if (sizeof(Scalar) > 1) { |
102 | // Build the dictionary Array |
103 | DictionaryBuilder<TypeParam> builder(default_memory_pool()); |
104 | // Build expected data |
105 | NumericBuilder<TypeParam> dict_builder; |
106 | Int16Builder int_builder; |
107 | |
108 | // Fill with 1024 different values |
109 | for (int64_t i = 0; i < 1024; i++) { |
110 | ASSERT_OK(builder.Append(static_cast<Scalar>(i))); |
111 | ASSERT_OK(dict_builder.Append(static_cast<Scalar>(i))); |
112 | ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i))); |
113 | } |
114 | // Fill with an already existing value |
115 | for (int64_t i = 0; i < 1024; i++) { |
116 | ASSERT_OK(builder.Append(static_cast<Scalar>(1))); |
117 | ASSERT_OK(int_builder.Append(1)); |
118 | } |
119 | |
120 | // Finalize result |
121 | std::shared_ptr<Array> result; |
122 | FinishAndCheckPadding(&builder, &result); |
123 | |
124 | // Finalize expected data |
125 | std::shared_ptr<Array> dict_array; |
126 | ASSERT_OK(dict_builder.Finish(&dict_array)); |
127 | auto dtype = std::make_shared<DictionaryType>(int16(), dict_array); |
128 | std::shared_ptr<Array> int_array; |
129 | ASSERT_OK(int_builder.Finish(&int_array)); |
130 | |
131 | DictionaryArray expected(dtype, int_array); |
132 | ASSERT_TRUE(expected.Equals(result)); |
133 | } |
134 | } |
135 | |
136 | TYPED_TEST(TestDictionaryBuilder, DeltaDictionary) { |
137 | using c_type = typename TypeParam::c_type; |
138 | auto type = std::make_shared<TypeParam>(); |
139 | |
140 | DictionaryBuilder<TypeParam> builder(default_memory_pool()); |
141 | |
142 | ASSERT_OK(builder.Append(static_cast<c_type>(1))); |
143 | ASSERT_OK(builder.Append(static_cast<c_type>(2))); |
144 | ASSERT_OK(builder.Append(static_cast<c_type>(1))); |
145 | ASSERT_OK(builder.Append(static_cast<c_type>(2))); |
146 | std::shared_ptr<Array> result; |
147 | FinishAndCheckPadding(&builder, &result); |
148 | |
149 | // Build expected data for the initial dictionary |
150 | auto dict_type1 = dictionary(int8(), ArrayFromJSON(type, "[1, 2]" )); |
151 | DictionaryArray expected(dict_type1, ArrayFromJSON(int8(), "[0, 1, 0, 1]" )); |
152 | |
153 | ASSERT_TRUE(expected.Equals(result)); |
154 | |
155 | // extend the dictionary builder with new data |
156 | ASSERT_OK(builder.Append(static_cast<c_type>(2))); |
157 | ASSERT_OK(builder.Append(static_cast<c_type>(3))); |
158 | ASSERT_OK(builder.Append(static_cast<c_type>(3))); |
159 | ASSERT_OK(builder.Append(static_cast<c_type>(1))); |
160 | ASSERT_OK(builder.Append(static_cast<c_type>(3))); |
161 | |
162 | std::shared_ptr<Array> result_delta; |
163 | ASSERT_OK(builder.Finish(&result_delta)); |
164 | |
165 | // Build expected data for the delta dictionary |
166 | auto dict_type2 = dictionary(int8(), ArrayFromJSON(type, "[3]" )); |
167 | DictionaryArray expected_delta(dict_type2, ArrayFromJSON(int8(), "[1, 2, 2, 0, 2]" )); |
168 | |
169 | ASSERT_TRUE(expected_delta.Equals(result_delta)); |
170 | } |
171 | |
172 | TYPED_TEST(TestDictionaryBuilder, DoubleDeltaDictionary) { |
173 | using c_type = typename TypeParam::c_type; |
174 | auto type = std::make_shared<TypeParam>(); |
175 | |
176 | DictionaryBuilder<TypeParam> builder(default_memory_pool()); |
177 | |
178 | ASSERT_OK(builder.Append(static_cast<c_type>(1))); |
179 | ASSERT_OK(builder.Append(static_cast<c_type>(2))); |
180 | ASSERT_OK(builder.Append(static_cast<c_type>(1))); |
181 | ASSERT_OK(builder.Append(static_cast<c_type>(2))); |
182 | std::shared_ptr<Array> result; |
183 | FinishAndCheckPadding(&builder, &result); |
184 | |
185 | // Build expected data for the initial dictionary |
186 | auto dict_type1 = dictionary(int8(), ArrayFromJSON(type, "[1, 2]" )); |
187 | DictionaryArray expected(dict_type1, ArrayFromJSON(int8(), "[0, 1, 0, 1]" )); |
188 | |
189 | ASSERT_TRUE(expected.Equals(result)); |
190 | |
191 | // extend the dictionary builder with new data |
192 | ASSERT_OK(builder.Append(static_cast<c_type>(2))); |
193 | ASSERT_OK(builder.Append(static_cast<c_type>(3))); |
194 | ASSERT_OK(builder.Append(static_cast<c_type>(3))); |
195 | ASSERT_OK(builder.Append(static_cast<c_type>(1))); |
196 | ASSERT_OK(builder.Append(static_cast<c_type>(3))); |
197 | |
198 | std::shared_ptr<Array> result_delta1; |
199 | ASSERT_OK(builder.Finish(&result_delta1)); |
200 | |
201 | // Build expected data for the delta dictionary |
202 | auto dict_type2 = dictionary(int8(), ArrayFromJSON(type, "[3]" )); |
203 | DictionaryArray expected_delta1(dict_type2, ArrayFromJSON(int8(), "[1, 2, 2, 0, 2]" )); |
204 | |
205 | ASSERT_TRUE(expected_delta1.Equals(result_delta1)); |
206 | |
207 | // extend the dictionary builder with new data again |
208 | ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1))); |
209 | ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2))); |
210 | ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(3))); |
211 | ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(4))); |
212 | ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(5))); |
213 | |
214 | std::shared_ptr<Array> result_delta2; |
215 | ASSERT_OK(builder.Finish(&result_delta2)); |
216 | |
217 | // Build expected data for the delta dictionary again |
218 | auto dict_type3 = dictionary(int8(), ArrayFromJSON(type, "[4, 5]" )); |
219 | DictionaryArray expected_delta2(dict_type3, ArrayFromJSON(int8(), "[0, 1, 2, 3, 4]" )); |
220 | |
221 | ASSERT_TRUE(expected_delta2.Equals(result_delta2)); |
222 | } |
223 | |
224 | TEST(TestStringDictionaryBuilder, Basic) { |
225 | // Build the dictionary Array |
226 | StringDictionaryBuilder builder(default_memory_pool()); |
227 | ASSERT_OK(builder.Append("test" )); |
228 | ASSERT_OK(builder.Append("test2" )); |
229 | ASSERT_OK(builder.Append("test" )); |
230 | |
231 | std::shared_ptr<Array> result; |
232 | ASSERT_OK(builder.Finish(&result)); |
233 | |
234 | // Build expected data |
235 | auto dtype = dictionary(int8(), ArrayFromJSON(utf8(), "[\"test\", \"test2\"]" )); |
236 | auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]" ); |
237 | DictionaryArray expected(dtype, int_array); |
238 | |
239 | ASSERT_TRUE(expected.Equals(result)); |
240 | } |
241 | |
242 | // ARROW-4367 |
243 | TEST(TestStringDictionaryBuilder, OnlyNull) { |
244 | // Build the dictionary Array |
245 | StringDictionaryBuilder builder(default_memory_pool()); |
246 | ASSERT_OK(builder.AppendNull()); |
247 | |
248 | std::shared_ptr<Array> result; |
249 | ASSERT_OK(builder.Finish(&result)); |
250 | |
251 | // Build expected data |
252 | auto dtype = dictionary(int8(), ArrayFromJSON(utf8(), "[]" )); |
253 | auto int_array = ArrayFromJSON(int8(), "[null]" ); |
254 | DictionaryArray expected(dtype, int_array); |
255 | |
256 | ASSERT_TRUE(expected.Equals(result)); |
257 | } |
258 | |
259 | TEST(TestStringDictionaryBuilder, DoubleTableSize) { |
260 | // Build the dictionary Array |
261 | StringDictionaryBuilder builder(default_memory_pool()); |
262 | // Build expected data |
263 | StringBuilder str_builder; |
264 | Int16Builder int_builder; |
265 | |
266 | // Fill with 1024 different values |
267 | for (int64_t i = 0; i < 1024; i++) { |
268 | std::stringstream ss; |
269 | ss << "test" << i; |
270 | ASSERT_OK(builder.Append(ss.str())); |
271 | ASSERT_OK(str_builder.Append(ss.str())); |
272 | ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i))); |
273 | } |
274 | // Fill with an already existing value |
275 | for (int64_t i = 0; i < 1024; i++) { |
276 | ASSERT_OK(builder.Append("test1" )); |
277 | ASSERT_OK(int_builder.Append(1)); |
278 | } |
279 | |
280 | // Finalize result |
281 | std::shared_ptr<Array> result; |
282 | FinishAndCheckPadding(&builder, &result); |
283 | |
284 | // Finalize expected data |
285 | std::shared_ptr<Array> str_array; |
286 | ASSERT_OK(str_builder.Finish(&str_array)); |
287 | auto dtype = std::make_shared<DictionaryType>(int16(), str_array); |
288 | std::shared_ptr<Array> int_array; |
289 | ASSERT_OK(int_builder.Finish(&int_array)); |
290 | |
291 | DictionaryArray expected(dtype, int_array); |
292 | ASSERT_TRUE(expected.Equals(result)); |
293 | } |
294 | |
295 | TEST(TestStringDictionaryBuilder, DeltaDictionary) { |
296 | // Build the dictionary Array |
297 | StringDictionaryBuilder builder(default_memory_pool()); |
298 | ASSERT_OK(builder.Append("test" )); |
299 | ASSERT_OK(builder.Append("test2" )); |
300 | ASSERT_OK(builder.Append("test" )); |
301 | |
302 | std::shared_ptr<Array> result; |
303 | ASSERT_OK(builder.Finish(&result)); |
304 | |
305 | // Build expected data |
306 | auto dtype = dictionary(int8(), ArrayFromJSON(utf8(), "[\"test\", \"test2\"]" )); |
307 | auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]" ); |
308 | DictionaryArray expected(dtype, int_array); |
309 | |
310 | ASSERT_TRUE(expected.Equals(result)); |
311 | |
312 | // build a delta dictionary |
313 | ASSERT_OK(builder.Append("test2" )); |
314 | ASSERT_OK(builder.Append("test3" )); |
315 | ASSERT_OK(builder.Append("test2" )); |
316 | |
317 | std::shared_ptr<Array> result_delta; |
318 | FinishAndCheckPadding(&builder, &result_delta); |
319 | |
320 | // Build expected data |
321 | auto dtype2 = dictionary(int8(), ArrayFromJSON(utf8(), "[\"test3\"]" )); |
322 | auto int_array2 = ArrayFromJSON(int8(), "[1, 2, 1]" ); |
323 | DictionaryArray expected_delta(dtype2, int_array2); |
324 | |
325 | ASSERT_TRUE(expected_delta.Equals(result_delta)); |
326 | } |
327 | |
328 | TEST(TestStringDictionaryBuilder, BigDeltaDictionary) { |
329 | constexpr int16_t kTestLength = 2048; |
330 | // Build the dictionary Array |
331 | StringDictionaryBuilder builder(default_memory_pool()); |
332 | |
333 | StringBuilder str_builder1; |
334 | Int16Builder int_builder1; |
335 | |
336 | for (int16_t idx = 0; idx < kTestLength; ++idx) { |
337 | std::stringstream sstream; |
338 | sstream << "test" << idx; |
339 | ASSERT_OK(builder.Append(sstream.str())); |
340 | ASSERT_OK(str_builder1.Append(sstream.str())); |
341 | ASSERT_OK(int_builder1.Append(idx)); |
342 | } |
343 | |
344 | std::shared_ptr<Array> result; |
345 | FinishAndCheckPadding(&builder, &result); |
346 | |
347 | std::shared_ptr<Array> str_array1; |
348 | ASSERT_OK(str_builder1.Finish(&str_array1)); |
349 | auto dtype1 = std::make_shared<DictionaryType>(int16(), str_array1); |
350 | |
351 | std::shared_ptr<Array> int_array1; |
352 | ASSERT_OK(int_builder1.Finish(&int_array1)); |
353 | |
354 | DictionaryArray expected(dtype1, int_array1); |
355 | ASSERT_TRUE(expected.Equals(result)); |
356 | |
357 | // build delta 1 |
358 | StringBuilder str_builder2; |
359 | Int16Builder int_builder2; |
360 | |
361 | for (int16_t idx = 0; idx < kTestLength; ++idx) { |
362 | ASSERT_OK(builder.Append("test1" )); |
363 | ASSERT_OK(int_builder2.Append(1)); |
364 | } |
365 | |
366 | for (int16_t idx = 0; idx < kTestLength; ++idx) { |
367 | ASSERT_OK(builder.Append("test_new_value1" )); |
368 | ASSERT_OK(int_builder2.Append(kTestLength)); |
369 | } |
370 | ASSERT_OK(str_builder2.Append("test_new_value1" )); |
371 | |
372 | std::shared_ptr<Array> result2; |
373 | ASSERT_OK(builder.Finish(&result2)); |
374 | |
375 | std::shared_ptr<Array> str_array2; |
376 | ASSERT_OK(str_builder2.Finish(&str_array2)); |
377 | auto dtype2 = std::make_shared<DictionaryType>(int16(), str_array2); |
378 | |
379 | std::shared_ptr<Array> int_array2; |
380 | ASSERT_OK(int_builder2.Finish(&int_array2)); |
381 | |
382 | DictionaryArray expected2(dtype2, int_array2); |
383 | ASSERT_TRUE(expected2.Equals(result2)); |
384 | |
385 | // build delta 2 |
386 | StringBuilder str_builder3; |
387 | Int16Builder int_builder3; |
388 | |
389 | for (int16_t idx = 0; idx < kTestLength; ++idx) { |
390 | ASSERT_OK(builder.Append("test2" )); |
391 | ASSERT_OK(int_builder3.Append(2)); |
392 | } |
393 | |
394 | for (int16_t idx = 0; idx < kTestLength; ++idx) { |
395 | ASSERT_OK(builder.Append("test_new_value2" )); |
396 | ASSERT_OK(int_builder3.Append(kTestLength + 1)); |
397 | } |
398 | ASSERT_OK(str_builder3.Append("test_new_value2" )); |
399 | |
400 | std::shared_ptr<Array> result3; |
401 | ASSERT_OK(builder.Finish(&result3)); |
402 | |
403 | std::shared_ptr<Array> str_array3; |
404 | ASSERT_OK(str_builder3.Finish(&str_array3)); |
405 | auto dtype3 = std::make_shared<DictionaryType>(int16(), str_array3); |
406 | |
407 | std::shared_ptr<Array> int_array3; |
408 | ASSERT_OK(int_builder3.Finish(&int_array3)); |
409 | |
410 | DictionaryArray expected3(dtype3, int_array3); |
411 | ASSERT_TRUE(expected3.Equals(result3)); |
412 | } |
413 | |
414 | TEST(TestFixedSizeBinaryDictionaryBuilder, Basic) { |
415 | // Build the dictionary Array |
416 | DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4), |
417 | default_memory_pool()); |
418 | std::vector<uint8_t> test{12, 12, 11, 12}; |
419 | std::vector<uint8_t> test2{12, 12, 11, 11}; |
420 | ASSERT_OK(builder.Append(test.data())); |
421 | ASSERT_OK(builder.Append(test2.data())); |
422 | ASSERT_OK(builder.Append(test.data())); |
423 | |
424 | std::shared_ptr<Array> result; |
425 | FinishAndCheckPadding(&builder, &result); |
426 | |
427 | // Build expected data |
428 | FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(4)); |
429 | ASSERT_OK(fsb_builder.Append(test.data())); |
430 | ASSERT_OK(fsb_builder.Append(test2.data())); |
431 | std::shared_ptr<Array> fsb_array; |
432 | ASSERT_OK(fsb_builder.Finish(&fsb_array)); |
433 | auto dtype = std::make_shared<DictionaryType>(int8(), fsb_array); |
434 | |
435 | Int8Builder int_builder; |
436 | ASSERT_OK(int_builder.Append(0)); |
437 | ASSERT_OK(int_builder.Append(1)); |
438 | ASSERT_OK(int_builder.Append(0)); |
439 | std::shared_ptr<Array> int_array; |
440 | ASSERT_OK(int_builder.Finish(&int_array)); |
441 | |
442 | DictionaryArray expected(dtype, int_array); |
443 | ASSERT_TRUE(expected.Equals(result)); |
444 | } |
445 | |
446 | TEST(TestFixedSizeBinaryDictionaryBuilder, DeltaDictionary) { |
447 | // Build the dictionary Array |
448 | DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4), |
449 | default_memory_pool()); |
450 | std::vector<uint8_t> test{12, 12, 11, 12}; |
451 | std::vector<uint8_t> test2{12, 12, 11, 11}; |
452 | std::vector<uint8_t> test3{12, 12, 11, 10}; |
453 | |
454 | ASSERT_OK(builder.Append(test.data())); |
455 | ASSERT_OK(builder.Append(test2.data())); |
456 | ASSERT_OK(builder.Append(test.data())); |
457 | |
458 | std::shared_ptr<Array> result1; |
459 | FinishAndCheckPadding(&builder, &result1); |
460 | |
461 | // Build expected data |
462 | FixedSizeBinaryBuilder fsb_builder1(arrow::fixed_size_binary(4)); |
463 | ASSERT_OK(fsb_builder1.Append(test.data())); |
464 | ASSERT_OK(fsb_builder1.Append(test2.data())); |
465 | std::shared_ptr<Array> fsb_array1; |
466 | ASSERT_OK(fsb_builder1.Finish(&fsb_array1)); |
467 | auto dtype1 = std::make_shared<DictionaryType>(int8(), fsb_array1); |
468 | |
469 | Int8Builder int_builder1; |
470 | ASSERT_OK(int_builder1.Append(0)); |
471 | ASSERT_OK(int_builder1.Append(1)); |
472 | ASSERT_OK(int_builder1.Append(0)); |
473 | std::shared_ptr<Array> int_array1; |
474 | ASSERT_OK(int_builder1.Finish(&int_array1)); |
475 | |
476 | DictionaryArray expected1(dtype1, int_array1); |
477 | ASSERT_TRUE(expected1.Equals(result1)); |
478 | |
479 | // build delta dictionary |
480 | ASSERT_OK(builder.Append(test.data())); |
481 | ASSERT_OK(builder.Append(test2.data())); |
482 | ASSERT_OK(builder.Append(test3.data())); |
483 | |
484 | std::shared_ptr<Array> result2; |
485 | FinishAndCheckPadding(&builder, &result2); |
486 | |
487 | // Build expected data |
488 | FixedSizeBinaryBuilder fsb_builder2(arrow::fixed_size_binary(4)); |
489 | ASSERT_OK(fsb_builder2.Append(test3.data())); |
490 | std::shared_ptr<Array> fsb_array2; |
491 | ASSERT_OK(fsb_builder2.Finish(&fsb_array2)); |
492 | auto dtype2 = std::make_shared<DictionaryType>(int8(), fsb_array2); |
493 | |
494 | Int8Builder int_builder2; |
495 | ASSERT_OK(int_builder2.Append(0)); |
496 | ASSERT_OK(int_builder2.Append(1)); |
497 | ASSERT_OK(int_builder2.Append(2)); |
498 | std::shared_ptr<Array> int_array2; |
499 | ASSERT_OK(int_builder2.Finish(&int_array2)); |
500 | |
501 | DictionaryArray expected2(dtype2, int_array2); |
502 | ASSERT_TRUE(expected2.Equals(result2)); |
503 | } |
504 | |
505 | TEST(TestFixedSizeBinaryDictionaryBuilder, DoubleTableSize) { |
506 | // Build the dictionary Array |
507 | DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4), |
508 | default_memory_pool()); |
509 | // Build expected data |
510 | FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(4)); |
511 | Int16Builder int_builder; |
512 | |
513 | // Fill with 1024 different values |
514 | for (int64_t i = 0; i < 1024; i++) { |
515 | std::vector<uint8_t> value{12, 12, static_cast<uint8_t>(i / 128), |
516 | static_cast<uint8_t>(i % 128)}; |
517 | ASSERT_OK(builder.Append(value.data())); |
518 | ASSERT_OK(fsb_builder.Append(value.data())); |
519 | ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i))); |
520 | } |
521 | // Fill with an already existing value |
522 | std::vector<uint8_t> known_value{12, 12, 0, 1}; |
523 | for (int64_t i = 0; i < 1024; i++) { |
524 | ASSERT_OK(builder.Append(known_value.data())); |
525 | ASSERT_OK(int_builder.Append(1)); |
526 | } |
527 | |
528 | // Finalize result |
529 | std::shared_ptr<Array> result; |
530 | ASSERT_OK(builder.Finish(&result)); |
531 | |
532 | // Finalize expected data |
533 | std::shared_ptr<Array> fsb_array; |
534 | ASSERT_OK(fsb_builder.Finish(&fsb_array)); |
535 | auto dtype = std::make_shared<DictionaryType>(int16(), fsb_array); |
536 | std::shared_ptr<Array> int_array; |
537 | ASSERT_OK(int_builder.Finish(&int_array)); |
538 | |
539 | DictionaryArray expected(dtype, int_array); |
540 | ASSERT_TRUE(expected.Equals(result)); |
541 | } |
542 | |
543 | TEST(TestFixedSizeBinaryDictionaryBuilder, InvalidTypeAppend) { |
544 | // Build the dictionary Array |
545 | DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4), |
546 | default_memory_pool()); |
547 | // Build an array with different byte width |
548 | FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(5)); |
549 | std::vector<uint8_t> value{100, 1, 1, 1, 1}; |
550 | ASSERT_OK(fsb_builder.Append(value.data())); |
551 | std::shared_ptr<Array> fsb_array; |
552 | ASSERT_OK(fsb_builder.Finish(&fsb_array)); |
553 | |
554 | ASSERT_RAISES(Invalid, builder.AppendArray(*fsb_array)); |
555 | } |
556 | |
557 | TEST(TestDecimalDictionaryBuilder, Basic) { |
558 | // Build the dictionary Array |
559 | auto decimal_type = arrow::decimal(2, 0); |
560 | DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type, default_memory_pool()); |
561 | |
562 | // Test data |
563 | std::vector<Decimal128> test{12, 12, 11, 12}; |
564 | for (const auto& value : test) { |
565 | ASSERT_OK(builder.Append(value.ToBytes().data())); |
566 | } |
567 | |
568 | std::shared_ptr<Array> result; |
569 | ASSERT_OK(builder.Finish(&result)); |
570 | |
571 | // Build expected data |
572 | auto dtype = dictionary(int8(), ArrayFromJSON(decimal_type, "[\"12\", \"11\"]" )); |
573 | DictionaryArray expected(dtype, ArrayFromJSON(int8(), "[0, 0, 1, 0]" )); |
574 | |
575 | ASSERT_TRUE(expected.Equals(result)); |
576 | } |
577 | |
578 | TEST(TestDecimalDictionaryBuilder, DoubleTableSize) { |
579 | const auto& decimal_type = arrow::decimal(21, 0); |
580 | |
581 | // Build the dictionary Array |
582 | DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type, default_memory_pool()); |
583 | |
584 | // Build expected data |
585 | FixedSizeBinaryBuilder fsb_builder(decimal_type); |
586 | Int16Builder int_builder; |
587 | |
588 | // Fill with 1024 different values |
589 | for (int64_t i = 0; i < 1024; i++) { |
590 | const uint8_t bytes[] = {0, |
591 | 0, |
592 | 0, |
593 | 0, |
594 | 0, |
595 | 0, |
596 | 0, |
597 | 0, |
598 | 0, |
599 | 0, |
600 | 0, |
601 | 0, |
602 | 12, |
603 | 12, |
604 | static_cast<uint8_t>(i / 128), |
605 | static_cast<uint8_t>(i % 128)}; |
606 | ASSERT_OK(builder.Append(bytes)); |
607 | ASSERT_OK(fsb_builder.Append(bytes)); |
608 | ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i))); |
609 | } |
610 | // Fill with an already existing value |
611 | const uint8_t known_value[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 1}; |
612 | for (int64_t i = 0; i < 1024; i++) { |
613 | ASSERT_OK(builder.Append(known_value)); |
614 | ASSERT_OK(int_builder.Append(1)); |
615 | } |
616 | |
617 | // Finalize result |
618 | std::shared_ptr<Array> result; |
619 | ASSERT_OK(builder.Finish(&result)); |
620 | |
621 | // Finalize expected data |
622 | std::shared_ptr<Array> fsb_array; |
623 | ASSERT_OK(fsb_builder.Finish(&fsb_array)); |
624 | |
625 | auto dtype = std::make_shared<DictionaryType>(int16(), fsb_array); |
626 | std::shared_ptr<Array> int_array; |
627 | ASSERT_OK(int_builder.Finish(&int_array)); |
628 | |
629 | DictionaryArray expected(dtype, int_array); |
630 | ASSERT_TRUE(expected.Equals(result)); |
631 | } |
632 | |
633 | // ---------------------------------------------------------------------- |
634 | // DictionaryArray tests |
635 | |
636 | TEST(TestDictionary, Basics) { |
637 | vector<int32_t> values = {100, 1000, 10000, 100000}; |
638 | std::shared_ptr<Array> dict; |
639 | ArrayFromVector<Int32Type, int32_t>(values, &dict); |
640 | |
641 | std::shared_ptr<DictionaryType> type1 = |
642 | std::dynamic_pointer_cast<DictionaryType>(dictionary(int16(), dict)); |
643 | |
644 | auto type2 = |
645 | std::dynamic_pointer_cast<DictionaryType>(::arrow::dictionary(int16(), dict, true)); |
646 | |
647 | ASSERT_TRUE(int16()->Equals(type1->index_type())); |
648 | ASSERT_TRUE(type1->dictionary()->Equals(dict)); |
649 | |
650 | ASSERT_TRUE(int16()->Equals(type2->index_type())); |
651 | ASSERT_TRUE(type2->dictionary()->Equals(dict)); |
652 | |
653 | ASSERT_EQ("dictionary<values=int32, indices=int16, ordered=0>" , type1->ToString()); |
654 | ASSERT_EQ("dictionary<values=int32, indices=int16, ordered=1>" , type2->ToString()); |
655 | } |
656 | |
657 | TEST(TestDictionary, Equals) { |
658 | vector<bool> is_valid = {true, true, false, true, true, true}; |
659 | std::shared_ptr<Array> dict, dict2, indices, indices2, indices3; |
660 | |
661 | dict = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]" ); |
662 | std::shared_ptr<DataType> dict_type = dictionary(int16(), dict); |
663 | |
664 | dict2 = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\", \"qux\"]" ); |
665 | std::shared_ptr<DataType> dict2_type = dictionary(int16(), dict2); |
666 | |
667 | vector<int16_t> indices_values = {1, 2, -1, 0, 2, 0}; |
668 | ArrayFromVector<Int16Type, int16_t>(is_valid, indices_values, &indices); |
669 | |
670 | vector<int16_t> indices2_values = {1, 2, 0, 0, 2, 0}; |
671 | ArrayFromVector<Int16Type, int16_t>(is_valid, indices2_values, &indices2); |
672 | |
673 | vector<int16_t> indices3_values = {1, 1, 0, 0, 2, 0}; |
674 | ArrayFromVector<Int16Type, int16_t>(is_valid, indices3_values, &indices3); |
675 | |
676 | auto array = std::make_shared<DictionaryArray>(dict_type, indices); |
677 | auto array2 = std::make_shared<DictionaryArray>(dict_type, indices2); |
678 | auto array3 = std::make_shared<DictionaryArray>(dict2_type, indices); |
679 | auto array4 = std::make_shared<DictionaryArray>(dict_type, indices3); |
680 | |
681 | ASSERT_TRUE(array->Equals(array)); |
682 | |
683 | // Equal, because the unequal index is masked by null |
684 | ASSERT_TRUE(array->Equals(array2)); |
685 | |
686 | // Unequal dictionaries |
687 | ASSERT_FALSE(array->Equals(array3)); |
688 | |
689 | // Unequal indices |
690 | ASSERT_FALSE(array->Equals(array4)); |
691 | |
692 | // RangeEquals |
693 | ASSERT_TRUE(array->RangeEquals(3, 6, 3, array4)); |
694 | ASSERT_FALSE(array->RangeEquals(1, 3, 1, array4)); |
695 | |
696 | // ARROW-33 Test slices |
697 | const int64_t size = array->length(); |
698 | |
699 | std::shared_ptr<Array> slice, slice2; |
700 | slice = array->Array::Slice(2); |
701 | slice2 = array->Array::Slice(2); |
702 | ASSERT_EQ(size - 2, slice->length()); |
703 | |
704 | ASSERT_TRUE(slice->Equals(slice2)); |
705 | ASSERT_TRUE(array->RangeEquals(2, array->length(), 0, slice)); |
706 | |
707 | // Chained slices |
708 | slice2 = array->Array::Slice(1)->Array::Slice(1); |
709 | ASSERT_TRUE(slice->Equals(slice2)); |
710 | |
711 | slice = array->Slice(1, 3); |
712 | slice2 = array->Slice(1, 3); |
713 | ASSERT_EQ(3, slice->length()); |
714 | |
715 | ASSERT_TRUE(slice->Equals(slice2)); |
716 | ASSERT_TRUE(array->RangeEquals(1, 4, 0, slice)); |
717 | } |
718 | |
719 | TEST(TestDictionary, Validate) { |
720 | auto dict = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]" ); |
721 | std::shared_ptr<DataType> dict_type = dictionary(int16(), dict); |
722 | |
723 | auto indices = ArrayFromJSON(int16(), "[1, 2, null, 0, 2, 0]" ); |
724 | std::shared_ptr<Array> arr = std::make_shared<DictionaryArray>(dict_type, indices); |
725 | |
726 | // Only checking index type for now |
727 | ASSERT_OK(ValidateArray(*arr)); |
728 | |
729 | // TODO(wesm) In ARROW-1199, there is now a DCHECK to compare the indices |
730 | // type with the dict_type. How can we test for this? |
731 | |
732 | // std::shared_ptr<Array> indices2; |
733 | // vector<float> indices2_values = {1., 2., 0., 0., 2., 0.}; |
734 | // ArrayFromVector<FloatType, float>(is_valid, indices2_values, &indices2); |
735 | |
736 | // std::shared_ptr<Array> indices3; |
737 | // vector<int64_t> indices3_values = {1, 2, 0, 0, 2, 0}; |
738 | // ArrayFromVector<Int64Type, int64_t>(is_valid, indices3_values, &indices3); |
739 | // std::shared_ptr<Array> arr2 = std::make_shared<DictionaryArray>(dict_type, indices2); |
740 | // std::shared_ptr<Array> arr3 = std::make_shared<DictionaryArray>(dict_type, indices3); |
741 | // ASSERT_OK(ValidateArray(*arr3)); |
742 | } |
743 | |
744 | TEST(TestDictionary, FromArray) { |
745 | auto dict = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]" ); |
746 | std::shared_ptr<DataType> dict_type = dictionary(int16(), dict); |
747 | |
748 | auto indices1 = ArrayFromJSON(int16(), "[1, 2, 0, 0, 2, 0]" ); |
749 | auto indices2 = ArrayFromJSON(int16(), "[1, 2, 0, 3, 2, 0]" ); |
750 | |
751 | // Invalid index is masked by null |
752 | std::shared_ptr<Array> indices3; |
753 | vector<bool> is_valid3 = {true, true, false, true, true, true}; |
754 | vector<int16_t> indices_values3 = {1, 2, -1, 0, 2, 0}; |
755 | ArrayFromVector<Int16Type, int16_t>(is_valid3, indices_values3, &indices3); |
756 | |
757 | // Index out of bounds |
758 | auto indices4 = ArrayFromJSON(int16(), "[1, 2, null, 3, 2, 0]" ); |
759 | |
760 | std::shared_ptr<Array> arr1, arr2, arr3, arr4; |
761 | ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices1, &arr1)); |
762 | ASSERT_RAISES(Invalid, DictionaryArray::FromArrays(dict_type, indices2, &arr2)); |
763 | ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices3, &arr3)); |
764 | ASSERT_RAISES(Invalid, DictionaryArray::FromArrays(dict_type, indices4, &arr4)); |
765 | } |
766 | |
767 | TEST(TestDictionary, TransposeBasic) { |
768 | std::shared_ptr<Array> arr, out, expected; |
769 | |
770 | auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]" ); |
771 | auto dict_type = dictionary(int16(), dict); |
772 | auto indices = ArrayFromJSON(int16(), "[1, 2, 0, 0]" ); |
773 | // ["B", "C", "A", "A"] |
774 | ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, &arr)); |
775 | |
776 | // Transpose to same index type |
777 | { |
778 | auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]" ); |
779 | auto out_dict_type = dictionary(int16(), out_dict); |
780 | |
781 | const std::vector<int32_t> transpose_map{1, 3, 2}; |
782 | ASSERT_OK(internal::checked_cast<const DictionaryArray&>(*arr).Transpose( |
783 | default_memory_pool(), out_dict_type, transpose_map, &out)); |
784 | |
785 | auto expected_indices = ArrayFromJSON(int16(), "[3, 2, 1, 1]" ); |
786 | ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected)); |
787 | AssertArraysEqual(*out, *expected); |
788 | } |
789 | |
790 | // Transpose to other type |
791 | { |
792 | auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]" ); |
793 | auto out_dict_type = dictionary(int8(), out_dict); |
794 | |
795 | const std::vector<int32_t> transpose_map{1, 3, 2}; |
796 | ASSERT_OK(internal::checked_cast<const DictionaryArray&>(*arr).Transpose( |
797 | default_memory_pool(), out_dict_type, transpose_map, &out)); |
798 | |
799 | auto expected_indices = ArrayFromJSON(int8(), "[3, 2, 1, 1]" ); |
800 | ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected)); |
801 | AssertArraysEqual(*expected, *out); |
802 | } |
803 | } |
804 | |
805 | TEST(TestDictionary, TransposeNulls) { |
806 | std::shared_ptr<Array> arr, out, expected; |
807 | |
808 | auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]" ); |
809 | auto dict_type = dictionary(int16(), dict); |
810 | auto indices = ArrayFromJSON(int16(), "[1, 2, null, 0]" ); |
811 | // ["B", "C", null, "A"] |
812 | ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, &arr)); |
813 | |
814 | auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]" ); |
815 | auto out_dict_type = dictionary(int16(), out_dict); |
816 | |
817 | const std::vector<int32_t> transpose_map{1, 3, 2}; |
818 | ASSERT_OK(internal::checked_cast<const DictionaryArray&>(*arr).Transpose( |
819 | default_memory_pool(), out_dict_type, transpose_map, &out)); |
820 | |
821 | auto expected_indices = ArrayFromJSON(int16(), "[3, 2, null, 1]" ); |
822 | ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected)); |
823 | AssertArraysEqual(*expected, *out); |
824 | } |
825 | |
826 | } // namespace arrow |
827 | |