1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #pragma once |
19 | |
20 | #include <limits> |
21 | #include <memory> |
22 | #include <random> |
23 | #include <string> |
24 | #include <utility> |
25 | #include <vector> |
26 | |
27 | #include "arrow/api.h" |
28 | #include "arrow/test-util.h" |
29 | #include "arrow/type_traits.h" |
30 | #include "arrow/util/decimal.h" |
31 | |
32 | #include "parquet/arrow/record_reader.h" |
33 | |
34 | namespace parquet { |
35 | |
36 | using internal::RecordReader; |
37 | |
38 | namespace arrow { |
39 | |
40 | using ::arrow::Array; |
41 | using ::arrow::Status; |
42 | |
43 | template <int32_t PRECISION> |
44 | struct DecimalWithPrecisionAndScale { |
45 | static_assert(PRECISION >= 1 && PRECISION <= 38, "Invalid precision value" ); |
46 | |
47 | using type = ::arrow::Decimal128Type; |
48 | static constexpr ::arrow::Type::type type_id = ::arrow::Decimal128Type::type_id; |
49 | static constexpr int32_t precision = PRECISION; |
50 | static constexpr int32_t scale = PRECISION - 1; |
51 | }; |
52 | |
/// True when the C type backing `T` (i.e. `T::c_type`) is a floating point type.
template <class T>
using is_arrow_float = std::is_floating_point<typename T::c_type>;

/// True when the C type backing `T` (i.e. `T::c_type`) is an integral type.
template <class T>
using is_arrow_int = std::is_integral<typename T::c_type>;
58 | |
59 | template <typename ArrowType> |
60 | using is_arrow_date = std::is_same<ArrowType, ::arrow::Date64Type>; |
61 | |
62 | template <typename ArrowType> |
63 | using is_arrow_string = std::is_same<ArrowType, ::arrow::StringType>; |
64 | |
65 | template <typename ArrowType> |
66 | using is_arrow_binary = std::is_same<ArrowType, ::arrow::BinaryType>; |
67 | |
68 | template <typename ArrowType> |
69 | using is_arrow_fixed_size_binary = std::is_same<ArrowType, ::arrow::FixedSizeBinaryType>; |
70 | |
71 | template <typename ArrowType> |
72 | using is_arrow_bool = std::is_same<ArrowType, ::arrow::BooleanType>; |
73 | |
74 | template <class ArrowType> |
75 | typename std::enable_if<is_arrow_float<ArrowType>::value, Status>::type NonNullArray( |
76 | size_t size, std::shared_ptr<Array>* out) { |
77 | using c_type = typename ArrowType::c_type; |
78 | std::vector<c_type> values; |
79 | ::arrow::random_real(size, 0, static_cast<c_type>(0), static_cast<c_type>(1), &values); |
80 | ::arrow::NumericBuilder<ArrowType> builder; |
81 | RETURN_NOT_OK(builder.AppendValues(values.data(), values.size())); |
82 | return builder.Finish(out); |
83 | } |
84 | |
85 | template <class ArrowType> |
86 | typename std::enable_if< |
87 | is_arrow_int<ArrowType>::value && !is_arrow_date<ArrowType>::value, Status>::type |
88 | NonNullArray(size_t size, std::shared_ptr<Array>* out) { |
89 | std::vector<typename ArrowType::c_type> values; |
90 | ::arrow::randint(size, 0, 64, &values); |
91 | |
92 | // Passing data type so this will work with TimestampType too |
93 | ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(), |
94 | ::arrow::default_memory_pool()); |
95 | RETURN_NOT_OK(builder.AppendValues(values.data(), values.size())); |
96 | return builder.Finish(out); |
97 | } |
98 | |
99 | template <class ArrowType> |
100 | typename std::enable_if<is_arrow_date<ArrowType>::value, Status>::type NonNullArray( |
101 | size_t size, std::shared_ptr<Array>* out) { |
102 | std::vector<typename ArrowType::c_type> values; |
103 | ::arrow::randint(size, 0, 64, &values); |
104 | for (size_t i = 0; i < size; i++) { |
105 | values[i] *= 86400000; |
106 | } |
107 | |
108 | // Passing data type so this will work with TimestampType too |
109 | ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(), |
110 | ::arrow::default_memory_pool()); |
111 | builder.AppendValues(values.data(), values.size()); |
112 | return builder.Finish(out); |
113 | } |
114 | |
115 | template <class ArrowType> |
116 | typename std::enable_if< |
117 | is_arrow_string<ArrowType>::value || is_arrow_binary<ArrowType>::value, Status>::type |
118 | NonNullArray(size_t size, std::shared_ptr<Array>* out) { |
119 | using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType; |
120 | BuilderType builder; |
121 | for (size_t i = 0; i < size; i++) { |
122 | RETURN_NOT_OK(builder.Append("test-string" )); |
123 | } |
124 | return builder.Finish(out); |
125 | } |
126 | |
127 | template <typename ArrowType> |
128 | typename std::enable_if<is_arrow_fixed_size_binary<ArrowType>::value, Status>::type |
129 | NonNullArray(size_t size, std::shared_ptr<Array>* out) { |
130 | using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType; |
131 | // set byte_width to the length of "fixed": 5 |
132 | // todo: find a way to generate test data with more diversity. |
133 | BuilderType builder(::arrow::fixed_size_binary(5)); |
134 | for (size_t i = 0; i < size; i++) { |
135 | RETURN_NOT_OK(builder.Append("fixed" )); |
136 | } |
137 | return builder.Finish(out); |
138 | } |
139 | |
140 | static inline void random_decimals(int64_t n, uint32_t seed, int32_t precision, |
141 | uint8_t* out) { |
142 | std::default_random_engine gen(seed); |
143 | std::uniform_int_distribution<uint32_t> d(0, std::numeric_limits<uint8_t>::max()); |
144 | const int32_t required_bytes = ::arrow::DecimalSize(precision); |
145 | constexpr int32_t byte_width = 16; |
146 | std::fill(out, out + byte_width * n, '\0'); |
147 | |
148 | for (int64_t i = 0; i < n; ++i, out += byte_width) { |
149 | std::generate(out, out + required_bytes, |
150 | [&d, &gen] { return static_cast<uint8_t>(d(gen)); }); |
151 | |
152 | // sign extend if the sign bit is set for the last byte generated |
153 | // 0b10000000 == 0x80 == 128 |
154 | if ((out[required_bytes - 1] & '\x80') != 0) { |
155 | std::fill(out + required_bytes, out + byte_width, '\xFF'); |
156 | } |
157 | } |
158 | } |
159 | |
160 | template <typename ArrowType, int32_t precision = ArrowType::precision> |
161 | typename std::enable_if< |
162 | std::is_same<ArrowType, DecimalWithPrecisionAndScale<precision>>::value, Status>::type |
163 | NonNullArray(size_t size, std::shared_ptr<Array>* out) { |
164 | constexpr int32_t kDecimalPrecision = precision; |
165 | constexpr int32_t kDecimalScale = DecimalWithPrecisionAndScale<precision>::scale; |
166 | |
167 | const auto type = ::arrow::decimal(kDecimalPrecision, kDecimalScale); |
168 | ::arrow::Decimal128Builder builder(type); |
169 | const int32_t byte_width = |
170 | static_cast<const ::arrow::Decimal128Type&>(*type).byte_width(); |
171 | |
172 | constexpr int32_t seed = 0; |
173 | |
174 | std::shared_ptr<Buffer> out_buf; |
175 | RETURN_NOT_OK(::arrow::AllocateBuffer(::arrow::default_memory_pool(), size * byte_width, |
176 | &out_buf)); |
177 | random_decimals(size, seed, kDecimalPrecision, out_buf->mutable_data()); |
178 | |
179 | RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size)); |
180 | return builder.Finish(out); |
181 | } |
182 | |
183 | template <class ArrowType> |
184 | typename std::enable_if<is_arrow_bool<ArrowType>::value, Status>::type NonNullArray( |
185 | size_t size, std::shared_ptr<Array>* out) { |
186 | std::vector<uint8_t> values; |
187 | ::arrow::randint(size, 0, 1, &values); |
188 | ::arrow::BooleanBuilder builder; |
189 | RETURN_NOT_OK(builder.AppendValues(values.data(), values.size())); |
190 | return builder.Finish(out); |
191 | } |
192 | |
193 | // This helper function only supports (size/2) nulls. |
194 | template <typename ArrowType> |
195 | typename std::enable_if<is_arrow_float<ArrowType>::value, Status>::type NullableArray( |
196 | size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<Array>* out) { |
197 | using c_type = typename ArrowType::c_type; |
198 | std::vector<c_type> values; |
199 | ::arrow::random_real(size, seed, static_cast<c_type>(-1e10), static_cast<c_type>(1e10), |
200 | &values); |
201 | std::vector<uint8_t> valid_bytes(size, 1); |
202 | |
203 | for (size_t i = 0; i < num_nulls; i++) { |
204 | valid_bytes[i * 2] = 0; |
205 | } |
206 | |
207 | ::arrow::NumericBuilder<ArrowType> builder; |
208 | RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data())); |
209 | return builder.Finish(out); |
210 | } |
211 | |
212 | // This helper function only supports (size/2) nulls. |
213 | template <typename ArrowType> |
214 | typename std::enable_if< |
215 | is_arrow_int<ArrowType>::value && !is_arrow_date<ArrowType>::value, Status>::type |
216 | NullableArray(size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<Array>* out) { |
217 | std::vector<typename ArrowType::c_type> values; |
218 | |
219 | // Seed is random in Arrow right now |
220 | (void)seed; |
221 | ::arrow::randint(size, 0, 64, &values); |
222 | std::vector<uint8_t> valid_bytes(size, 1); |
223 | |
224 | for (size_t i = 0; i < num_nulls; i++) { |
225 | valid_bytes[i * 2] = 0; |
226 | } |
227 | |
228 | // Passing data type so this will work with TimestampType too |
229 | ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(), |
230 | ::arrow::default_memory_pool()); |
231 | RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data())); |
232 | return builder.Finish(out); |
233 | } |
234 | |
235 | template <typename ArrowType> |
236 | typename std::enable_if<is_arrow_date<ArrowType>::value, Status>::type NullableArray( |
237 | size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<Array>* out) { |
238 | std::vector<typename ArrowType::c_type> values; |
239 | |
240 | // Seed is random in Arrow right now |
241 | (void)seed; |
242 | ::arrow::randint(size, 0, 64, &values); |
243 | for (size_t i = 0; i < size; i++) { |
244 | values[i] *= 86400000; |
245 | } |
246 | std::vector<uint8_t> valid_bytes(size, 1); |
247 | |
248 | for (size_t i = 0; i < num_nulls; i++) { |
249 | valid_bytes[i * 2] = 0; |
250 | } |
251 | |
252 | // Passing data type so this will work with TimestampType too |
253 | ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(), |
254 | ::arrow::default_memory_pool()); |
255 | builder.AppendValues(values.data(), values.size(), valid_bytes.data()); |
256 | return builder.Finish(out); |
257 | } |
258 | |
259 | // This helper function only supports (size/2) nulls yet. |
260 | template <typename ArrowType> |
261 | typename std::enable_if< |
262 | is_arrow_string<ArrowType>::value || is_arrow_binary<ArrowType>::value, Status>::type |
263 | NullableArray(size_t size, size_t num_nulls, uint32_t seed, |
264 | std::shared_ptr<::arrow::Array>* out) { |
265 | std::vector<uint8_t> valid_bytes(size, 1); |
266 | |
267 | for (size_t i = 0; i < num_nulls; i++) { |
268 | valid_bytes[i * 2] = 0; |
269 | } |
270 | |
271 | using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType; |
272 | BuilderType builder; |
273 | |
274 | const int kBufferSize = 10; |
275 | uint8_t buffer[kBufferSize]; |
276 | for (size_t i = 0; i < size; i++) { |
277 | if (!valid_bytes[i]) { |
278 | RETURN_NOT_OK(builder.AppendNull()); |
279 | } else { |
280 | ::arrow::random_bytes(kBufferSize, seed + static_cast<uint32_t>(i), buffer); |
281 | RETURN_NOT_OK(builder.Append(buffer, kBufferSize)); |
282 | } |
283 | } |
284 | return builder.Finish(out); |
285 | } |
286 | |
287 | // This helper function only supports (size/2) nulls yet, |
288 | // same as NullableArray<String|Binary>(..) |
289 | template <typename ArrowType> |
290 | typename std::enable_if<is_arrow_fixed_size_binary<ArrowType>::value, Status>::type |
291 | NullableArray(size_t size, size_t num_nulls, uint32_t seed, |
292 | std::shared_ptr<::arrow::Array>* out) { |
293 | std::vector<uint8_t> valid_bytes(size, 1); |
294 | |
295 | for (size_t i = 0; i < num_nulls; i++) { |
296 | valid_bytes[i * 2] = 0; |
297 | } |
298 | |
299 | using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType; |
300 | const int byte_width = 10; |
301 | BuilderType builder(::arrow::fixed_size_binary(byte_width)); |
302 | |
303 | const int kBufferSize = byte_width; |
304 | uint8_t buffer[kBufferSize]; |
305 | for (size_t i = 0; i < size; i++) { |
306 | if (!valid_bytes[i]) { |
307 | RETURN_NOT_OK(builder.AppendNull()); |
308 | } else { |
309 | ::arrow::random_bytes(kBufferSize, seed + static_cast<uint32_t>(i), buffer); |
310 | RETURN_NOT_OK(builder.Append(buffer)); |
311 | } |
312 | } |
313 | return builder.Finish(out); |
314 | } |
315 | |
316 | template <typename ArrowType, int32_t precision = ArrowType::precision> |
317 | typename std::enable_if< |
318 | std::is_same<ArrowType, DecimalWithPrecisionAndScale<precision>>::value, Status>::type |
319 | NullableArray(size_t size, size_t num_nulls, uint32_t seed, |
320 | std::shared_ptr<::arrow::Array>* out) { |
321 | std::vector<uint8_t> valid_bytes(size, '\1'); |
322 | |
323 | for (size_t i = 0; i < num_nulls; ++i) { |
324 | valid_bytes[i * 2] = '\0'; |
325 | } |
326 | |
327 | constexpr int32_t kDecimalPrecision = precision; |
328 | constexpr int32_t kDecimalScale = DecimalWithPrecisionAndScale<precision>::scale; |
329 | const auto type = ::arrow::decimal(kDecimalPrecision, kDecimalScale); |
330 | const int32_t byte_width = |
331 | static_cast<const ::arrow::Decimal128Type&>(*type).byte_width(); |
332 | |
333 | std::shared_ptr<::arrow::Buffer> out_buf; |
334 | RETURN_NOT_OK(::arrow::AllocateBuffer(::arrow::default_memory_pool(), size * byte_width, |
335 | &out_buf)); |
336 | |
337 | random_decimals(size, seed, precision, out_buf->mutable_data()); |
338 | |
339 | ::arrow::Decimal128Builder builder(type); |
340 | RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size, valid_bytes.data())); |
341 | return builder.Finish(out); |
342 | } |
343 | |
344 | // This helper function only supports (size/2) nulls yet. |
345 | template <class ArrowType> |
346 | typename std::enable_if<is_arrow_bool<ArrowType>::value, Status>::type NullableArray( |
347 | size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<Array>* out) { |
348 | std::vector<uint8_t> values; |
349 | |
350 | // Seed is random in Arrow right now |
351 | (void)seed; |
352 | |
353 | ::arrow::randint(size, 0, 1, &values); |
354 | std::vector<uint8_t> valid_bytes(size, 1); |
355 | |
356 | for (size_t i = 0; i < num_nulls; i++) { |
357 | valid_bytes[i * 2] = 0; |
358 | } |
359 | |
360 | ::arrow::BooleanBuilder builder; |
361 | RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data())); |
362 | return builder.Finish(out); |
363 | } |
364 | |
365 | /// Wrap an Array into a ListArray by splitting it up into size lists. |
366 | /// |
367 | /// This helper function only supports (size/2) nulls. |
368 | Status MakeListArray(const std::shared_ptr<Array>& values, int64_t size, |
369 | int64_t null_count, bool nullable_values, |
370 | std::shared_ptr<::arrow::ListArray>* out) { |
371 | // We always include an empty list |
372 | int64_t non_null_entries = size - null_count - 1; |
373 | int64_t length_per_entry = values->length() / non_null_entries; |
374 | |
375 | auto offsets = AllocateBuffer(); |
376 | RETURN_NOT_OK(offsets->Resize((size + 1) * sizeof(int32_t))); |
377 | int32_t* offsets_ptr = reinterpret_cast<int32_t*>(offsets->mutable_data()); |
378 | |
379 | auto null_bitmap = AllocateBuffer(); |
380 | int64_t bitmap_size = ::arrow::BitUtil::BytesForBits(size); |
381 | RETURN_NOT_OK(null_bitmap->Resize(bitmap_size)); |
382 | uint8_t* null_bitmap_ptr = null_bitmap->mutable_data(); |
383 | memset(null_bitmap_ptr, 0, bitmap_size); |
384 | |
385 | int32_t current_offset = 0; |
386 | for (int64_t i = 0; i < size; i++) { |
387 | offsets_ptr[i] = current_offset; |
388 | if (!(((i % 2) == 0) && ((i / 2) < null_count))) { |
389 | // Non-null list (list with index 1 is always empty). |
390 | ::arrow::BitUtil::SetBit(null_bitmap_ptr, i); |
391 | if (i != 1) { |
392 | current_offset += static_cast<int32_t>(length_per_entry); |
393 | } |
394 | } |
395 | } |
396 | offsets_ptr[size] = static_cast<int32_t>(values->length()); |
397 | |
398 | auto value_field = ::arrow::field("item" , values->type(), nullable_values); |
399 | *out = std::make_shared<::arrow::ListArray>(::arrow::list(value_field), size, offsets, |
400 | values, null_bitmap, null_count); |
401 | |
402 | return Status::OK(); |
403 | } |
404 | |
405 | // Make an array containing only empty lists, with a null values array |
406 | Status MakeEmptyListsArray(int64_t size, std::shared_ptr<Array>* out_array) { |
407 | // Allocate an offsets buffer containing only zeroes |
408 | std::shared_ptr<Buffer> offsets_buffer; |
409 | const int64_t offsets_nbytes = (size + 1) * sizeof(int32_t); |
410 | RETURN_NOT_OK(::arrow::AllocateBuffer(::arrow::default_memory_pool(), offsets_nbytes, |
411 | &offsets_buffer)); |
412 | memset(offsets_buffer->mutable_data(), 0, offsets_nbytes); |
413 | |
414 | auto value_field = |
415 | ::arrow::field("item" , ::arrow::float64(), false /* nullable_values */); |
416 | auto list_type = ::arrow::list(value_field); |
417 | |
418 | std::vector<std::shared_ptr<Buffer>> child_buffers = {nullptr /* null bitmap */, |
419 | nullptr /* values */}; |
420 | auto child_data = |
421 | ::arrow::ArrayData::Make(value_field->type(), 0, std::move(child_buffers)); |
422 | |
423 | std::vector<std::shared_ptr<Buffer>> buffers = {nullptr /* bitmap */, offsets_buffer}; |
424 | auto array_data = ::arrow::ArrayData::Make(list_type, size, std::move(buffers)); |
425 | array_data->child_data.push_back(child_data); |
426 | |
427 | *out_array = ::arrow::MakeArray(array_data); |
428 | return Status::OK(); |
429 | } |
430 | |
431 | static inline std::shared_ptr<::arrow::Column> MakeColumn( |
432 | const std::string& name, const std::shared_ptr<Array>& array, bool nullable) { |
433 | auto field = ::arrow::field(name, array->type(), nullable); |
434 | return std::make_shared<::arrow::Column>(field, array); |
435 | } |
436 | |
437 | static inline std::shared_ptr<::arrow::Column> MakeColumn( |
438 | const std::string& name, const std::vector<std::shared_ptr<Array>>& arrays, |
439 | bool nullable) { |
440 | auto field = ::arrow::field(name, arrays[0]->type(), nullable); |
441 | return std::make_shared<::arrow::Column>(field, arrays); |
442 | } |
443 | |
444 | std::shared_ptr<::arrow::Table> MakeSimpleTable(const std::shared_ptr<Array>& values, |
445 | bool nullable) { |
446 | std::shared_ptr<::arrow::Column> column = MakeColumn("col" , values, nullable); |
447 | std::vector<std::shared_ptr<::arrow::Column>> columns({column}); |
448 | std::vector<std::shared_ptr<::arrow::Field>> fields({column->field()}); |
449 | auto schema = std::make_shared<::arrow::Schema>(fields); |
450 | return ::arrow::Table::Make(schema, columns); |
451 | } |
452 | |
453 | template <typename T> |
454 | void ExpectArray(T* expected, Array* result) { |
455 | auto p_array = static_cast<::arrow::PrimitiveArray*>(result); |
456 | for (int i = 0; i < result->length(); i++) { |
457 | EXPECT_EQ(expected[i], reinterpret_cast<const T*>(p_array->values()->data())[i]); |
458 | } |
459 | } |
460 | |
461 | template <typename ArrowType> |
462 | void ExpectArrayT(void* expected, Array* result) { |
463 | ::arrow::PrimitiveArray* p_array = static_cast<::arrow::PrimitiveArray*>(result); |
464 | for (int64_t i = 0; i < result->length(); i++) { |
465 | EXPECT_EQ(reinterpret_cast<typename ArrowType::c_type*>(expected)[i], |
466 | reinterpret_cast<const typename ArrowType::c_type*>( |
467 | p_array->values()->data())[i]); |
468 | } |
469 | } |
470 | |
471 | template <> |
472 | void ExpectArrayT<::arrow::BooleanType>(void* expected, Array* result) { |
473 | ::arrow::BooleanBuilder builder; |
474 | EXPECT_OK(builder.AppendValues(reinterpret_cast<uint8_t*>(expected), result->length())); |
475 | |
476 | std::shared_ptr<Array> expected_array; |
477 | EXPECT_OK(builder.Finish(&expected_array)); |
478 | EXPECT_TRUE(result->Equals(*expected_array)); |
479 | } |
480 | |
481 | } // namespace arrow |
482 | |
483 | } // namespace parquet |
484 | |