1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#pragma once
19
20#include <limits>
21#include <memory>
22#include <random>
23#include <string>
24#include <utility>
25#include <vector>
26
27#include "arrow/api.h"
28#include "arrow/test-util.h"
29#include "arrow/type_traits.h"
30#include "arrow/util/decimal.h"
31
32#include "parquet/arrow/record_reader.h"
33
34namespace parquet {
35
36using internal::RecordReader;
37
38namespace arrow {
39
40using ::arrow::Array;
41using ::arrow::Status;
42
43template <int32_t PRECISION>
44struct DecimalWithPrecisionAndScale {
45 static_assert(PRECISION >= 1 && PRECISION <= 38, "Invalid precision value");
46
47 using type = ::arrow::Decimal128Type;
48 static constexpr ::arrow::Type::type type_id = ::arrow::Decimal128Type::type_id;
49 static constexpr int32_t precision = PRECISION;
50 static constexpr int32_t scale = PRECISION - 1;
51};
52
53template <typename ArrowType>
54using is_arrow_float = std::is_floating_point<typename ArrowType::c_type>;
55
56template <typename ArrowType>
57using is_arrow_int = std::is_integral<typename ArrowType::c_type>;
58
59template <typename ArrowType>
60using is_arrow_date = std::is_same<ArrowType, ::arrow::Date64Type>;
61
62template <typename ArrowType>
63using is_arrow_string = std::is_same<ArrowType, ::arrow::StringType>;
64
65template <typename ArrowType>
66using is_arrow_binary = std::is_same<ArrowType, ::arrow::BinaryType>;
67
68template <typename ArrowType>
69using is_arrow_fixed_size_binary = std::is_same<ArrowType, ::arrow::FixedSizeBinaryType>;
70
71template <typename ArrowType>
72using is_arrow_bool = std::is_same<ArrowType, ::arrow::BooleanType>;
73
74template <class ArrowType>
75typename std::enable_if<is_arrow_float<ArrowType>::value, Status>::type NonNullArray(
76 size_t size, std::shared_ptr<Array>* out) {
77 using c_type = typename ArrowType::c_type;
78 std::vector<c_type> values;
79 ::arrow::random_real(size, 0, static_cast<c_type>(0), static_cast<c_type>(1), &values);
80 ::arrow::NumericBuilder<ArrowType> builder;
81 RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
82 return builder.Finish(out);
83}
84
85template <class ArrowType>
86typename std::enable_if<
87 is_arrow_int<ArrowType>::value && !is_arrow_date<ArrowType>::value, Status>::type
88NonNullArray(size_t size, std::shared_ptr<Array>* out) {
89 std::vector<typename ArrowType::c_type> values;
90 ::arrow::randint(size, 0, 64, &values);
91
92 // Passing data type so this will work with TimestampType too
93 ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
94 ::arrow::default_memory_pool());
95 RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
96 return builder.Finish(out);
97}
98
99template <class ArrowType>
100typename std::enable_if<is_arrow_date<ArrowType>::value, Status>::type NonNullArray(
101 size_t size, std::shared_ptr<Array>* out) {
102 std::vector<typename ArrowType::c_type> values;
103 ::arrow::randint(size, 0, 64, &values);
104 for (size_t i = 0; i < size; i++) {
105 values[i] *= 86400000;
106 }
107
108 // Passing data type so this will work with TimestampType too
109 ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
110 ::arrow::default_memory_pool());
111 builder.AppendValues(values.data(), values.size());
112 return builder.Finish(out);
113}
114
115template <class ArrowType>
116typename std::enable_if<
117 is_arrow_string<ArrowType>::value || is_arrow_binary<ArrowType>::value, Status>::type
118NonNullArray(size_t size, std::shared_ptr<Array>* out) {
119 using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
120 BuilderType builder;
121 for (size_t i = 0; i < size; i++) {
122 RETURN_NOT_OK(builder.Append("test-string"));
123 }
124 return builder.Finish(out);
125}
126
127template <typename ArrowType>
128typename std::enable_if<is_arrow_fixed_size_binary<ArrowType>::value, Status>::type
129NonNullArray(size_t size, std::shared_ptr<Array>* out) {
130 using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
131 // set byte_width to the length of "fixed": 5
132 // todo: find a way to generate test data with more diversity.
133 BuilderType builder(::arrow::fixed_size_binary(5));
134 for (size_t i = 0; i < size; i++) {
135 RETURN_NOT_OK(builder.Append("fixed"));
136 }
137 return builder.Finish(out);
138}
139
140static inline void random_decimals(int64_t n, uint32_t seed, int32_t precision,
141 uint8_t* out) {
142 std::default_random_engine gen(seed);
143 std::uniform_int_distribution<uint32_t> d(0, std::numeric_limits<uint8_t>::max());
144 const int32_t required_bytes = ::arrow::DecimalSize(precision);
145 constexpr int32_t byte_width = 16;
146 std::fill(out, out + byte_width * n, '\0');
147
148 for (int64_t i = 0; i < n; ++i, out += byte_width) {
149 std::generate(out, out + required_bytes,
150 [&d, &gen] { return static_cast<uint8_t>(d(gen)); });
151
152 // sign extend if the sign bit is set for the last byte generated
153 // 0b10000000 == 0x80 == 128
154 if ((out[required_bytes - 1] & '\x80') != 0) {
155 std::fill(out + required_bytes, out + byte_width, '\xFF');
156 }
157 }
158}
159
160template <typename ArrowType, int32_t precision = ArrowType::precision>
161typename std::enable_if<
162 std::is_same<ArrowType, DecimalWithPrecisionAndScale<precision>>::value, Status>::type
163NonNullArray(size_t size, std::shared_ptr<Array>* out) {
164 constexpr int32_t kDecimalPrecision = precision;
165 constexpr int32_t kDecimalScale = DecimalWithPrecisionAndScale<precision>::scale;
166
167 const auto type = ::arrow::decimal(kDecimalPrecision, kDecimalScale);
168 ::arrow::Decimal128Builder builder(type);
169 const int32_t byte_width =
170 static_cast<const ::arrow::Decimal128Type&>(*type).byte_width();
171
172 constexpr int32_t seed = 0;
173
174 std::shared_ptr<Buffer> out_buf;
175 RETURN_NOT_OK(::arrow::AllocateBuffer(::arrow::default_memory_pool(), size * byte_width,
176 &out_buf));
177 random_decimals(size, seed, kDecimalPrecision, out_buf->mutable_data());
178
179 RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size));
180 return builder.Finish(out);
181}
182
183template <class ArrowType>
184typename std::enable_if<is_arrow_bool<ArrowType>::value, Status>::type NonNullArray(
185 size_t size, std::shared_ptr<Array>* out) {
186 std::vector<uint8_t> values;
187 ::arrow::randint(size, 0, 1, &values);
188 ::arrow::BooleanBuilder builder;
189 RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
190 return builder.Finish(out);
191}
192
193// This helper function only supports (size/2) nulls.
194template <typename ArrowType>
195typename std::enable_if<is_arrow_float<ArrowType>::value, Status>::type NullableArray(
196 size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<Array>* out) {
197 using c_type = typename ArrowType::c_type;
198 std::vector<c_type> values;
199 ::arrow::random_real(size, seed, static_cast<c_type>(-1e10), static_cast<c_type>(1e10),
200 &values);
201 std::vector<uint8_t> valid_bytes(size, 1);
202
203 for (size_t i = 0; i < num_nulls; i++) {
204 valid_bytes[i * 2] = 0;
205 }
206
207 ::arrow::NumericBuilder<ArrowType> builder;
208 RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
209 return builder.Finish(out);
210}
211
212// This helper function only supports (size/2) nulls.
213template <typename ArrowType>
214typename std::enable_if<
215 is_arrow_int<ArrowType>::value && !is_arrow_date<ArrowType>::value, Status>::type
216NullableArray(size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<Array>* out) {
217 std::vector<typename ArrowType::c_type> values;
218
219 // Seed is random in Arrow right now
220 (void)seed;
221 ::arrow::randint(size, 0, 64, &values);
222 std::vector<uint8_t> valid_bytes(size, 1);
223
224 for (size_t i = 0; i < num_nulls; i++) {
225 valid_bytes[i * 2] = 0;
226 }
227
228 // Passing data type so this will work with TimestampType too
229 ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
230 ::arrow::default_memory_pool());
231 RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
232 return builder.Finish(out);
233}
234
235template <typename ArrowType>
236typename std::enable_if<is_arrow_date<ArrowType>::value, Status>::type NullableArray(
237 size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<Array>* out) {
238 std::vector<typename ArrowType::c_type> values;
239
240 // Seed is random in Arrow right now
241 (void)seed;
242 ::arrow::randint(size, 0, 64, &values);
243 for (size_t i = 0; i < size; i++) {
244 values[i] *= 86400000;
245 }
246 std::vector<uint8_t> valid_bytes(size, 1);
247
248 for (size_t i = 0; i < num_nulls; i++) {
249 valid_bytes[i * 2] = 0;
250 }
251
252 // Passing data type so this will work with TimestampType too
253 ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
254 ::arrow::default_memory_pool());
255 builder.AppendValues(values.data(), values.size(), valid_bytes.data());
256 return builder.Finish(out);
257}
258
259// This helper function only supports (size/2) nulls yet.
260template <typename ArrowType>
261typename std::enable_if<
262 is_arrow_string<ArrowType>::value || is_arrow_binary<ArrowType>::value, Status>::type
263NullableArray(size_t size, size_t num_nulls, uint32_t seed,
264 std::shared_ptr<::arrow::Array>* out) {
265 std::vector<uint8_t> valid_bytes(size, 1);
266
267 for (size_t i = 0; i < num_nulls; i++) {
268 valid_bytes[i * 2] = 0;
269 }
270
271 using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
272 BuilderType builder;
273
274 const int kBufferSize = 10;
275 uint8_t buffer[kBufferSize];
276 for (size_t i = 0; i < size; i++) {
277 if (!valid_bytes[i]) {
278 RETURN_NOT_OK(builder.AppendNull());
279 } else {
280 ::arrow::random_bytes(kBufferSize, seed + static_cast<uint32_t>(i), buffer);
281 RETURN_NOT_OK(builder.Append(buffer, kBufferSize));
282 }
283 }
284 return builder.Finish(out);
285}
286
287// This helper function only supports (size/2) nulls yet,
288// same as NullableArray<String|Binary>(..)
289template <typename ArrowType>
290typename std::enable_if<is_arrow_fixed_size_binary<ArrowType>::value, Status>::type
291NullableArray(size_t size, size_t num_nulls, uint32_t seed,
292 std::shared_ptr<::arrow::Array>* out) {
293 std::vector<uint8_t> valid_bytes(size, 1);
294
295 for (size_t i = 0; i < num_nulls; i++) {
296 valid_bytes[i * 2] = 0;
297 }
298
299 using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
300 const int byte_width = 10;
301 BuilderType builder(::arrow::fixed_size_binary(byte_width));
302
303 const int kBufferSize = byte_width;
304 uint8_t buffer[kBufferSize];
305 for (size_t i = 0; i < size; i++) {
306 if (!valid_bytes[i]) {
307 RETURN_NOT_OK(builder.AppendNull());
308 } else {
309 ::arrow::random_bytes(kBufferSize, seed + static_cast<uint32_t>(i), buffer);
310 RETURN_NOT_OK(builder.Append(buffer));
311 }
312 }
313 return builder.Finish(out);
314}
315
316template <typename ArrowType, int32_t precision = ArrowType::precision>
317typename std::enable_if<
318 std::is_same<ArrowType, DecimalWithPrecisionAndScale<precision>>::value, Status>::type
319NullableArray(size_t size, size_t num_nulls, uint32_t seed,
320 std::shared_ptr<::arrow::Array>* out) {
321 std::vector<uint8_t> valid_bytes(size, '\1');
322
323 for (size_t i = 0; i < num_nulls; ++i) {
324 valid_bytes[i * 2] = '\0';
325 }
326
327 constexpr int32_t kDecimalPrecision = precision;
328 constexpr int32_t kDecimalScale = DecimalWithPrecisionAndScale<precision>::scale;
329 const auto type = ::arrow::decimal(kDecimalPrecision, kDecimalScale);
330 const int32_t byte_width =
331 static_cast<const ::arrow::Decimal128Type&>(*type).byte_width();
332
333 std::shared_ptr<::arrow::Buffer> out_buf;
334 RETURN_NOT_OK(::arrow::AllocateBuffer(::arrow::default_memory_pool(), size * byte_width,
335 &out_buf));
336
337 random_decimals(size, seed, precision, out_buf->mutable_data());
338
339 ::arrow::Decimal128Builder builder(type);
340 RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size, valid_bytes.data()));
341 return builder.Finish(out);
342}
343
344// This helper function only supports (size/2) nulls yet.
345template <class ArrowType>
346typename std::enable_if<is_arrow_bool<ArrowType>::value, Status>::type NullableArray(
347 size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<Array>* out) {
348 std::vector<uint8_t> values;
349
350 // Seed is random in Arrow right now
351 (void)seed;
352
353 ::arrow::randint(size, 0, 1, &values);
354 std::vector<uint8_t> valid_bytes(size, 1);
355
356 for (size_t i = 0; i < num_nulls; i++) {
357 valid_bytes[i * 2] = 0;
358 }
359
360 ::arrow::BooleanBuilder builder;
361 RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
362 return builder.Finish(out);
363}
364
365/// Wrap an Array into a ListArray by splitting it up into size lists.
366///
367/// This helper function only supports (size/2) nulls.
368Status MakeListArray(const std::shared_ptr<Array>& values, int64_t size,
369 int64_t null_count, bool nullable_values,
370 std::shared_ptr<::arrow::ListArray>* out) {
371 // We always include an empty list
372 int64_t non_null_entries = size - null_count - 1;
373 int64_t length_per_entry = values->length() / non_null_entries;
374
375 auto offsets = AllocateBuffer();
376 RETURN_NOT_OK(offsets->Resize((size + 1) * sizeof(int32_t)));
377 int32_t* offsets_ptr = reinterpret_cast<int32_t*>(offsets->mutable_data());
378
379 auto null_bitmap = AllocateBuffer();
380 int64_t bitmap_size = ::arrow::BitUtil::BytesForBits(size);
381 RETURN_NOT_OK(null_bitmap->Resize(bitmap_size));
382 uint8_t* null_bitmap_ptr = null_bitmap->mutable_data();
383 memset(null_bitmap_ptr, 0, bitmap_size);
384
385 int32_t current_offset = 0;
386 for (int64_t i = 0; i < size; i++) {
387 offsets_ptr[i] = current_offset;
388 if (!(((i % 2) == 0) && ((i / 2) < null_count))) {
389 // Non-null list (list with index 1 is always empty).
390 ::arrow::BitUtil::SetBit(null_bitmap_ptr, i);
391 if (i != 1) {
392 current_offset += static_cast<int32_t>(length_per_entry);
393 }
394 }
395 }
396 offsets_ptr[size] = static_cast<int32_t>(values->length());
397
398 auto value_field = ::arrow::field("item", values->type(), nullable_values);
399 *out = std::make_shared<::arrow::ListArray>(::arrow::list(value_field), size, offsets,
400 values, null_bitmap, null_count);
401
402 return Status::OK();
403}
404
405// Make an array containing only empty lists, with a null values array
406Status MakeEmptyListsArray(int64_t size, std::shared_ptr<Array>* out_array) {
407 // Allocate an offsets buffer containing only zeroes
408 std::shared_ptr<Buffer> offsets_buffer;
409 const int64_t offsets_nbytes = (size + 1) * sizeof(int32_t);
410 RETURN_NOT_OK(::arrow::AllocateBuffer(::arrow::default_memory_pool(), offsets_nbytes,
411 &offsets_buffer));
412 memset(offsets_buffer->mutable_data(), 0, offsets_nbytes);
413
414 auto value_field =
415 ::arrow::field("item", ::arrow::float64(), false /* nullable_values */);
416 auto list_type = ::arrow::list(value_field);
417
418 std::vector<std::shared_ptr<Buffer>> child_buffers = {nullptr /* null bitmap */,
419 nullptr /* values */};
420 auto child_data =
421 ::arrow::ArrayData::Make(value_field->type(), 0, std::move(child_buffers));
422
423 std::vector<std::shared_ptr<Buffer>> buffers = {nullptr /* bitmap */, offsets_buffer};
424 auto array_data = ::arrow::ArrayData::Make(list_type, size, std::move(buffers));
425 array_data->child_data.push_back(child_data);
426
427 *out_array = ::arrow::MakeArray(array_data);
428 return Status::OK();
429}
430
431static inline std::shared_ptr<::arrow::Column> MakeColumn(
432 const std::string& name, const std::shared_ptr<Array>& array, bool nullable) {
433 auto field = ::arrow::field(name, array->type(), nullable);
434 return std::make_shared<::arrow::Column>(field, array);
435}
436
437static inline std::shared_ptr<::arrow::Column> MakeColumn(
438 const std::string& name, const std::vector<std::shared_ptr<Array>>& arrays,
439 bool nullable) {
440 auto field = ::arrow::field(name, arrays[0]->type(), nullable);
441 return std::make_shared<::arrow::Column>(field, arrays);
442}
443
444std::shared_ptr<::arrow::Table> MakeSimpleTable(const std::shared_ptr<Array>& values,
445 bool nullable) {
446 std::shared_ptr<::arrow::Column> column = MakeColumn("col", values, nullable);
447 std::vector<std::shared_ptr<::arrow::Column>> columns({column});
448 std::vector<std::shared_ptr<::arrow::Field>> fields({column->field()});
449 auto schema = std::make_shared<::arrow::Schema>(fields);
450 return ::arrow::Table::Make(schema, columns);
451}
452
453template <typename T>
454void ExpectArray(T* expected, Array* result) {
455 auto p_array = static_cast<::arrow::PrimitiveArray*>(result);
456 for (int i = 0; i < result->length(); i++) {
457 EXPECT_EQ(expected[i], reinterpret_cast<const T*>(p_array->values()->data())[i]);
458 }
459}
460
461template <typename ArrowType>
462void ExpectArrayT(void* expected, Array* result) {
463 ::arrow::PrimitiveArray* p_array = static_cast<::arrow::PrimitiveArray*>(result);
464 for (int64_t i = 0; i < result->length(); i++) {
465 EXPECT_EQ(reinterpret_cast<typename ArrowType::c_type*>(expected)[i],
466 reinterpret_cast<const typename ArrowType::c_type*>(
467 p_array->values()->data())[i]);
468 }
469}
470
471template <>
472void ExpectArrayT<::arrow::BooleanType>(void* expected, Array* result) {
473 ::arrow::BooleanBuilder builder;
474 EXPECT_OK(builder.AppendValues(reinterpret_cast<uint8_t*>(expected), result->length()));
475
476 std::shared_ptr<Array> expected_array;
477 EXPECT_OK(builder.Finish(&expected_array));
478 EXPECT_TRUE(result->Equals(*expected_array));
479}
480
481} // namespace arrow
482
483} // namespace parquet
484