1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include <memory>
19#include <vector>
20
21#include "gtest/gtest.h"
22
23#include "parquet/arrow/schema.h"
24
25#include "arrow/api.h"
26#include "arrow/test-util.h"
27
28using arrow::ArrayFromVector;
29using arrow::Field;
30using arrow::TimeUnit;
31
32using ParquetType = parquet::Type;
33using parquet::LogicalType;
34using parquet::Repetition;
35using parquet::schema::GroupNode;
36using parquet::schema::NodePtr;
37using parquet::schema::PrimitiveNode;
38
39namespace parquet {
40
41namespace arrow {
42
43const auto BOOL = ::arrow::boolean();
44const auto UINT8 = ::arrow::uint8();
45const auto INT32 = ::arrow::int32();
46const auto INT64 = ::arrow::int64();
47const auto FLOAT = ::arrow::float32();
48const auto DOUBLE = ::arrow::float64();
49const auto UTF8 = ::arrow::utf8();
50const auto TIMESTAMP_MS = ::arrow::timestamp(TimeUnit::MILLI);
51const auto TIMESTAMP_US = ::arrow::timestamp(TimeUnit::MICRO);
52const auto TIMESTAMP_NS = ::arrow::timestamp(TimeUnit::NANO);
53const auto BINARY = ::arrow::binary();
54const auto DECIMAL_8_4 = std::make_shared<::arrow::Decimal128Type>(8, 4);
55
56class TestConvertParquetSchema : public ::testing::Test {
57 public:
58 virtual void SetUp() {}
59
60 void CheckFlatSchema(const std::shared_ptr<::arrow::Schema>& expected_schema) {
61 ASSERT_EQ(expected_schema->num_fields(), result_schema_->num_fields());
62 for (int i = 0; i < expected_schema->num_fields(); ++i) {
63 auto lhs = result_schema_->field(i);
64 auto rhs = expected_schema->field(i);
65 EXPECT_TRUE(lhs->Equals(rhs))
66 << i << " " << lhs->ToString() << " != " << rhs->ToString();
67 }
68 }
69
70 ::arrow::Status ConvertSchema(const std::vector<NodePtr>& nodes) {
71 NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes);
72 descr_.Init(schema);
73 return FromParquetSchema(&descr_, &result_schema_);
74 }
75
76 ::arrow::Status ConvertSchema(const std::vector<NodePtr>& nodes,
77 const std::vector<int>& column_indices) {
78 NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes);
79 descr_.Init(schema);
80 return FromParquetSchema(&descr_, column_indices, &result_schema_);
81 }
82
83 ::arrow::Status ConvertSchema(
84 const std::vector<NodePtr>& nodes,
85 const std::shared_ptr<const KeyValueMetadata>& key_value_metadata) {
86 NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes);
87 descr_.Init(schema);
88 return FromParquetSchema(&descr_, {}, key_value_metadata, &result_schema_);
89 }
90
91 protected:
92 SchemaDescriptor descr_;
93 std::shared_ptr<::arrow::Schema> result_schema_;
94};
95
96TEST_F(TestConvertParquetSchema, ParquetFlatPrimitives) {
97 std::vector<NodePtr> parquet_fields;
98 std::vector<std::shared_ptr<Field>> arrow_fields;
99
100 parquet_fields.push_back(
101 PrimitiveNode::Make("boolean", Repetition::REQUIRED, ParquetType::BOOLEAN));
102 arrow_fields.push_back(std::make_shared<Field>("boolean", BOOL, false));
103
104 parquet_fields.push_back(
105 PrimitiveNode::Make("int32", Repetition::REQUIRED, ParquetType::INT32));
106 arrow_fields.push_back(std::make_shared<Field>("int32", INT32, false));
107
108 parquet_fields.push_back(
109 PrimitiveNode::Make("int64", Repetition::REQUIRED, ParquetType::INT64));
110 arrow_fields.push_back(std::make_shared<Field>("int64", INT64, false));
111
112 parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED,
113 ParquetType::INT64,
114 LogicalType::TIMESTAMP_MILLIS));
115 arrow_fields.push_back(std::make_shared<Field>("timestamp", TIMESTAMP_MS, false));
116
117 parquet_fields.push_back(PrimitiveNode::Make("timestamp[us]", Repetition::REQUIRED,
118 ParquetType::INT64,
119 LogicalType::TIMESTAMP_MICROS));
120 arrow_fields.push_back(std::make_shared<Field>("timestamp[us]", TIMESTAMP_US, false));
121
122 parquet_fields.push_back(PrimitiveNode::Make("date", Repetition::REQUIRED,
123 ParquetType::INT32, LogicalType::DATE));
124 arrow_fields.push_back(std::make_shared<Field>("date", ::arrow::date32(), false));
125
126 parquet_fields.push_back(PrimitiveNode::Make(
127 "time32", Repetition::REQUIRED, ParquetType::INT32, LogicalType::TIME_MILLIS));
128 arrow_fields.push_back(
129 std::make_shared<Field>("time32", ::arrow::time32(TimeUnit::MILLI), false));
130
131 parquet_fields.push_back(PrimitiveNode::Make(
132 "time64", Repetition::REQUIRED, ParquetType::INT64, LogicalType::TIME_MICROS));
133 arrow_fields.push_back(
134 std::make_shared<Field>("time64", ::arrow::time64(TimeUnit::MICRO), false));
135
136 parquet_fields.push_back(
137 PrimitiveNode::Make("timestamp96", Repetition::REQUIRED, ParquetType::INT96));
138 arrow_fields.push_back(std::make_shared<Field>("timestamp96", TIMESTAMP_NS, false));
139
140 parquet_fields.push_back(
141 PrimitiveNode::Make("float", Repetition::OPTIONAL, ParquetType::FLOAT));
142 arrow_fields.push_back(std::make_shared<Field>("float", FLOAT));
143
144 parquet_fields.push_back(
145 PrimitiveNode::Make("double", Repetition::OPTIONAL, ParquetType::DOUBLE));
146 arrow_fields.push_back(std::make_shared<Field>("double", DOUBLE));
147
148 parquet_fields.push_back(
149 PrimitiveNode::Make("binary", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY));
150 arrow_fields.push_back(std::make_shared<Field>("binary", BINARY));
151
152 parquet_fields.push_back(PrimitiveNode::Make(
153 "string", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, LogicalType::UTF8));
154 arrow_fields.push_back(std::make_shared<Field>("string", UTF8));
155
156 parquet_fields.push_back(PrimitiveNode::Make("flba-binary", Repetition::OPTIONAL,
157 ParquetType::FIXED_LEN_BYTE_ARRAY,
158 LogicalType::NONE, 12));
159 arrow_fields.push_back(
160 std::make_shared<Field>("flba-binary", ::arrow::fixed_size_binary(12)));
161
162 auto arrow_schema = std::make_shared<::arrow::Schema>(arrow_fields);
163 ASSERT_OK(ConvertSchema(parquet_fields));
164
165 ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
166}
167
168TEST_F(TestConvertParquetSchema, DuplicateFieldNames) {
169 std::vector<NodePtr> parquet_fields;
170 std::vector<std::shared_ptr<Field>> arrow_fields;
171
172 parquet_fields.push_back(
173 PrimitiveNode::Make("xxx", Repetition::REQUIRED, ParquetType::BOOLEAN));
174 auto arrow_field1 = std::make_shared<Field>("xxx", BOOL, false);
175
176 parquet_fields.push_back(
177 PrimitiveNode::Make("xxx", Repetition::REQUIRED, ParquetType::INT32));
178 auto arrow_field2 = std::make_shared<Field>("xxx", INT32, false);
179
180 ASSERT_OK(ConvertSchema(parquet_fields));
181 arrow_fields = {arrow_field1, arrow_field2};
182 ASSERT_NO_FATAL_FAILURE(
183 CheckFlatSchema(std::make_shared<::arrow::Schema>(arrow_fields)));
184
185 ASSERT_OK(ConvertSchema(parquet_fields, std::vector<int>({0, 1})));
186 arrow_fields = {arrow_field1, arrow_field2};
187 ASSERT_NO_FATAL_FAILURE(
188 CheckFlatSchema(std::make_shared<::arrow::Schema>(arrow_fields)));
189
190 ASSERT_OK(ConvertSchema(parquet_fields, std::vector<int>({1, 0})));
191 arrow_fields = {arrow_field2, arrow_field1};
192 ASSERT_NO_FATAL_FAILURE(
193 CheckFlatSchema(std::make_shared<::arrow::Schema>(arrow_fields)));
194}
195
196TEST_F(TestConvertParquetSchema, ParquetKeyValueMetadata) {
197 std::vector<NodePtr> parquet_fields;
198 std::vector<std::shared_ptr<Field>> arrow_fields;
199
200 parquet_fields.push_back(
201 PrimitiveNode::Make("boolean", Repetition::REQUIRED, ParquetType::BOOLEAN));
202 arrow_fields.push_back(std::make_shared<Field>("boolean", BOOL, false));
203
204 parquet_fields.push_back(
205 PrimitiveNode::Make("int32", Repetition::REQUIRED, ParquetType::INT32));
206 arrow_fields.push_back(std::make_shared<Field>("int32", INT32, false));
207
208 auto key_value_metadata = std::make_shared<KeyValueMetadata>();
209 key_value_metadata->Append("foo", "bar");
210 key_value_metadata->Append("biz", "baz");
211 ASSERT_OK(ConvertSchema(parquet_fields, key_value_metadata));
212
213 auto arrow_metadata = result_schema_->metadata();
214 ASSERT_EQ("foo", arrow_metadata->key(0));
215 ASSERT_EQ("bar", arrow_metadata->value(0));
216 ASSERT_EQ("biz", arrow_metadata->key(1));
217 ASSERT_EQ("baz", arrow_metadata->value(1));
218}
219
220TEST_F(TestConvertParquetSchema, ParquetEmptyKeyValueMetadata) {
221 std::vector<NodePtr> parquet_fields;
222 std::vector<std::shared_ptr<Field>> arrow_fields;
223
224 parquet_fields.push_back(
225 PrimitiveNode::Make("int32", Repetition::REQUIRED, ParquetType::INT32));
226 arrow_fields.push_back(std::make_shared<Field>("int32", INT32, false));
227
228 std::shared_ptr<KeyValueMetadata> key_value_metadata = nullptr;
229 ASSERT_OK(ConvertSchema(parquet_fields, key_value_metadata));
230
231 auto arrow_metadata = result_schema_->metadata();
232 ASSERT_EQ(arrow_metadata, nullptr);
233}
234
235TEST_F(TestConvertParquetSchema, ParquetFlatDecimals) {
236 std::vector<NodePtr> parquet_fields;
237 std::vector<std::shared_ptr<Field>> arrow_fields;
238
239 parquet_fields.push_back(PrimitiveNode::Make("flba-decimal", Repetition::OPTIONAL,
240 ParquetType::FIXED_LEN_BYTE_ARRAY,
241 LogicalType::DECIMAL, 4, 8, 4));
242 arrow_fields.push_back(std::make_shared<Field>("flba-decimal", DECIMAL_8_4));
243
244 parquet_fields.push_back(PrimitiveNode::Make("binary-decimal", Repetition::OPTIONAL,
245 ParquetType::BYTE_ARRAY,
246 LogicalType::DECIMAL, -1, 8, 4));
247 arrow_fields.push_back(std::make_shared<Field>("binary-decimal", DECIMAL_8_4));
248
249 parquet_fields.push_back(PrimitiveNode::Make("int32-decimal", Repetition::OPTIONAL,
250 ParquetType::INT32, LogicalType::DECIMAL,
251 -1, 8, 4));
252 arrow_fields.push_back(std::make_shared<Field>("int32-decimal", DECIMAL_8_4));
253
254 parquet_fields.push_back(PrimitiveNode::Make("int64-decimal", Repetition::OPTIONAL,
255 ParquetType::INT64, LogicalType::DECIMAL,
256 -1, 8, 4));
257 arrow_fields.push_back(std::make_shared<Field>("int64-decimal", DECIMAL_8_4));
258
259 auto arrow_schema = std::make_shared<::arrow::Schema>(arrow_fields);
260 ASSERT_OK(ConvertSchema(parquet_fields));
261
262 ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
263}
264
265TEST_F(TestConvertParquetSchema, ParquetLists) {
266 std::vector<NodePtr> parquet_fields;
267 std::vector<std::shared_ptr<Field>> arrow_fields;
268
269 // LIST encoding example taken from parquet-format/LogicalTypes.md
270
271 // // List<String> (list non-null, elements nullable)
272 // required group my_list (LIST) {
273 // repeated group list {
274 // optional binary element (UTF8);
275 // }
276 // }
277 {
278 auto element = PrimitiveNode::Make("string", Repetition::OPTIONAL,
279 ParquetType::BYTE_ARRAY, LogicalType::UTF8);
280 auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
281 parquet_fields.push_back(
282 GroupNode::Make("my_list", Repetition::REQUIRED, {list}, LogicalType::LIST));
283 auto arrow_element = std::make_shared<Field>("string", UTF8, true);
284 auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element);
285 arrow_fields.push_back(std::make_shared<Field>("my_list", arrow_list, false));
286 }
287
288 // // List<String> (list nullable, elements non-null)
289 // optional group my_list (LIST) {
290 // repeated group list {
291 // required binary element (UTF8);
292 // }
293 // }
294 {
295 auto element = PrimitiveNode::Make("string", Repetition::REQUIRED,
296 ParquetType::BYTE_ARRAY, LogicalType::UTF8);
297 auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
298 parquet_fields.push_back(
299 GroupNode::Make("my_list", Repetition::OPTIONAL, {list}, LogicalType::LIST));
300 auto arrow_element = std::make_shared<Field>("string", UTF8, false);
301 auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element);
302 arrow_fields.push_back(std::make_shared<Field>("my_list", arrow_list, true));
303 }
304
305 // Element types can be nested structures. For example, a list of lists:
306 //
307 // // List<List<Integer>>
308 // optional group array_of_arrays (LIST) {
309 // repeated group list {
310 // required group element (LIST) {
311 // repeated group list {
312 // required int32 element;
313 // }
314 // }
315 // }
316 // }
317 {
318 auto inner_element =
319 PrimitiveNode::Make("int32", Repetition::REQUIRED, ParquetType::INT32);
320 auto inner_list = GroupNode::Make("list", Repetition::REPEATED, {inner_element});
321 auto element =
322 GroupNode::Make("element", Repetition::REQUIRED, {inner_list}, LogicalType::LIST);
323 auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
324 parquet_fields.push_back(GroupNode::Make("array_of_arrays", Repetition::OPTIONAL,
325 {list}, LogicalType::LIST));
326 auto arrow_inner_element = std::make_shared<Field>("int32", INT32, false);
327 auto arrow_inner_list = std::make_shared<::arrow::ListType>(arrow_inner_element);
328 auto arrow_element = std::make_shared<Field>("element", arrow_inner_list, false);
329 auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element);
330 arrow_fields.push_back(std::make_shared<Field>("array_of_arrays", arrow_list, true));
331 }
332
333 // // List<String> (list nullable, elements non-null)
334 // optional group my_list (LIST) {
335 // repeated group element {
336 // required binary str (UTF8);
337 // };
338 // }
339 {
340 auto element = PrimitiveNode::Make("str", Repetition::REQUIRED,
341 ParquetType::BYTE_ARRAY, LogicalType::UTF8);
342 auto list = GroupNode::Make("element", Repetition::REPEATED, {element});
343 parquet_fields.push_back(
344 GroupNode::Make("my_list", Repetition::OPTIONAL, {list}, LogicalType::LIST));
345 auto arrow_element = std::make_shared<Field>("str", UTF8, false);
346 auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element);
347 arrow_fields.push_back(std::make_shared<Field>("my_list", arrow_list, true));
348 }
349
350 // // List<Integer> (nullable list, non-null elements)
351 // optional group my_list (LIST) {
352 // repeated int32 element;
353 // }
354 {
355 auto element =
356 PrimitiveNode::Make("element", Repetition::REPEATED, ParquetType::INT32);
357 parquet_fields.push_back(
358 GroupNode::Make("my_list", Repetition::OPTIONAL, {element}, LogicalType::LIST));
359 auto arrow_element = std::make_shared<Field>("element", INT32, false);
360 auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element);
361 arrow_fields.push_back(std::make_shared<Field>("my_list", arrow_list, true));
362 }
363
364 // // List<Tuple<String, Integer>> (nullable list, non-null elements)
365 // optional group my_list (LIST) {
366 // repeated group element {
367 // required binary str (UTF8);
368 // required int32 num;
369 // };
370 // }
371 {
372 auto str_element = PrimitiveNode::Make("str", Repetition::REQUIRED,
373 ParquetType::BYTE_ARRAY, LogicalType::UTF8);
374 auto num_element =
375 PrimitiveNode::Make("num", Repetition::REQUIRED, ParquetType::INT32);
376 auto element =
377 GroupNode::Make("element", Repetition::REPEATED, {str_element, num_element});
378 parquet_fields.push_back(
379 GroupNode::Make("my_list", Repetition::OPTIONAL, {element}, LogicalType::LIST));
380 auto arrow_str = std::make_shared<Field>("str", UTF8, false);
381 auto arrow_num = std::make_shared<Field>("num", INT32, false);
382 std::vector<std::shared_ptr<Field>> fields({arrow_str, arrow_num});
383 auto arrow_struct = std::make_shared<::arrow::StructType>(fields);
384 auto arrow_element = std::make_shared<Field>("element", arrow_struct, false);
385 auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element);
386 arrow_fields.push_back(std::make_shared<Field>("my_list", arrow_list, true));
387 }
388
389 // // List<OneTuple<String>> (nullable list, non-null elements)
390 // optional group my_list (LIST) {
391 // repeated group array {
392 // required binary str (UTF8);
393 // };
394 // }
395 // Special case: group is named array
396 {
397 auto element = PrimitiveNode::Make("str", Repetition::REQUIRED,
398 ParquetType::BYTE_ARRAY, LogicalType::UTF8);
399 auto array = GroupNode::Make("array", Repetition::REPEATED, {element});
400 parquet_fields.push_back(
401 GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, LogicalType::LIST));
402 auto arrow_str = std::make_shared<Field>("str", UTF8, false);
403 std::vector<std::shared_ptr<Field>> fields({arrow_str});
404 auto arrow_struct = std::make_shared<::arrow::StructType>(fields);
405 auto arrow_element = std::make_shared<Field>("array", arrow_struct, false);
406 auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element);
407 arrow_fields.push_back(std::make_shared<Field>("my_list", arrow_list, true));
408 }
409
410 // // List<OneTuple<String>> (nullable list, non-null elements)
411 // optional group my_list (LIST) {
412 // repeated group my_list_tuple {
413 // required binary str (UTF8);
414 // };
415 // }
416 // Special case: group named ends in _tuple
417 {
418 auto element = PrimitiveNode::Make("str", Repetition::REQUIRED,
419 ParquetType::BYTE_ARRAY, LogicalType::UTF8);
420 auto array = GroupNode::Make("my_list_tuple", Repetition::REPEATED, {element});
421 parquet_fields.push_back(
422 GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, LogicalType::LIST));
423 auto arrow_str = std::make_shared<Field>("str", UTF8, false);
424 std::vector<std::shared_ptr<Field>> fields({arrow_str});
425 auto arrow_struct = std::make_shared<::arrow::StructType>(fields);
426 auto arrow_element = std::make_shared<Field>("my_list_tuple", arrow_struct, false);
427 auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element);
428 arrow_fields.push_back(std::make_shared<Field>("my_list", arrow_list, true));
429 }
430
431 // One-level encoding: Only allows required lists with required cells
432 // repeated value_type name
433 {
434 parquet_fields.push_back(
435 PrimitiveNode::Make("name", Repetition::REPEATED, ParquetType::INT32));
436 auto arrow_element = std::make_shared<Field>("name", INT32, false);
437 auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element);
438 arrow_fields.push_back(std::make_shared<Field>("name", arrow_list, false));
439 }
440
441 auto arrow_schema = std::make_shared<::arrow::Schema>(arrow_fields);
442 ASSERT_OK(ConvertSchema(parquet_fields));
443
444 ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
445}
446
447TEST_F(TestConvertParquetSchema, UnsupportedThings) {
448 std::vector<NodePtr> unsupported_nodes;
449
450 for (const NodePtr& node : unsupported_nodes) {
451 ASSERT_RAISES(NotImplemented, ConvertSchema({node}));
452 }
453}
454
455TEST_F(TestConvertParquetSchema, ParquetNestedSchema) {
456 std::vector<NodePtr> parquet_fields;
457 std::vector<std::shared_ptr<Field>> arrow_fields;
458
459 // required group group1 {
460 // required bool leaf1;
461 // required int32 leaf2;
462 // }
463 // required int64 leaf3;
464 {
465 parquet_fields.push_back(GroupNode::Make(
466 "group1", Repetition::REQUIRED,
467 {PrimitiveNode::Make("leaf1", Repetition::REQUIRED, ParquetType::BOOLEAN),
468 PrimitiveNode::Make("leaf2", Repetition::REQUIRED, ParquetType::INT32)}));
469 parquet_fields.push_back(
470 PrimitiveNode::Make("leaf3", Repetition::REQUIRED, ParquetType::INT64));
471
472 auto group1_fields = {std::make_shared<Field>("leaf1", BOOL, false),
473 std::make_shared<Field>("leaf2", INT32, false)};
474 auto arrow_group1_type = std::make_shared<::arrow::StructType>(group1_fields);
475 arrow_fields.push_back(std::make_shared<Field>("group1", arrow_group1_type, false));
476 arrow_fields.push_back(std::make_shared<Field>("leaf3", INT64, false));
477 }
478
479 auto arrow_schema = std::make_shared<::arrow::Schema>(arrow_fields);
480 ASSERT_OK(ConvertSchema(parquet_fields));
481
482 ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
483}
484
485TEST_F(TestConvertParquetSchema, ParquetNestedSchemaPartial) {
486 std::vector<NodePtr> parquet_fields;
487 std::vector<std::shared_ptr<Field>> arrow_fields;
488
489 // Full Parquet Schema:
490 // required group group1 {
491 // required int64 leaf1;
492 // required int64 leaf2;
493 // }
494 // required group group2 {
495 // required int64 leaf3;
496 // required int64 leaf4;
497 // }
498 // required int64 leaf5;
499 //
500 // Expected partial arrow schema (columns 0, 3, 4):
501 // required group group1 {
502 // required int64 leaf1;
503 // }
504 // required group group2 {
505 // required int64 leaf4;
506 // }
507 // required int64 leaf5;
508 {
509 parquet_fields.push_back(GroupNode::Make(
510 "group1", Repetition::REQUIRED,
511 {PrimitiveNode::Make("leaf1", Repetition::REQUIRED, ParquetType::INT64),
512 PrimitiveNode::Make("leaf2", Repetition::REQUIRED, ParquetType::INT64)}));
513 parquet_fields.push_back(GroupNode::Make(
514 "group2", Repetition::REQUIRED,
515 {PrimitiveNode::Make("leaf3", Repetition::REQUIRED, ParquetType::INT64),
516 PrimitiveNode::Make("leaf4", Repetition::REQUIRED, ParquetType::INT64)}));
517 parquet_fields.push_back(
518 PrimitiveNode::Make("leaf5", Repetition::REQUIRED, ParquetType::INT64));
519
520 auto group1_fields = {std::make_shared<Field>("leaf1", INT64, false)};
521 auto arrow_group1_type = std::make_shared<::arrow::StructType>(group1_fields);
522 auto group2_fields = {std::make_shared<Field>("leaf4", INT64, false)};
523 auto arrow_group2_type = std::make_shared<::arrow::StructType>(group2_fields);
524
525 arrow_fields.push_back(std::make_shared<Field>("group1", arrow_group1_type, false));
526 arrow_fields.push_back(std::make_shared<Field>("group2", arrow_group2_type, false));
527 arrow_fields.push_back(std::make_shared<Field>("leaf5", INT64, false));
528 }
529
530 auto arrow_schema = std::make_shared<::arrow::Schema>(arrow_fields);
531 ASSERT_OK(ConvertSchema(parquet_fields, std::vector<int>{0, 3, 4}));
532
533 ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
534}
535
536TEST_F(TestConvertParquetSchema, ParquetNestedSchemaPartialOrdering) {
537 std::vector<NodePtr> parquet_fields;
538 std::vector<std::shared_ptr<Field>> arrow_fields;
539
540 // Full Parquet Schema:
541 // required group group1 {
542 // required int64 leaf1;
543 // required int64 leaf2;
544 // }
545 // required group group2 {
546 // required int64 leaf3;
547 // required int64 leaf4;
548 // }
549 // required int64 leaf5;
550 //
551 // Expected partial arrow schema (columns 3, 4, 0):
552 // required group group2 {
553 // required int64 leaf4;
554 // }
555 // required int64 leaf5;
556 // required group group1 {
557 // required int64 leaf1;
558 // }
559 {
560 parquet_fields.push_back(GroupNode::Make(
561 "group1", Repetition::REQUIRED,
562 {PrimitiveNode::Make("leaf1", Repetition::REQUIRED, ParquetType::INT64),
563 PrimitiveNode::Make("leaf2", Repetition::REQUIRED, ParquetType::INT64)}));
564 parquet_fields.push_back(GroupNode::Make(
565 "group2", Repetition::REQUIRED,
566 {PrimitiveNode::Make("leaf3", Repetition::REQUIRED, ParquetType::INT64),
567 PrimitiveNode::Make("leaf4", Repetition::REQUIRED, ParquetType::INT64)}));
568 parquet_fields.push_back(
569 PrimitiveNode::Make("leaf5", Repetition::REQUIRED, ParquetType::INT64));
570
571 auto group1_fields = {std::make_shared<Field>("leaf1", INT64, false)};
572 auto arrow_group1_type = std::make_shared<::arrow::StructType>(group1_fields);
573 auto group2_fields = {std::make_shared<Field>("leaf4", INT64, false)};
574 auto arrow_group2_type = std::make_shared<::arrow::StructType>(group2_fields);
575
576 arrow_fields.push_back(std::make_shared<Field>("group2", arrow_group2_type, false));
577 arrow_fields.push_back(std::make_shared<Field>("leaf5", INT64, false));
578 arrow_fields.push_back(std::make_shared<Field>("group1", arrow_group1_type, false));
579 }
580
581 auto arrow_schema = std::make_shared<::arrow::Schema>(arrow_fields);
582 ASSERT_OK(ConvertSchema(parquet_fields, std::vector<int>{3, 4, 0}));
583
584 ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
585}
586TEST_F(TestConvertParquetSchema, ParquetRepeatedNestedSchema) {
587 std::vector<NodePtr> parquet_fields;
588 std::vector<std::shared_ptr<Field>> arrow_fields;
589 {
590 // optional int32 leaf1;
591 // repeated group outerGroup {
592 // optional int32 leaf2;
593 // repeated group innerGroup {
594 // optional int32 leaf3;
595 // }
596 // }
597 parquet_fields.push_back(
598 PrimitiveNode::Make("leaf1", Repetition::OPTIONAL, ParquetType::INT32));
599 parquet_fields.push_back(GroupNode::Make(
600 "outerGroup", Repetition::REPEATED,
601 {PrimitiveNode::Make("leaf2", Repetition::OPTIONAL, ParquetType::INT32),
602 GroupNode::Make(
603 "innerGroup", Repetition::REPEATED,
604 {PrimitiveNode::Make("leaf3", Repetition::OPTIONAL, ParquetType::INT32)})}));
605
606 auto inner_group_fields = {std::make_shared<Field>("leaf3", INT32, true)};
607 auto inner_group_type = std::make_shared<::arrow::StructType>(inner_group_fields);
608 auto outer_group_fields = {
609 std::make_shared<Field>("leaf2", INT32, true),
610 std::make_shared<Field>(
611 "innerGroup",
612 ::arrow::list(std::make_shared<Field>("innerGroup", inner_group_type, false)),
613 false)};
614 auto outer_group_type = std::make_shared<::arrow::StructType>(outer_group_fields);
615
616 arrow_fields.push_back(std::make_shared<Field>("leaf1", INT32, true));
617 arrow_fields.push_back(std::make_shared<Field>(
618 "outerGroup",
619 ::arrow::list(std::make_shared<Field>("outerGroup", outer_group_type, false)),
620 false));
621 }
622 auto arrow_schema = std::make_shared<::arrow::Schema>(arrow_fields);
623 ASSERT_OK(ConvertSchema(parquet_fields));
624
625 ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
626}
627
628class TestConvertArrowSchema : public ::testing::Test {
629 public:
630 virtual void SetUp() {}
631
632 void CheckFlatSchema(const std::vector<NodePtr>& nodes) {
633 NodePtr schema_node = GroupNode::Make("schema", Repetition::REPEATED, nodes);
634 const GroupNode* expected_schema_node =
635 static_cast<const GroupNode*>(schema_node.get());
636 const GroupNode* result_schema_node = result_schema_->group_node();
637
638 ASSERT_EQ(expected_schema_node->field_count(), result_schema_node->field_count());
639
640 for (int i = 0; i < expected_schema_node->field_count(); i++) {
641 auto lhs = result_schema_node->field(i);
642 auto rhs = expected_schema_node->field(i);
643 EXPECT_TRUE(lhs->Equals(rhs.get()));
644 }
645 }
646
647 ::arrow::Status ConvertSchema(const std::vector<std::shared_ptr<Field>>& fields) {
648 arrow_schema_ = std::make_shared<::arrow::Schema>(fields);
649 std::shared_ptr<::parquet::WriterProperties> properties =
650 ::parquet::default_writer_properties();
651 return ToParquetSchema(arrow_schema_.get(), *properties.get(), &result_schema_);
652 }
653
654 protected:
655 std::shared_ptr<::arrow::Schema> arrow_schema_;
656 std::shared_ptr<SchemaDescriptor> result_schema_;
657};
658
659TEST_F(TestConvertArrowSchema, ParquetFlatPrimitives) {
660 std::vector<NodePtr> parquet_fields;
661 std::vector<std::shared_ptr<Field>> arrow_fields;
662
663 parquet_fields.push_back(
664 PrimitiveNode::Make("boolean", Repetition::REQUIRED, ParquetType::BOOLEAN));
665 arrow_fields.push_back(std::make_shared<Field>("boolean", BOOL, false));
666
667 parquet_fields.push_back(
668 PrimitiveNode::Make("int32", Repetition::REQUIRED, ParquetType::INT32));
669 arrow_fields.push_back(std::make_shared<Field>("int32", INT32, false));
670
671 parquet_fields.push_back(
672 PrimitiveNode::Make("int64", Repetition::REQUIRED, ParquetType::INT64));
673 arrow_fields.push_back(std::make_shared<Field>("int64", INT64, false));
674
675 parquet_fields.push_back(PrimitiveNode::Make("date", Repetition::REQUIRED,
676 ParquetType::INT32, LogicalType::DATE));
677 arrow_fields.push_back(std::make_shared<Field>("date", ::arrow::date32(), false));
678
679 parquet_fields.push_back(PrimitiveNode::Make("date64", Repetition::REQUIRED,
680 ParquetType::INT32, LogicalType::DATE));
681 arrow_fields.push_back(std::make_shared<Field>("date64", ::arrow::date64(), false));
682
683 parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED,
684 ParquetType::INT64,
685 LogicalType::TIMESTAMP_MILLIS));
686 arrow_fields.push_back(std::make_shared<Field>("timestamp", TIMESTAMP_MS, false));
687
688 parquet_fields.push_back(PrimitiveNode::Make("timestamp[us]", Repetition::REQUIRED,
689 ParquetType::INT64,
690 LogicalType::TIMESTAMP_MICROS));
691 arrow_fields.push_back(std::make_shared<Field>("timestamp[us]", TIMESTAMP_US, false));
692
693 parquet_fields.push_back(
694 PrimitiveNode::Make("float", Repetition::OPTIONAL, ParquetType::FLOAT));
695 arrow_fields.push_back(std::make_shared<Field>("float", FLOAT));
696
697 parquet_fields.push_back(
698 PrimitiveNode::Make("double", Repetition::OPTIONAL, ParquetType::DOUBLE));
699 arrow_fields.push_back(std::make_shared<Field>("double", DOUBLE));
700
701 parquet_fields.push_back(PrimitiveNode::Make(
702 "string", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, LogicalType::UTF8));
703 arrow_fields.push_back(std::make_shared<Field>("string", UTF8));
704
705 parquet_fields.push_back(PrimitiveNode::Make(
706 "binary", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, LogicalType::NONE));
707 arrow_fields.push_back(std::make_shared<Field>("binary", BINARY));
708
709 ASSERT_OK(ConvertSchema(arrow_fields));
710
711 ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields));
712}
713
714TEST_F(TestConvertArrowSchema, ParquetFlatPrimitivesAsDictionaries) {
715 std::vector<NodePtr> parquet_fields;
716 std::vector<std::shared_ptr<Field>> arrow_fields;
717 std::shared_ptr<::arrow::Array> dict;
718
719 parquet_fields.push_back(
720 PrimitiveNode::Make("int32", Repetition::REQUIRED, ParquetType::INT32));
721 ArrayFromVector<::arrow::Int32Type, int32_t>(std::vector<int32_t>(), &dict);
722 arrow_fields.push_back(
723 ::arrow::field("int32", ::arrow::dictionary(::arrow::int8(), dict), false));
724
725 parquet_fields.push_back(
726 PrimitiveNode::Make("int64", Repetition::REQUIRED, ParquetType::INT64));
727 ArrayFromVector<::arrow::Int64Type, int64_t>(std::vector<int64_t>(), &dict);
728 arrow_fields.push_back(std::make_shared<Field>(
729 "int64", ::arrow::dictionary(::arrow::int8(), dict), false));
730
731 parquet_fields.push_back(PrimitiveNode::Make("date", Repetition::REQUIRED,
732 ParquetType::INT32, LogicalType::DATE));
733 ArrayFromVector<::arrow::Date32Type, int32_t>(std::vector<int32_t>(), &dict);
734 arrow_fields.push_back(
735 std::make_shared<Field>("date", ::arrow::dictionary(::arrow::int8(), dict), false));
736
737 parquet_fields.push_back(PrimitiveNode::Make("date64", Repetition::REQUIRED,
738 ParquetType::INT32, LogicalType::DATE));
739 ArrayFromVector<::arrow::Date64Type, int64_t>(std::vector<int64_t>(), &dict);
740 arrow_fields.push_back(std::make_shared<Field>(
741 "date64", ::arrow::dictionary(::arrow::int8(), dict), false));
742
743 parquet_fields.push_back(
744 PrimitiveNode::Make("float", Repetition::OPTIONAL, ParquetType::FLOAT));
745 ArrayFromVector<::arrow::FloatType, float>(std::vector<float>(), &dict);
746 arrow_fields.push_back(
747 std::make_shared<Field>("float", ::arrow::dictionary(::arrow::int8(), dict)));
748
749 parquet_fields.push_back(
750 PrimitiveNode::Make("double", Repetition::OPTIONAL, ParquetType::DOUBLE));
751 ArrayFromVector<::arrow::DoubleType, double>(std::vector<double>(), &dict);
752 arrow_fields.push_back(
753 std::make_shared<Field>("double", ::arrow::dictionary(::arrow::int8(), dict)));
754
755 parquet_fields.push_back(PrimitiveNode::Make(
756 "string", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, LogicalType::UTF8));
757 ::arrow::StringBuilder string_builder(::arrow::default_memory_pool());
758 ASSERT_OK(string_builder.Finish(&dict));
759 arrow_fields.push_back(
760 std::make_shared<Field>("string", ::arrow::dictionary(::arrow::int8(), dict)));
761
762 parquet_fields.push_back(PrimitiveNode::Make(
763 "binary", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, LogicalType::NONE));
764 ::arrow::BinaryBuilder binary_builder(::arrow::default_memory_pool());
765 ASSERT_OK(binary_builder.Finish(&dict));
766 arrow_fields.push_back(
767 std::make_shared<Field>("binary", ::arrow::dictionary(::arrow::int8(), dict)));
768
769 ASSERT_OK(ConvertSchema(arrow_fields));
770
771 ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields));
772}
773
774TEST_F(TestConvertArrowSchema, ParquetLists) {
775 std::vector<NodePtr> parquet_fields;
776 std::vector<std::shared_ptr<Field>> arrow_fields;
777
778 // parquet_arrow will always generate 3-level LIST encodings
779
780 // // List<String> (list non-null, elements nullable)
781 // required group my_list (LIST) {
782 // repeated group list {
783 // optional binary element (UTF8);
784 // }
785 // }
786 {
787 auto element = PrimitiveNode::Make("string", Repetition::OPTIONAL,
788 ParquetType::BYTE_ARRAY, LogicalType::UTF8);
789 auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
790 parquet_fields.push_back(
791 GroupNode::Make("my_list", Repetition::REQUIRED, {list}, LogicalType::LIST));
792 auto arrow_element = std::make_shared<Field>("string", UTF8, true);
793 auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element);
794 arrow_fields.push_back(std::make_shared<Field>("my_list", arrow_list, false));
795 }
796
797 // // List<String> (list nullable, elements non-null)
798 // optional group my_list (LIST) {
799 // repeated group list {
800 // required binary element (UTF8);
801 // }
802 // }
803 {
804 auto element = PrimitiveNode::Make("string", Repetition::REQUIRED,
805 ParquetType::BYTE_ARRAY, LogicalType::UTF8);
806 auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
807 parquet_fields.push_back(
808 GroupNode::Make("my_list", Repetition::OPTIONAL, {list}, LogicalType::LIST));
809 auto arrow_element = std::make_shared<Field>("string", UTF8, false);
810 auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element);
811 arrow_fields.push_back(std::make_shared<Field>("my_list", arrow_list, true));
812 }
813
814 ASSERT_OK(ConvertSchema(arrow_fields));
815
816 ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields));
817}
818
819TEST_F(TestConvertArrowSchema, UnsupportedTypes) {
820 std::vector<std::shared_ptr<Field>> unsupported_fields = {
821 ::arrow::field("f0", ::arrow::time64(TimeUnit::NANO))};
822
823 for (const auto& field : unsupported_fields) {
824 ASSERT_RAISES(NotImplemented, ConvertSchema({field}));
825 }
826}
827
828TEST_F(TestConvertArrowSchema, ParquetFlatDecimals) {
829 std::vector<NodePtr> parquet_fields;
830 std::vector<std::shared_ptr<Field>> arrow_fields;
831
832 // TODO: Test Decimal Arrow -> Parquet conversion
833
834 ASSERT_OK(ConvertSchema(arrow_fields));
835
836 ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields));
837}
838
839TEST(InvalidSchema, ParquetNegativeDecimalScale) {
840 const auto& type = ::arrow::decimal(23, -2);
841 const auto& field = ::arrow::field("f0", type);
842 const auto& arrow_schema = ::arrow::schema({field});
843 std::shared_ptr<::parquet::WriterProperties> properties =
844 ::parquet::default_writer_properties();
845 std::shared_ptr<SchemaDescriptor> result_schema;
846
847 ASSERT_RAISES(IOError,
848 ToParquetSchema(arrow_schema.get(), *properties.get(), &result_schema));
849}
850
851} // namespace arrow
852} // namespace parquet
853