1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include <memory> |
19 | #include <vector> |
20 | |
21 | #include "gtest/gtest.h" |
22 | |
23 | #include "parquet/arrow/schema.h" |
24 | |
25 | #include "arrow/api.h" |
26 | #include "arrow/test-util.h" |
27 | |
28 | using arrow::ArrayFromVector; |
29 | using arrow::Field; |
30 | using arrow::TimeUnit; |
31 | |
32 | using ParquetType = parquet::Type; |
33 | using parquet::LogicalType; |
34 | using parquet::Repetition; |
35 | using parquet::schema::GroupNode; |
36 | using parquet::schema::NodePtr; |
37 | using parquet::schema::PrimitiveNode; |
38 | |
39 | namespace parquet { |
40 | |
41 | namespace arrow { |
42 | |
43 | const auto BOOL = ::arrow::boolean(); |
44 | const auto UINT8 = ::arrow::uint8(); |
45 | const auto INT32 = ::arrow::int32(); |
46 | const auto INT64 = ::arrow::int64(); |
47 | const auto FLOAT = ::arrow::float32(); |
48 | const auto DOUBLE = ::arrow::float64(); |
49 | const auto UTF8 = ::arrow::utf8(); |
50 | const auto TIMESTAMP_MS = ::arrow::timestamp(TimeUnit::MILLI); |
51 | const auto TIMESTAMP_US = ::arrow::timestamp(TimeUnit::MICRO); |
52 | const auto TIMESTAMP_NS = ::arrow::timestamp(TimeUnit::NANO); |
53 | const auto BINARY = ::arrow::binary(); |
54 | const auto DECIMAL_8_4 = std::make_shared<::arrow::Decimal128Type>(8, 4); |
55 | |
56 | class TestConvertParquetSchema : public ::testing::Test { |
57 | public: |
58 | virtual void SetUp() {} |
59 | |
60 | void CheckFlatSchema(const std::shared_ptr<::arrow::Schema>& expected_schema) { |
61 | ASSERT_EQ(expected_schema->num_fields(), result_schema_->num_fields()); |
62 | for (int i = 0; i < expected_schema->num_fields(); ++i) { |
63 | auto lhs = result_schema_->field(i); |
64 | auto rhs = expected_schema->field(i); |
65 | EXPECT_TRUE(lhs->Equals(rhs)) |
66 | << i << " " << lhs->ToString() << " != " << rhs->ToString(); |
67 | } |
68 | } |
69 | |
70 | ::arrow::Status ConvertSchema(const std::vector<NodePtr>& nodes) { |
71 | NodePtr schema = GroupNode::Make("schema" , Repetition::REPEATED, nodes); |
72 | descr_.Init(schema); |
73 | return FromParquetSchema(&descr_, &result_schema_); |
74 | } |
75 | |
76 | ::arrow::Status ConvertSchema(const std::vector<NodePtr>& nodes, |
77 | const std::vector<int>& column_indices) { |
78 | NodePtr schema = GroupNode::Make("schema" , Repetition::REPEATED, nodes); |
79 | descr_.Init(schema); |
80 | return FromParquetSchema(&descr_, column_indices, &result_schema_); |
81 | } |
82 | |
83 | ::arrow::Status ConvertSchema( |
84 | const std::vector<NodePtr>& nodes, |
85 | const std::shared_ptr<const KeyValueMetadata>& key_value_metadata) { |
86 | NodePtr schema = GroupNode::Make("schema" , Repetition::REPEATED, nodes); |
87 | descr_.Init(schema); |
88 | return FromParquetSchema(&descr_, {}, key_value_metadata, &result_schema_); |
89 | } |
90 | |
91 | protected: |
92 | SchemaDescriptor descr_; |
93 | std::shared_ptr<::arrow::Schema> result_schema_; |
94 | }; |
95 | |
96 | TEST_F(TestConvertParquetSchema, ParquetFlatPrimitives) { |
97 | std::vector<NodePtr> parquet_fields; |
98 | std::vector<std::shared_ptr<Field>> arrow_fields; |
99 | |
100 | parquet_fields.push_back( |
101 | PrimitiveNode::Make("boolean" , Repetition::REQUIRED, ParquetType::BOOLEAN)); |
102 | arrow_fields.push_back(std::make_shared<Field>("boolean" , BOOL, false)); |
103 | |
104 | parquet_fields.push_back( |
105 | PrimitiveNode::Make("int32" , Repetition::REQUIRED, ParquetType::INT32)); |
106 | arrow_fields.push_back(std::make_shared<Field>("int32" , INT32, false)); |
107 | |
108 | parquet_fields.push_back( |
109 | PrimitiveNode::Make("int64" , Repetition::REQUIRED, ParquetType::INT64)); |
110 | arrow_fields.push_back(std::make_shared<Field>("int64" , INT64, false)); |
111 | |
112 | parquet_fields.push_back(PrimitiveNode::Make("timestamp" , Repetition::REQUIRED, |
113 | ParquetType::INT64, |
114 | LogicalType::TIMESTAMP_MILLIS)); |
115 | arrow_fields.push_back(std::make_shared<Field>("timestamp" , TIMESTAMP_MS, false)); |
116 | |
117 | parquet_fields.push_back(PrimitiveNode::Make("timestamp[us]" , Repetition::REQUIRED, |
118 | ParquetType::INT64, |
119 | LogicalType::TIMESTAMP_MICROS)); |
120 | arrow_fields.push_back(std::make_shared<Field>("timestamp[us]" , TIMESTAMP_US, false)); |
121 | |
122 | parquet_fields.push_back(PrimitiveNode::Make("date" , Repetition::REQUIRED, |
123 | ParquetType::INT32, LogicalType::DATE)); |
124 | arrow_fields.push_back(std::make_shared<Field>("date" , ::arrow::date32(), false)); |
125 | |
126 | parquet_fields.push_back(PrimitiveNode::Make( |
127 | "time32" , Repetition::REQUIRED, ParquetType::INT32, LogicalType::TIME_MILLIS)); |
128 | arrow_fields.push_back( |
129 | std::make_shared<Field>("time32" , ::arrow::time32(TimeUnit::MILLI), false)); |
130 | |
131 | parquet_fields.push_back(PrimitiveNode::Make( |
132 | "time64" , Repetition::REQUIRED, ParquetType::INT64, LogicalType::TIME_MICROS)); |
133 | arrow_fields.push_back( |
134 | std::make_shared<Field>("time64" , ::arrow::time64(TimeUnit::MICRO), false)); |
135 | |
136 | parquet_fields.push_back( |
137 | PrimitiveNode::Make("timestamp96" , Repetition::REQUIRED, ParquetType::INT96)); |
138 | arrow_fields.push_back(std::make_shared<Field>("timestamp96" , TIMESTAMP_NS, false)); |
139 | |
140 | parquet_fields.push_back( |
141 | PrimitiveNode::Make("float" , Repetition::OPTIONAL, ParquetType::FLOAT)); |
142 | arrow_fields.push_back(std::make_shared<Field>("float" , FLOAT)); |
143 | |
144 | parquet_fields.push_back( |
145 | PrimitiveNode::Make("double" , Repetition::OPTIONAL, ParquetType::DOUBLE)); |
146 | arrow_fields.push_back(std::make_shared<Field>("double" , DOUBLE)); |
147 | |
148 | parquet_fields.push_back( |
149 | PrimitiveNode::Make("binary" , Repetition::OPTIONAL, ParquetType::BYTE_ARRAY)); |
150 | arrow_fields.push_back(std::make_shared<Field>("binary" , BINARY)); |
151 | |
152 | parquet_fields.push_back(PrimitiveNode::Make( |
153 | "string" , Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, LogicalType::UTF8)); |
154 | arrow_fields.push_back(std::make_shared<Field>("string" , UTF8)); |
155 | |
156 | parquet_fields.push_back(PrimitiveNode::Make("flba-binary" , Repetition::OPTIONAL, |
157 | ParquetType::FIXED_LEN_BYTE_ARRAY, |
158 | LogicalType::NONE, 12)); |
159 | arrow_fields.push_back( |
160 | std::make_shared<Field>("flba-binary" , ::arrow::fixed_size_binary(12))); |
161 | |
162 | auto arrow_schema = std::make_shared<::arrow::Schema>(arrow_fields); |
163 | ASSERT_OK(ConvertSchema(parquet_fields)); |
164 | |
165 | ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); |
166 | } |
167 | |
168 | TEST_F(TestConvertParquetSchema, DuplicateFieldNames) { |
169 | std::vector<NodePtr> parquet_fields; |
170 | std::vector<std::shared_ptr<Field>> arrow_fields; |
171 | |
172 | parquet_fields.push_back( |
173 | PrimitiveNode::Make("xxx" , Repetition::REQUIRED, ParquetType::BOOLEAN)); |
174 | auto arrow_field1 = std::make_shared<Field>("xxx" , BOOL, false); |
175 | |
176 | parquet_fields.push_back( |
177 | PrimitiveNode::Make("xxx" , Repetition::REQUIRED, ParquetType::INT32)); |
178 | auto arrow_field2 = std::make_shared<Field>("xxx" , INT32, false); |
179 | |
180 | ASSERT_OK(ConvertSchema(parquet_fields)); |
181 | arrow_fields = {arrow_field1, arrow_field2}; |
182 | ASSERT_NO_FATAL_FAILURE( |
183 | CheckFlatSchema(std::make_shared<::arrow::Schema>(arrow_fields))); |
184 | |
185 | ASSERT_OK(ConvertSchema(parquet_fields, std::vector<int>({0, 1}))); |
186 | arrow_fields = {arrow_field1, arrow_field2}; |
187 | ASSERT_NO_FATAL_FAILURE( |
188 | CheckFlatSchema(std::make_shared<::arrow::Schema>(arrow_fields))); |
189 | |
190 | ASSERT_OK(ConvertSchema(parquet_fields, std::vector<int>({1, 0}))); |
191 | arrow_fields = {arrow_field2, arrow_field1}; |
192 | ASSERT_NO_FATAL_FAILURE( |
193 | CheckFlatSchema(std::make_shared<::arrow::Schema>(arrow_fields))); |
194 | } |
195 | |
196 | TEST_F(TestConvertParquetSchema, ParquetKeyValueMetadata) { |
197 | std::vector<NodePtr> parquet_fields; |
198 | std::vector<std::shared_ptr<Field>> arrow_fields; |
199 | |
200 | parquet_fields.push_back( |
201 | PrimitiveNode::Make("boolean" , Repetition::REQUIRED, ParquetType::BOOLEAN)); |
202 | arrow_fields.push_back(std::make_shared<Field>("boolean" , BOOL, false)); |
203 | |
204 | parquet_fields.push_back( |
205 | PrimitiveNode::Make("int32" , Repetition::REQUIRED, ParquetType::INT32)); |
206 | arrow_fields.push_back(std::make_shared<Field>("int32" , INT32, false)); |
207 | |
208 | auto key_value_metadata = std::make_shared<KeyValueMetadata>(); |
209 | key_value_metadata->Append("foo" , "bar" ); |
210 | key_value_metadata->Append("biz" , "baz" ); |
211 | ASSERT_OK(ConvertSchema(parquet_fields, key_value_metadata)); |
212 | |
213 | auto arrow_metadata = result_schema_->metadata(); |
214 | ASSERT_EQ("foo" , arrow_metadata->key(0)); |
215 | ASSERT_EQ("bar" , arrow_metadata->value(0)); |
216 | ASSERT_EQ("biz" , arrow_metadata->key(1)); |
217 | ASSERT_EQ("baz" , arrow_metadata->value(1)); |
218 | } |
219 | |
220 | TEST_F(TestConvertParquetSchema, ParquetEmptyKeyValueMetadata) { |
221 | std::vector<NodePtr> parquet_fields; |
222 | std::vector<std::shared_ptr<Field>> arrow_fields; |
223 | |
224 | parquet_fields.push_back( |
225 | PrimitiveNode::Make("int32" , Repetition::REQUIRED, ParquetType::INT32)); |
226 | arrow_fields.push_back(std::make_shared<Field>("int32" , INT32, false)); |
227 | |
228 | std::shared_ptr<KeyValueMetadata> key_value_metadata = nullptr; |
229 | ASSERT_OK(ConvertSchema(parquet_fields, key_value_metadata)); |
230 | |
231 | auto arrow_metadata = result_schema_->metadata(); |
232 | ASSERT_EQ(arrow_metadata, nullptr); |
233 | } |
234 | |
235 | TEST_F(TestConvertParquetSchema, ParquetFlatDecimals) { |
236 | std::vector<NodePtr> parquet_fields; |
237 | std::vector<std::shared_ptr<Field>> arrow_fields; |
238 | |
239 | parquet_fields.push_back(PrimitiveNode::Make("flba-decimal" , Repetition::OPTIONAL, |
240 | ParquetType::FIXED_LEN_BYTE_ARRAY, |
241 | LogicalType::DECIMAL, 4, 8, 4)); |
242 | arrow_fields.push_back(std::make_shared<Field>("flba-decimal" , DECIMAL_8_4)); |
243 | |
244 | parquet_fields.push_back(PrimitiveNode::Make("binary-decimal" , Repetition::OPTIONAL, |
245 | ParquetType::BYTE_ARRAY, |
246 | LogicalType::DECIMAL, -1, 8, 4)); |
247 | arrow_fields.push_back(std::make_shared<Field>("binary-decimal" , DECIMAL_8_4)); |
248 | |
249 | parquet_fields.push_back(PrimitiveNode::Make("int32-decimal" , Repetition::OPTIONAL, |
250 | ParquetType::INT32, LogicalType::DECIMAL, |
251 | -1, 8, 4)); |
252 | arrow_fields.push_back(std::make_shared<Field>("int32-decimal" , DECIMAL_8_4)); |
253 | |
254 | parquet_fields.push_back(PrimitiveNode::Make("int64-decimal" , Repetition::OPTIONAL, |
255 | ParquetType::INT64, LogicalType::DECIMAL, |
256 | -1, 8, 4)); |
257 | arrow_fields.push_back(std::make_shared<Field>("int64-decimal" , DECIMAL_8_4)); |
258 | |
259 | auto arrow_schema = std::make_shared<::arrow::Schema>(arrow_fields); |
260 | ASSERT_OK(ConvertSchema(parquet_fields)); |
261 | |
262 | ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); |
263 | } |
264 | |
265 | TEST_F(TestConvertParquetSchema, ParquetLists) { |
266 | std::vector<NodePtr> parquet_fields; |
267 | std::vector<std::shared_ptr<Field>> arrow_fields; |
268 | |
269 | // LIST encoding example taken from parquet-format/LogicalTypes.md |
270 | |
271 | // // List<String> (list non-null, elements nullable) |
272 | // required group my_list (LIST) { |
273 | // repeated group list { |
274 | // optional binary element (UTF8); |
275 | // } |
276 | // } |
277 | { |
278 | auto element = PrimitiveNode::Make("string" , Repetition::OPTIONAL, |
279 | ParquetType::BYTE_ARRAY, LogicalType::UTF8); |
280 | auto list = GroupNode::Make("list" , Repetition::REPEATED, {element}); |
281 | parquet_fields.push_back( |
282 | GroupNode::Make("my_list" , Repetition::REQUIRED, {list}, LogicalType::LIST)); |
283 | auto arrow_element = std::make_shared<Field>("string" , UTF8, true); |
284 | auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element); |
285 | arrow_fields.push_back(std::make_shared<Field>("my_list" , arrow_list, false)); |
286 | } |
287 | |
288 | // // List<String> (list nullable, elements non-null) |
289 | // optional group my_list (LIST) { |
290 | // repeated group list { |
291 | // required binary element (UTF8); |
292 | // } |
293 | // } |
294 | { |
295 | auto element = PrimitiveNode::Make("string" , Repetition::REQUIRED, |
296 | ParquetType::BYTE_ARRAY, LogicalType::UTF8); |
297 | auto list = GroupNode::Make("list" , Repetition::REPEATED, {element}); |
298 | parquet_fields.push_back( |
299 | GroupNode::Make("my_list" , Repetition::OPTIONAL, {list}, LogicalType::LIST)); |
300 | auto arrow_element = std::make_shared<Field>("string" , UTF8, false); |
301 | auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element); |
302 | arrow_fields.push_back(std::make_shared<Field>("my_list" , arrow_list, true)); |
303 | } |
304 | |
305 | // Element types can be nested structures. For example, a list of lists: |
306 | // |
307 | // // List<List<Integer>> |
308 | // optional group array_of_arrays (LIST) { |
309 | // repeated group list { |
310 | // required group element (LIST) { |
311 | // repeated group list { |
312 | // required int32 element; |
313 | // } |
314 | // } |
315 | // } |
316 | // } |
317 | { |
318 | auto inner_element = |
319 | PrimitiveNode::Make("int32" , Repetition::REQUIRED, ParquetType::INT32); |
320 | auto inner_list = GroupNode::Make("list" , Repetition::REPEATED, {inner_element}); |
321 | auto element = |
322 | GroupNode::Make("element" , Repetition::REQUIRED, {inner_list}, LogicalType::LIST); |
323 | auto list = GroupNode::Make("list" , Repetition::REPEATED, {element}); |
324 | parquet_fields.push_back(GroupNode::Make("array_of_arrays" , Repetition::OPTIONAL, |
325 | {list}, LogicalType::LIST)); |
326 | auto arrow_inner_element = std::make_shared<Field>("int32" , INT32, false); |
327 | auto arrow_inner_list = std::make_shared<::arrow::ListType>(arrow_inner_element); |
328 | auto arrow_element = std::make_shared<Field>("element" , arrow_inner_list, false); |
329 | auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element); |
330 | arrow_fields.push_back(std::make_shared<Field>("array_of_arrays" , arrow_list, true)); |
331 | } |
332 | |
333 | // // List<String> (list nullable, elements non-null) |
334 | // optional group my_list (LIST) { |
335 | // repeated group element { |
336 | // required binary str (UTF8); |
337 | // }; |
338 | // } |
339 | { |
340 | auto element = PrimitiveNode::Make("str" , Repetition::REQUIRED, |
341 | ParquetType::BYTE_ARRAY, LogicalType::UTF8); |
342 | auto list = GroupNode::Make("element" , Repetition::REPEATED, {element}); |
343 | parquet_fields.push_back( |
344 | GroupNode::Make("my_list" , Repetition::OPTIONAL, {list}, LogicalType::LIST)); |
345 | auto arrow_element = std::make_shared<Field>("str" , UTF8, false); |
346 | auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element); |
347 | arrow_fields.push_back(std::make_shared<Field>("my_list" , arrow_list, true)); |
348 | } |
349 | |
350 | // // List<Integer> (nullable list, non-null elements) |
351 | // optional group my_list (LIST) { |
352 | // repeated int32 element; |
353 | // } |
354 | { |
355 | auto element = |
356 | PrimitiveNode::Make("element" , Repetition::REPEATED, ParquetType::INT32); |
357 | parquet_fields.push_back( |
358 | GroupNode::Make("my_list" , Repetition::OPTIONAL, {element}, LogicalType::LIST)); |
359 | auto arrow_element = std::make_shared<Field>("element" , INT32, false); |
360 | auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element); |
361 | arrow_fields.push_back(std::make_shared<Field>("my_list" , arrow_list, true)); |
362 | } |
363 | |
364 | // // List<Tuple<String, Integer>> (nullable list, non-null elements) |
365 | // optional group my_list (LIST) { |
366 | // repeated group element { |
367 | // required binary str (UTF8); |
368 | // required int32 num; |
369 | // }; |
370 | // } |
371 | { |
372 | auto str_element = PrimitiveNode::Make("str" , Repetition::REQUIRED, |
373 | ParquetType::BYTE_ARRAY, LogicalType::UTF8); |
374 | auto num_element = |
375 | PrimitiveNode::Make("num" , Repetition::REQUIRED, ParquetType::INT32); |
376 | auto element = |
377 | GroupNode::Make("element" , Repetition::REPEATED, {str_element, num_element}); |
378 | parquet_fields.push_back( |
379 | GroupNode::Make("my_list" , Repetition::OPTIONAL, {element}, LogicalType::LIST)); |
380 | auto arrow_str = std::make_shared<Field>("str" , UTF8, false); |
381 | auto arrow_num = std::make_shared<Field>("num" , INT32, false); |
382 | std::vector<std::shared_ptr<Field>> fields({arrow_str, arrow_num}); |
383 | auto arrow_struct = std::make_shared<::arrow::StructType>(fields); |
384 | auto arrow_element = std::make_shared<Field>("element" , arrow_struct, false); |
385 | auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element); |
386 | arrow_fields.push_back(std::make_shared<Field>("my_list" , arrow_list, true)); |
387 | } |
388 | |
389 | // // List<OneTuple<String>> (nullable list, non-null elements) |
390 | // optional group my_list (LIST) { |
391 | // repeated group array { |
392 | // required binary str (UTF8); |
393 | // }; |
394 | // } |
395 | // Special case: group is named array |
396 | { |
397 | auto element = PrimitiveNode::Make("str" , Repetition::REQUIRED, |
398 | ParquetType::BYTE_ARRAY, LogicalType::UTF8); |
399 | auto array = GroupNode::Make("array" , Repetition::REPEATED, {element}); |
400 | parquet_fields.push_back( |
401 | GroupNode::Make("my_list" , Repetition::OPTIONAL, {array}, LogicalType::LIST)); |
402 | auto arrow_str = std::make_shared<Field>("str" , UTF8, false); |
403 | std::vector<std::shared_ptr<Field>> fields({arrow_str}); |
404 | auto arrow_struct = std::make_shared<::arrow::StructType>(fields); |
405 | auto arrow_element = std::make_shared<Field>("array" , arrow_struct, false); |
406 | auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element); |
407 | arrow_fields.push_back(std::make_shared<Field>("my_list" , arrow_list, true)); |
408 | } |
409 | |
410 | // // List<OneTuple<String>> (nullable list, non-null elements) |
411 | // optional group my_list (LIST) { |
412 | // repeated group my_list_tuple { |
413 | // required binary str (UTF8); |
414 | // }; |
415 | // } |
416 | // Special case: group named ends in _tuple |
417 | { |
418 | auto element = PrimitiveNode::Make("str" , Repetition::REQUIRED, |
419 | ParquetType::BYTE_ARRAY, LogicalType::UTF8); |
420 | auto array = GroupNode::Make("my_list_tuple" , Repetition::REPEATED, {element}); |
421 | parquet_fields.push_back( |
422 | GroupNode::Make("my_list" , Repetition::OPTIONAL, {array}, LogicalType::LIST)); |
423 | auto arrow_str = std::make_shared<Field>("str" , UTF8, false); |
424 | std::vector<std::shared_ptr<Field>> fields({arrow_str}); |
425 | auto arrow_struct = std::make_shared<::arrow::StructType>(fields); |
426 | auto arrow_element = std::make_shared<Field>("my_list_tuple" , arrow_struct, false); |
427 | auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element); |
428 | arrow_fields.push_back(std::make_shared<Field>("my_list" , arrow_list, true)); |
429 | } |
430 | |
431 | // One-level encoding: Only allows required lists with required cells |
432 | // repeated value_type name |
433 | { |
434 | parquet_fields.push_back( |
435 | PrimitiveNode::Make("name" , Repetition::REPEATED, ParquetType::INT32)); |
436 | auto arrow_element = std::make_shared<Field>("name" , INT32, false); |
437 | auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element); |
438 | arrow_fields.push_back(std::make_shared<Field>("name" , arrow_list, false)); |
439 | } |
440 | |
441 | auto arrow_schema = std::make_shared<::arrow::Schema>(arrow_fields); |
442 | ASSERT_OK(ConvertSchema(parquet_fields)); |
443 | |
444 | ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); |
445 | } |
446 | |
447 | TEST_F(TestConvertParquetSchema, UnsupportedThings) { |
448 | std::vector<NodePtr> unsupported_nodes; |
449 | |
450 | for (const NodePtr& node : unsupported_nodes) { |
451 | ASSERT_RAISES(NotImplemented, ConvertSchema({node})); |
452 | } |
453 | } |
454 | |
455 | TEST_F(TestConvertParquetSchema, ParquetNestedSchema) { |
456 | std::vector<NodePtr> parquet_fields; |
457 | std::vector<std::shared_ptr<Field>> arrow_fields; |
458 | |
459 | // required group group1 { |
460 | // required bool leaf1; |
461 | // required int32 leaf2; |
462 | // } |
463 | // required int64 leaf3; |
464 | { |
465 | parquet_fields.push_back(GroupNode::Make( |
466 | "group1" , Repetition::REQUIRED, |
467 | {PrimitiveNode::Make("leaf1" , Repetition::REQUIRED, ParquetType::BOOLEAN), |
468 | PrimitiveNode::Make("leaf2" , Repetition::REQUIRED, ParquetType::INT32)})); |
469 | parquet_fields.push_back( |
470 | PrimitiveNode::Make("leaf3" , Repetition::REQUIRED, ParquetType::INT64)); |
471 | |
472 | auto group1_fields = {std::make_shared<Field>("leaf1" , BOOL, false), |
473 | std::make_shared<Field>("leaf2" , INT32, false)}; |
474 | auto arrow_group1_type = std::make_shared<::arrow::StructType>(group1_fields); |
475 | arrow_fields.push_back(std::make_shared<Field>("group1" , arrow_group1_type, false)); |
476 | arrow_fields.push_back(std::make_shared<Field>("leaf3" , INT64, false)); |
477 | } |
478 | |
479 | auto arrow_schema = std::make_shared<::arrow::Schema>(arrow_fields); |
480 | ASSERT_OK(ConvertSchema(parquet_fields)); |
481 | |
482 | ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); |
483 | } |
484 | |
485 | TEST_F(TestConvertParquetSchema, ParquetNestedSchemaPartial) { |
486 | std::vector<NodePtr> parquet_fields; |
487 | std::vector<std::shared_ptr<Field>> arrow_fields; |
488 | |
489 | // Full Parquet Schema: |
490 | // required group group1 { |
491 | // required int64 leaf1; |
492 | // required int64 leaf2; |
493 | // } |
494 | // required group group2 { |
495 | // required int64 leaf3; |
496 | // required int64 leaf4; |
497 | // } |
498 | // required int64 leaf5; |
499 | // |
500 | // Expected partial arrow schema (columns 0, 3, 4): |
501 | // required group group1 { |
502 | // required int64 leaf1; |
503 | // } |
504 | // required group group2 { |
505 | // required int64 leaf4; |
506 | // } |
507 | // required int64 leaf5; |
508 | { |
509 | parquet_fields.push_back(GroupNode::Make( |
510 | "group1" , Repetition::REQUIRED, |
511 | {PrimitiveNode::Make("leaf1" , Repetition::REQUIRED, ParquetType::INT64), |
512 | PrimitiveNode::Make("leaf2" , Repetition::REQUIRED, ParquetType::INT64)})); |
513 | parquet_fields.push_back(GroupNode::Make( |
514 | "group2" , Repetition::REQUIRED, |
515 | {PrimitiveNode::Make("leaf3" , Repetition::REQUIRED, ParquetType::INT64), |
516 | PrimitiveNode::Make("leaf4" , Repetition::REQUIRED, ParquetType::INT64)})); |
517 | parquet_fields.push_back( |
518 | PrimitiveNode::Make("leaf5" , Repetition::REQUIRED, ParquetType::INT64)); |
519 | |
520 | auto group1_fields = {std::make_shared<Field>("leaf1" , INT64, false)}; |
521 | auto arrow_group1_type = std::make_shared<::arrow::StructType>(group1_fields); |
522 | auto group2_fields = {std::make_shared<Field>("leaf4" , INT64, false)}; |
523 | auto arrow_group2_type = std::make_shared<::arrow::StructType>(group2_fields); |
524 | |
525 | arrow_fields.push_back(std::make_shared<Field>("group1" , arrow_group1_type, false)); |
526 | arrow_fields.push_back(std::make_shared<Field>("group2" , arrow_group2_type, false)); |
527 | arrow_fields.push_back(std::make_shared<Field>("leaf5" , INT64, false)); |
528 | } |
529 | |
530 | auto arrow_schema = std::make_shared<::arrow::Schema>(arrow_fields); |
531 | ASSERT_OK(ConvertSchema(parquet_fields, std::vector<int>{0, 3, 4})); |
532 | |
533 | ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); |
534 | } |
535 | |
536 | TEST_F(TestConvertParquetSchema, ParquetNestedSchemaPartialOrdering) { |
537 | std::vector<NodePtr> parquet_fields; |
538 | std::vector<std::shared_ptr<Field>> arrow_fields; |
539 | |
540 | // Full Parquet Schema: |
541 | // required group group1 { |
542 | // required int64 leaf1; |
543 | // required int64 leaf2; |
544 | // } |
545 | // required group group2 { |
546 | // required int64 leaf3; |
547 | // required int64 leaf4; |
548 | // } |
549 | // required int64 leaf5; |
550 | // |
551 | // Expected partial arrow schema (columns 3, 4, 0): |
552 | // required group group2 { |
553 | // required int64 leaf4; |
554 | // } |
555 | // required int64 leaf5; |
556 | // required group group1 { |
557 | // required int64 leaf1; |
558 | // } |
559 | { |
560 | parquet_fields.push_back(GroupNode::Make( |
561 | "group1" , Repetition::REQUIRED, |
562 | {PrimitiveNode::Make("leaf1" , Repetition::REQUIRED, ParquetType::INT64), |
563 | PrimitiveNode::Make("leaf2" , Repetition::REQUIRED, ParquetType::INT64)})); |
564 | parquet_fields.push_back(GroupNode::Make( |
565 | "group2" , Repetition::REQUIRED, |
566 | {PrimitiveNode::Make("leaf3" , Repetition::REQUIRED, ParquetType::INT64), |
567 | PrimitiveNode::Make("leaf4" , Repetition::REQUIRED, ParquetType::INT64)})); |
568 | parquet_fields.push_back( |
569 | PrimitiveNode::Make("leaf5" , Repetition::REQUIRED, ParquetType::INT64)); |
570 | |
571 | auto group1_fields = {std::make_shared<Field>("leaf1" , INT64, false)}; |
572 | auto arrow_group1_type = std::make_shared<::arrow::StructType>(group1_fields); |
573 | auto group2_fields = {std::make_shared<Field>("leaf4" , INT64, false)}; |
574 | auto arrow_group2_type = std::make_shared<::arrow::StructType>(group2_fields); |
575 | |
576 | arrow_fields.push_back(std::make_shared<Field>("group2" , arrow_group2_type, false)); |
577 | arrow_fields.push_back(std::make_shared<Field>("leaf5" , INT64, false)); |
578 | arrow_fields.push_back(std::make_shared<Field>("group1" , arrow_group1_type, false)); |
579 | } |
580 | |
581 | auto arrow_schema = std::make_shared<::arrow::Schema>(arrow_fields); |
582 | ASSERT_OK(ConvertSchema(parquet_fields, std::vector<int>{3, 4, 0})); |
583 | |
584 | ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); |
585 | } |
586 | TEST_F(TestConvertParquetSchema, ParquetRepeatedNestedSchema) { |
587 | std::vector<NodePtr> parquet_fields; |
588 | std::vector<std::shared_ptr<Field>> arrow_fields; |
589 | { |
590 | // optional int32 leaf1; |
591 | // repeated group outerGroup { |
592 | // optional int32 leaf2; |
593 | // repeated group innerGroup { |
594 | // optional int32 leaf3; |
595 | // } |
596 | // } |
597 | parquet_fields.push_back( |
598 | PrimitiveNode::Make("leaf1" , Repetition::OPTIONAL, ParquetType::INT32)); |
599 | parquet_fields.push_back(GroupNode::Make( |
600 | "outerGroup" , Repetition::REPEATED, |
601 | {PrimitiveNode::Make("leaf2" , Repetition::OPTIONAL, ParquetType::INT32), |
602 | GroupNode::Make( |
603 | "innerGroup" , Repetition::REPEATED, |
604 | {PrimitiveNode::Make("leaf3" , Repetition::OPTIONAL, ParquetType::INT32)})})); |
605 | |
606 | auto inner_group_fields = {std::make_shared<Field>("leaf3" , INT32, true)}; |
607 | auto inner_group_type = std::make_shared<::arrow::StructType>(inner_group_fields); |
608 | auto outer_group_fields = { |
609 | std::make_shared<Field>("leaf2" , INT32, true), |
610 | std::make_shared<Field>( |
611 | "innerGroup" , |
612 | ::arrow::list(std::make_shared<Field>("innerGroup" , inner_group_type, false)), |
613 | false)}; |
614 | auto outer_group_type = std::make_shared<::arrow::StructType>(outer_group_fields); |
615 | |
616 | arrow_fields.push_back(std::make_shared<Field>("leaf1" , INT32, true)); |
617 | arrow_fields.push_back(std::make_shared<Field>( |
618 | "outerGroup" , |
619 | ::arrow::list(std::make_shared<Field>("outerGroup" , outer_group_type, false)), |
620 | false)); |
621 | } |
622 | auto arrow_schema = std::make_shared<::arrow::Schema>(arrow_fields); |
623 | ASSERT_OK(ConvertSchema(parquet_fields)); |
624 | |
625 | ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); |
626 | } |
627 | |
628 | class TestConvertArrowSchema : public ::testing::Test { |
629 | public: |
630 | virtual void SetUp() {} |
631 | |
632 | void CheckFlatSchema(const std::vector<NodePtr>& nodes) { |
633 | NodePtr schema_node = GroupNode::Make("schema" , Repetition::REPEATED, nodes); |
634 | const GroupNode* expected_schema_node = |
635 | static_cast<const GroupNode*>(schema_node.get()); |
636 | const GroupNode* result_schema_node = result_schema_->group_node(); |
637 | |
638 | ASSERT_EQ(expected_schema_node->field_count(), result_schema_node->field_count()); |
639 | |
640 | for (int i = 0; i < expected_schema_node->field_count(); i++) { |
641 | auto lhs = result_schema_node->field(i); |
642 | auto rhs = expected_schema_node->field(i); |
643 | EXPECT_TRUE(lhs->Equals(rhs.get())); |
644 | } |
645 | } |
646 | |
647 | ::arrow::Status ConvertSchema(const std::vector<std::shared_ptr<Field>>& fields) { |
648 | arrow_schema_ = std::make_shared<::arrow::Schema>(fields); |
649 | std::shared_ptr<::parquet::WriterProperties> properties = |
650 | ::parquet::default_writer_properties(); |
651 | return ToParquetSchema(arrow_schema_.get(), *properties.get(), &result_schema_); |
652 | } |
653 | |
654 | protected: |
655 | std::shared_ptr<::arrow::Schema> arrow_schema_; |
656 | std::shared_ptr<SchemaDescriptor> result_schema_; |
657 | }; |
658 | |
659 | TEST_F(TestConvertArrowSchema, ParquetFlatPrimitives) { |
660 | std::vector<NodePtr> parquet_fields; |
661 | std::vector<std::shared_ptr<Field>> arrow_fields; |
662 | |
663 | parquet_fields.push_back( |
664 | PrimitiveNode::Make("boolean" , Repetition::REQUIRED, ParquetType::BOOLEAN)); |
665 | arrow_fields.push_back(std::make_shared<Field>("boolean" , BOOL, false)); |
666 | |
667 | parquet_fields.push_back( |
668 | PrimitiveNode::Make("int32" , Repetition::REQUIRED, ParquetType::INT32)); |
669 | arrow_fields.push_back(std::make_shared<Field>("int32" , INT32, false)); |
670 | |
671 | parquet_fields.push_back( |
672 | PrimitiveNode::Make("int64" , Repetition::REQUIRED, ParquetType::INT64)); |
673 | arrow_fields.push_back(std::make_shared<Field>("int64" , INT64, false)); |
674 | |
675 | parquet_fields.push_back(PrimitiveNode::Make("date" , Repetition::REQUIRED, |
676 | ParquetType::INT32, LogicalType::DATE)); |
677 | arrow_fields.push_back(std::make_shared<Field>("date" , ::arrow::date32(), false)); |
678 | |
679 | parquet_fields.push_back(PrimitiveNode::Make("date64" , Repetition::REQUIRED, |
680 | ParquetType::INT32, LogicalType::DATE)); |
681 | arrow_fields.push_back(std::make_shared<Field>("date64" , ::arrow::date64(), false)); |
682 | |
683 | parquet_fields.push_back(PrimitiveNode::Make("timestamp" , Repetition::REQUIRED, |
684 | ParquetType::INT64, |
685 | LogicalType::TIMESTAMP_MILLIS)); |
686 | arrow_fields.push_back(std::make_shared<Field>("timestamp" , TIMESTAMP_MS, false)); |
687 | |
688 | parquet_fields.push_back(PrimitiveNode::Make("timestamp[us]" , Repetition::REQUIRED, |
689 | ParquetType::INT64, |
690 | LogicalType::TIMESTAMP_MICROS)); |
691 | arrow_fields.push_back(std::make_shared<Field>("timestamp[us]" , TIMESTAMP_US, false)); |
692 | |
693 | parquet_fields.push_back( |
694 | PrimitiveNode::Make("float" , Repetition::OPTIONAL, ParquetType::FLOAT)); |
695 | arrow_fields.push_back(std::make_shared<Field>("float" , FLOAT)); |
696 | |
697 | parquet_fields.push_back( |
698 | PrimitiveNode::Make("double" , Repetition::OPTIONAL, ParquetType::DOUBLE)); |
699 | arrow_fields.push_back(std::make_shared<Field>("double" , DOUBLE)); |
700 | |
701 | parquet_fields.push_back(PrimitiveNode::Make( |
702 | "string" , Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, LogicalType::UTF8)); |
703 | arrow_fields.push_back(std::make_shared<Field>("string" , UTF8)); |
704 | |
705 | parquet_fields.push_back(PrimitiveNode::Make( |
706 | "binary" , Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, LogicalType::NONE)); |
707 | arrow_fields.push_back(std::make_shared<Field>("binary" , BINARY)); |
708 | |
709 | ASSERT_OK(ConvertSchema(arrow_fields)); |
710 | |
711 | ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields)); |
712 | } |
713 | |
714 | TEST_F(TestConvertArrowSchema, ParquetFlatPrimitivesAsDictionaries) { |
715 | std::vector<NodePtr> parquet_fields; |
716 | std::vector<std::shared_ptr<Field>> arrow_fields; |
717 | std::shared_ptr<::arrow::Array> dict; |
718 | |
719 | parquet_fields.push_back( |
720 | PrimitiveNode::Make("int32" , Repetition::REQUIRED, ParquetType::INT32)); |
721 | ArrayFromVector<::arrow::Int32Type, int32_t>(std::vector<int32_t>(), &dict); |
722 | arrow_fields.push_back( |
723 | ::arrow::field("int32" , ::arrow::dictionary(::arrow::int8(), dict), false)); |
724 | |
725 | parquet_fields.push_back( |
726 | PrimitiveNode::Make("int64" , Repetition::REQUIRED, ParquetType::INT64)); |
727 | ArrayFromVector<::arrow::Int64Type, int64_t>(std::vector<int64_t>(), &dict); |
728 | arrow_fields.push_back(std::make_shared<Field>( |
729 | "int64" , ::arrow::dictionary(::arrow::int8(), dict), false)); |
730 | |
731 | parquet_fields.push_back(PrimitiveNode::Make("date" , Repetition::REQUIRED, |
732 | ParquetType::INT32, LogicalType::DATE)); |
733 | ArrayFromVector<::arrow::Date32Type, int32_t>(std::vector<int32_t>(), &dict); |
734 | arrow_fields.push_back( |
735 | std::make_shared<Field>("date" , ::arrow::dictionary(::arrow::int8(), dict), false)); |
736 | |
737 | parquet_fields.push_back(PrimitiveNode::Make("date64" , Repetition::REQUIRED, |
738 | ParquetType::INT32, LogicalType::DATE)); |
739 | ArrayFromVector<::arrow::Date64Type, int64_t>(std::vector<int64_t>(), &dict); |
740 | arrow_fields.push_back(std::make_shared<Field>( |
741 | "date64" , ::arrow::dictionary(::arrow::int8(), dict), false)); |
742 | |
743 | parquet_fields.push_back( |
744 | PrimitiveNode::Make("float" , Repetition::OPTIONAL, ParquetType::FLOAT)); |
745 | ArrayFromVector<::arrow::FloatType, float>(std::vector<float>(), &dict); |
746 | arrow_fields.push_back( |
747 | std::make_shared<Field>("float" , ::arrow::dictionary(::arrow::int8(), dict))); |
748 | |
749 | parquet_fields.push_back( |
750 | PrimitiveNode::Make("double" , Repetition::OPTIONAL, ParquetType::DOUBLE)); |
751 | ArrayFromVector<::arrow::DoubleType, double>(std::vector<double>(), &dict); |
752 | arrow_fields.push_back( |
753 | std::make_shared<Field>("double" , ::arrow::dictionary(::arrow::int8(), dict))); |
754 | |
755 | parquet_fields.push_back(PrimitiveNode::Make( |
756 | "string" , Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, LogicalType::UTF8)); |
757 | ::arrow::StringBuilder string_builder(::arrow::default_memory_pool()); |
758 | ASSERT_OK(string_builder.Finish(&dict)); |
759 | arrow_fields.push_back( |
760 | std::make_shared<Field>("string" , ::arrow::dictionary(::arrow::int8(), dict))); |
761 | |
762 | parquet_fields.push_back(PrimitiveNode::Make( |
763 | "binary" , Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, LogicalType::NONE)); |
764 | ::arrow::BinaryBuilder binary_builder(::arrow::default_memory_pool()); |
765 | ASSERT_OK(binary_builder.Finish(&dict)); |
766 | arrow_fields.push_back( |
767 | std::make_shared<Field>("binary" , ::arrow::dictionary(::arrow::int8(), dict))); |
768 | |
769 | ASSERT_OK(ConvertSchema(arrow_fields)); |
770 | |
771 | ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields)); |
772 | } |
773 | |
774 | TEST_F(TestConvertArrowSchema, ParquetLists) { |
775 | std::vector<NodePtr> parquet_fields; |
776 | std::vector<std::shared_ptr<Field>> arrow_fields; |
777 | |
778 | // parquet_arrow will always generate 3-level LIST encodings |
779 | |
780 | // // List<String> (list non-null, elements nullable) |
781 | // required group my_list (LIST) { |
782 | // repeated group list { |
783 | // optional binary element (UTF8); |
784 | // } |
785 | // } |
786 | { |
787 | auto element = PrimitiveNode::Make("string" , Repetition::OPTIONAL, |
788 | ParquetType::BYTE_ARRAY, LogicalType::UTF8); |
789 | auto list = GroupNode::Make("list" , Repetition::REPEATED, {element}); |
790 | parquet_fields.push_back( |
791 | GroupNode::Make("my_list" , Repetition::REQUIRED, {list}, LogicalType::LIST)); |
792 | auto arrow_element = std::make_shared<Field>("string" , UTF8, true); |
793 | auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element); |
794 | arrow_fields.push_back(std::make_shared<Field>("my_list" , arrow_list, false)); |
795 | } |
796 | |
797 | // // List<String> (list nullable, elements non-null) |
798 | // optional group my_list (LIST) { |
799 | // repeated group list { |
800 | // required binary element (UTF8); |
801 | // } |
802 | // } |
803 | { |
804 | auto element = PrimitiveNode::Make("string" , Repetition::REQUIRED, |
805 | ParquetType::BYTE_ARRAY, LogicalType::UTF8); |
806 | auto list = GroupNode::Make("list" , Repetition::REPEATED, {element}); |
807 | parquet_fields.push_back( |
808 | GroupNode::Make("my_list" , Repetition::OPTIONAL, {list}, LogicalType::LIST)); |
809 | auto arrow_element = std::make_shared<Field>("string" , UTF8, false); |
810 | auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element); |
811 | arrow_fields.push_back(std::make_shared<Field>("my_list" , arrow_list, true)); |
812 | } |
813 | |
814 | ASSERT_OK(ConvertSchema(arrow_fields)); |
815 | |
816 | ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields)); |
817 | } |
818 | |
819 | TEST_F(TestConvertArrowSchema, UnsupportedTypes) { |
820 | std::vector<std::shared_ptr<Field>> unsupported_fields = { |
821 | ::arrow::field("f0" , ::arrow::time64(TimeUnit::NANO))}; |
822 | |
823 | for (const auto& field : unsupported_fields) { |
824 | ASSERT_RAISES(NotImplemented, ConvertSchema({field})); |
825 | } |
826 | } |
827 | |
828 | TEST_F(TestConvertArrowSchema, ParquetFlatDecimals) { |
829 | std::vector<NodePtr> parquet_fields; |
830 | std::vector<std::shared_ptr<Field>> arrow_fields; |
831 | |
832 | // TODO: Test Decimal Arrow -> Parquet conversion |
833 | |
834 | ASSERT_OK(ConvertSchema(arrow_fields)); |
835 | |
836 | ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields)); |
837 | } |
838 | |
839 | TEST(InvalidSchema, ParquetNegativeDecimalScale) { |
840 | const auto& type = ::arrow::decimal(23, -2); |
841 | const auto& field = ::arrow::field("f0" , type); |
842 | const auto& arrow_schema = ::arrow::schema({field}); |
843 | std::shared_ptr<::parquet::WriterProperties> properties = |
844 | ::parquet::default_writer_properties(); |
845 | std::shared_ptr<SchemaDescriptor> result_schema; |
846 | |
847 | ASSERT_RAISES(IOError, |
848 | ToParquetSchema(arrow_schema.get(), *properties.get(), &result_schema)); |
849 | } |
850 | |
851 | } // namespace arrow |
852 | } // namespace parquet |
853 | |