1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
#include <gtest/gtest.h>

#include <cstdlib>
#include <iosfwd>
#include <memory>
#include <sstream>
#include <string>
#include <vector>

#include "parquet/exception.h"
#include "parquet/schema-internal.h"
#include "parquet/schema.h"
#include "parquet/thrift.h"
#include "parquet/types.h"
31 | |
32 | using std::string; |
33 | using std::vector; |
34 | |
35 | namespace parquet { |
36 | |
37 | using format::ConvertedType; |
38 | using format::FieldRepetitionType; |
39 | using format::SchemaElement; |
40 | |
41 | namespace schema { |
42 | |
43 | static inline SchemaElement NewPrimitive(const std::string& name, |
44 | FieldRepetitionType::type repetition, |
45 | Type::type type, int id = 0) { |
46 | SchemaElement result; |
47 | result.__set_name(name); |
48 | result.__set_repetition_type(repetition); |
49 | result.__set_type(static_cast<format::Type::type>(type)); |
50 | |
51 | return result; |
52 | } |
53 | |
54 | static inline SchemaElement NewGroup(const std::string& name, |
55 | FieldRepetitionType::type repetition, |
56 | int num_children, int id = 0) { |
57 | SchemaElement result; |
58 | result.__set_name(name); |
59 | result.__set_repetition_type(repetition); |
60 | result.__set_num_children(num_children); |
61 | |
62 | return result; |
63 | } |
64 | |
65 | // ---------------------------------------------------------------------- |
66 | // ColumnPath |
67 | |
68 | TEST(TestColumnPath, TestAttrs) { |
69 | ColumnPath path(std::vector<std::string>({"toplevel" , "leaf" })); |
70 | |
71 | ASSERT_EQ(path.ToDotString(), "toplevel.leaf" ); |
72 | |
73 | std::shared_ptr<ColumnPath> path_ptr = ColumnPath::FromDotString("toplevel.leaf" ); |
74 | ASSERT_EQ(path_ptr->ToDotString(), "toplevel.leaf" ); |
75 | |
76 | std::shared_ptr<ColumnPath> extended = path_ptr->extend("anotherlevel" ); |
77 | ASSERT_EQ(extended->ToDotString(), "toplevel.leaf.anotherlevel" ); |
78 | } |
79 | |
80 | // ---------------------------------------------------------------------- |
81 | // Primitive node |
82 | |
83 | class TestPrimitiveNode : public ::testing::Test { |
84 | public: |
85 | void SetUp() { |
86 | name_ = "name" ; |
87 | id_ = 5; |
88 | } |
89 | |
90 | void Convert(const format::SchemaElement* element) { |
91 | node_ = PrimitiveNode::FromParquet(element, id_); |
92 | ASSERT_TRUE(node_->is_primitive()); |
93 | prim_node_ = static_cast<const PrimitiveNode*>(node_.get()); |
94 | } |
95 | |
96 | protected: |
97 | std::string name_; |
98 | const PrimitiveNode* prim_node_; |
99 | |
100 | int id_; |
101 | std::unique_ptr<Node> node_; |
102 | }; |
103 | |
104 | TEST_F(TestPrimitiveNode, Attrs) { |
105 | PrimitiveNode node1("foo" , Repetition::REPEATED, Type::INT32); |
106 | |
107 | PrimitiveNode node2("bar" , Repetition::OPTIONAL, Type::BYTE_ARRAY, LogicalType::UTF8); |
108 | |
109 | ASSERT_EQ("foo" , node1.name()); |
110 | |
111 | ASSERT_TRUE(node1.is_primitive()); |
112 | ASSERT_FALSE(node1.is_group()); |
113 | |
114 | ASSERT_EQ(Repetition::REPEATED, node1.repetition()); |
115 | ASSERT_EQ(Repetition::OPTIONAL, node2.repetition()); |
116 | |
117 | ASSERT_EQ(Node::PRIMITIVE, node1.node_type()); |
118 | |
119 | ASSERT_EQ(Type::INT32, node1.physical_type()); |
120 | ASSERT_EQ(Type::BYTE_ARRAY, node2.physical_type()); |
121 | |
122 | // logical types |
123 | ASSERT_EQ(LogicalType::NONE, node1.logical_type()); |
124 | ASSERT_EQ(LogicalType::UTF8, node2.logical_type()); |
125 | |
126 | // repetition |
127 | PrimitiveNode node3("foo" , Repetition::REPEATED, Type::INT32); |
128 | PrimitiveNode node4("foo" , Repetition::REQUIRED, Type::INT32); |
129 | PrimitiveNode node5("foo" , Repetition::OPTIONAL, Type::INT32); |
130 | |
131 | ASSERT_TRUE(node3.is_repeated()); |
132 | ASSERT_FALSE(node3.is_optional()); |
133 | |
134 | ASSERT_TRUE(node4.is_required()); |
135 | |
136 | ASSERT_TRUE(node5.is_optional()); |
137 | ASSERT_FALSE(node5.is_required()); |
138 | } |
139 | |
140 | TEST_F(TestPrimitiveNode, FromParquet) { |
141 | SchemaElement elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL, Type::INT32, 0); |
142 | ASSERT_NO_FATAL_FAILURE(Convert(&elt)); |
143 | ASSERT_EQ(name_, prim_node_->name()); |
144 | ASSERT_EQ(id_, prim_node_->id()); |
145 | ASSERT_EQ(Repetition::OPTIONAL, prim_node_->repetition()); |
146 | ASSERT_EQ(Type::INT32, prim_node_->physical_type()); |
147 | ASSERT_EQ(LogicalType::NONE, prim_node_->logical_type()); |
148 | |
149 | // Test a logical type |
150 | elt = NewPrimitive(name_, FieldRepetitionType::REQUIRED, Type::BYTE_ARRAY, 0); |
151 | elt.__set_converted_type(ConvertedType::UTF8); |
152 | |
153 | ASSERT_NO_FATAL_FAILURE(Convert(&elt)); |
154 | ASSERT_EQ(Repetition::REQUIRED, prim_node_->repetition()); |
155 | ASSERT_EQ(Type::BYTE_ARRAY, prim_node_->physical_type()); |
156 | ASSERT_EQ(LogicalType::UTF8, prim_node_->logical_type()); |
157 | |
158 | // FIXED_LEN_BYTE_ARRAY |
159 | elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL, Type::FIXED_LEN_BYTE_ARRAY, 0); |
160 | elt.__set_type_length(16); |
161 | |
162 | ASSERT_NO_FATAL_FAILURE(Convert(&elt)); |
163 | ASSERT_EQ(name_, prim_node_->name()); |
164 | ASSERT_EQ(id_, prim_node_->id()); |
165 | ASSERT_EQ(Repetition::OPTIONAL, prim_node_->repetition()); |
166 | ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, prim_node_->physical_type()); |
167 | ASSERT_EQ(16, prim_node_->type_length()); |
168 | |
169 | // ConvertedType::Decimal |
170 | elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL, Type::FIXED_LEN_BYTE_ARRAY, 0); |
171 | elt.__set_converted_type(ConvertedType::DECIMAL); |
172 | elt.__set_type_length(6); |
173 | elt.__set_scale(2); |
174 | elt.__set_precision(12); |
175 | |
176 | ASSERT_NO_FATAL_FAILURE(Convert(&elt)); |
177 | ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, prim_node_->physical_type()); |
178 | ASSERT_EQ(LogicalType::DECIMAL, prim_node_->logical_type()); |
179 | ASSERT_EQ(6, prim_node_->type_length()); |
180 | ASSERT_EQ(2, prim_node_->decimal_metadata().scale); |
181 | ASSERT_EQ(12, prim_node_->decimal_metadata().precision); |
182 | } |
183 | |
184 | TEST_F(TestPrimitiveNode, Equals) { |
185 | PrimitiveNode node1("foo" , Repetition::REQUIRED, Type::INT32); |
186 | PrimitiveNode node2("foo" , Repetition::REQUIRED, Type::INT64); |
187 | PrimitiveNode node3("bar" , Repetition::REQUIRED, Type::INT32); |
188 | PrimitiveNode node4("foo" , Repetition::OPTIONAL, Type::INT32); |
189 | PrimitiveNode node5("foo" , Repetition::REQUIRED, Type::INT32); |
190 | |
191 | ASSERT_TRUE(node1.Equals(&node1)); |
192 | ASSERT_FALSE(node1.Equals(&node2)); |
193 | ASSERT_FALSE(node1.Equals(&node3)); |
194 | ASSERT_FALSE(node1.Equals(&node4)); |
195 | ASSERT_TRUE(node1.Equals(&node5)); |
196 | |
197 | PrimitiveNode flba1("foo" , Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, |
198 | LogicalType::DECIMAL, 12, 4, 2); |
199 | |
200 | PrimitiveNode flba2("foo" , Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, |
201 | LogicalType::DECIMAL, 1, 4, 2); |
202 | flba2.SetTypeLength(12); |
203 | |
204 | PrimitiveNode flba3("foo" , Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, |
205 | LogicalType::DECIMAL, 1, 4, 2); |
206 | flba3.SetTypeLength(16); |
207 | |
208 | PrimitiveNode flba4("foo" , Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, |
209 | LogicalType::DECIMAL, 12, 4, 0); |
210 | |
211 | PrimitiveNode flba5("foo" , Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, |
212 | LogicalType::NONE, 12, 4, 0); |
213 | |
214 | ASSERT_TRUE(flba1.Equals(&flba2)); |
215 | ASSERT_FALSE(flba1.Equals(&flba3)); |
216 | ASSERT_FALSE(flba1.Equals(&flba4)); |
217 | ASSERT_FALSE(flba1.Equals(&flba5)); |
218 | } |
219 | |
220 | TEST_F(TestPrimitiveNode, PhysicalLogicalMapping) { |
221 | ASSERT_NO_THROW( |
222 | PrimitiveNode::Make("foo" , Repetition::REQUIRED, Type::INT32, LogicalType::INT_32)); |
223 | ASSERT_NO_THROW(PrimitiveNode::Make("foo" , Repetition::REQUIRED, Type::BYTE_ARRAY, |
224 | LogicalType::JSON)); |
225 | ASSERT_THROW( |
226 | PrimitiveNode::Make("foo" , Repetition::REQUIRED, Type::INT32, LogicalType::JSON), |
227 | ParquetException); |
228 | ASSERT_NO_THROW(PrimitiveNode::Make("foo" , Repetition::REQUIRED, Type::INT64, |
229 | LogicalType::TIMESTAMP_MILLIS)); |
230 | ASSERT_THROW( |
231 | PrimitiveNode::Make("foo" , Repetition::REQUIRED, Type::INT32, LogicalType::INT_64), |
232 | ParquetException); |
233 | ASSERT_THROW(PrimitiveNode::Make("foo" , Repetition::REQUIRED, Type::BYTE_ARRAY, |
234 | LogicalType::INT_8), |
235 | ParquetException); |
236 | ASSERT_THROW(PrimitiveNode::Make("foo" , Repetition::REQUIRED, Type::BYTE_ARRAY, |
237 | LogicalType::INTERVAL), |
238 | ParquetException); |
239 | ASSERT_THROW(PrimitiveNode::Make("foo" , Repetition::REQUIRED, |
240 | Type::FIXED_LEN_BYTE_ARRAY, LogicalType::ENUM), |
241 | ParquetException); |
242 | ASSERT_NO_THROW(PrimitiveNode::Make("foo" , Repetition::REQUIRED, Type::BYTE_ARRAY, |
243 | LogicalType::ENUM)); |
244 | ASSERT_THROW( |
245 | PrimitiveNode::Make("foo" , Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, |
246 | LogicalType::DECIMAL, 0, 2, 4), |
247 | ParquetException); |
248 | ASSERT_THROW(PrimitiveNode::Make("foo" , Repetition::REQUIRED, Type::FLOAT, |
249 | LogicalType::DECIMAL, 0, 2, 4), |
250 | ParquetException); |
251 | ASSERT_THROW( |
252 | PrimitiveNode::Make("foo" , Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, |
253 | LogicalType::DECIMAL, 0, 4, 0), |
254 | ParquetException); |
255 | ASSERT_THROW( |
256 | PrimitiveNode::Make("foo" , Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, |
257 | LogicalType::DECIMAL, 10, 0, 4), |
258 | ParquetException); |
259 | ASSERT_THROW( |
260 | PrimitiveNode::Make("foo" , Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, |
261 | LogicalType::DECIMAL, 10, 4, -1), |
262 | ParquetException); |
263 | ASSERT_THROW( |
264 | PrimitiveNode::Make("foo" , Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, |
265 | LogicalType::DECIMAL, 10, 2, 4), |
266 | ParquetException); |
267 | ASSERT_NO_THROW(PrimitiveNode::Make("foo" , Repetition::REQUIRED, |
268 | Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, |
269 | 10, 6, 4)); |
270 | ASSERT_NO_THROW(PrimitiveNode::Make("foo" , Repetition::REQUIRED, |
271 | Type::FIXED_LEN_BYTE_ARRAY, LogicalType::INTERVAL, |
272 | 12)); |
273 | ASSERT_THROW(PrimitiveNode::Make("foo" , Repetition::REQUIRED, |
274 | Type::FIXED_LEN_BYTE_ARRAY, LogicalType::INTERVAL, 10), |
275 | ParquetException); |
276 | } |
277 | |
278 | // ---------------------------------------------------------------------- |
279 | // Group node |
280 | |
281 | class TestGroupNode : public ::testing::Test { |
282 | public: |
283 | NodeVector Fields1() { |
284 | NodeVector fields; |
285 | |
286 | fields.push_back(Int32("one" , Repetition::REQUIRED)); |
287 | fields.push_back(Int64("two" )); |
288 | fields.push_back(Double("three" )); |
289 | |
290 | return fields; |
291 | } |
292 | |
293 | NodeVector Fields2() { |
294 | // Fields with a duplicate name |
295 | NodeVector fields; |
296 | |
297 | fields.push_back(Int32("duplicate" , Repetition::REQUIRED)); |
298 | fields.push_back(Int64("unique" )); |
299 | fields.push_back(Double("duplicate" )); |
300 | |
301 | return fields; |
302 | } |
303 | }; |
304 | |
305 | TEST_F(TestGroupNode, Attrs) { |
306 | NodeVector fields = Fields1(); |
307 | |
308 | GroupNode node1("foo" , Repetition::REPEATED, fields); |
309 | GroupNode node2("bar" , Repetition::OPTIONAL, fields, LogicalType::LIST); |
310 | |
311 | ASSERT_EQ("foo" , node1.name()); |
312 | |
313 | ASSERT_TRUE(node1.is_group()); |
314 | ASSERT_FALSE(node1.is_primitive()); |
315 | |
316 | ASSERT_EQ(fields.size(), node1.field_count()); |
317 | |
318 | ASSERT_TRUE(node1.is_repeated()); |
319 | ASSERT_TRUE(node2.is_optional()); |
320 | |
321 | ASSERT_EQ(Repetition::REPEATED, node1.repetition()); |
322 | ASSERT_EQ(Repetition::OPTIONAL, node2.repetition()); |
323 | |
324 | ASSERT_EQ(Node::GROUP, node1.node_type()); |
325 | |
326 | // logical types |
327 | ASSERT_EQ(LogicalType::NONE, node1.logical_type()); |
328 | ASSERT_EQ(LogicalType::LIST, node2.logical_type()); |
329 | } |
330 | |
331 | TEST_F(TestGroupNode, Equals) { |
332 | NodeVector f1 = Fields1(); |
333 | NodeVector f2 = Fields1(); |
334 | |
335 | GroupNode group1("group" , Repetition::REPEATED, f1); |
336 | GroupNode group2("group" , Repetition::REPEATED, f2); |
337 | GroupNode group3("group2" , Repetition::REPEATED, f2); |
338 | |
339 | // This is copied in the GroupNode ctor, so this is okay |
340 | f2.push_back(Float("four" , Repetition::OPTIONAL)); |
341 | GroupNode group4("group" , Repetition::REPEATED, f2); |
342 | GroupNode group5("group" , Repetition::REPEATED, Fields1()); |
343 | |
344 | ASSERT_TRUE(group1.Equals(&group1)); |
345 | ASSERT_TRUE(group1.Equals(&group2)); |
346 | ASSERT_FALSE(group1.Equals(&group3)); |
347 | |
348 | ASSERT_FALSE(group1.Equals(&group4)); |
349 | ASSERT_FALSE(group5.Equals(&group4)); |
350 | } |
351 | |
352 | TEST_F(TestGroupNode, FieldIndex) { |
353 | NodeVector fields = Fields1(); |
354 | GroupNode group("group" , Repetition::REQUIRED, fields); |
355 | for (size_t i = 0; i < fields.size(); i++) { |
356 | auto field = group.field(static_cast<int>(i)); |
357 | ASSERT_EQ(i, group.FieldIndex(*field)); |
358 | } |
359 | |
360 | // Test a non field node |
361 | auto non_field_alien = Int32("alien" , Repetition::REQUIRED); // other name |
362 | auto non_field_familiar = Int32("one" , Repetition::REPEATED); // other node |
363 | ASSERT_LT(group.FieldIndex(*non_field_alien), 0); |
364 | ASSERT_LT(group.FieldIndex(*non_field_familiar), 0); |
365 | } |
366 | |
367 | TEST_F(TestGroupNode, FieldIndexDuplicateName) { |
368 | NodeVector fields = Fields2(); |
369 | GroupNode group("group" , Repetition::REQUIRED, fields); |
370 | for (size_t i = 0; i < fields.size(); i++) { |
371 | auto field = group.field(static_cast<int>(i)); |
372 | ASSERT_EQ(i, group.FieldIndex(*field)); |
373 | } |
374 | } |
375 | |
376 | // ---------------------------------------------------------------------- |
377 | // Test convert group |
378 | |
379 | class TestSchemaConverter : public ::testing::Test { |
380 | public: |
381 | void setUp() { name_ = "parquet_schema" ; } |
382 | |
383 | void Convert(const parquet::format::SchemaElement* elements, int length) { |
384 | FlatSchemaConverter converter(elements, length); |
385 | node_ = converter.Convert(); |
386 | ASSERT_TRUE(node_->is_group()); |
387 | group_ = static_cast<const GroupNode*>(node_.get()); |
388 | } |
389 | |
390 | protected: |
391 | std::string name_; |
392 | const GroupNode* group_; |
393 | std::unique_ptr<Node> node_; |
394 | }; |
395 | |
396 | bool check_for_parent_consistency(const GroupNode* node) { |
397 | // Each node should have the group as parent |
398 | for (int i = 0; i < node->field_count(); i++) { |
399 | const NodePtr& field = node->field(i); |
400 | if (field->parent() != node) { |
401 | return false; |
402 | } |
403 | if (field->is_group()) { |
404 | const GroupNode* group = static_cast<GroupNode*>(field.get()); |
405 | if (!check_for_parent_consistency(group)) { |
406 | return false; |
407 | } |
408 | } |
409 | } |
410 | return true; |
411 | } |
412 | |
413 | TEST_F(TestSchemaConverter, NestedExample) { |
414 | SchemaElement elt; |
415 | std::vector<SchemaElement> elements; |
416 | elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0)); |
417 | |
418 | // A primitive one |
419 | elements.push_back(NewPrimitive("a" , FieldRepetitionType::REQUIRED, Type::INT32, 1)); |
420 | |
421 | // A group |
422 | elements.push_back(NewGroup("bag" , FieldRepetitionType::OPTIONAL, 1, 2)); |
423 | |
424 | // 3-level list encoding, by hand |
425 | elt = NewGroup("b" , FieldRepetitionType::REPEATED, 1, 3); |
426 | elt.__set_converted_type(ConvertedType::LIST); |
427 | elements.push_back(elt); |
428 | elements.push_back(NewPrimitive("item" , FieldRepetitionType::OPTIONAL, Type::INT64, 4)); |
429 | |
430 | ASSERT_NO_FATAL_FAILURE(Convert(&elements[0], static_cast<int>(elements.size()))); |
431 | |
432 | // Construct the expected schema |
433 | NodeVector fields; |
434 | fields.push_back(Int32("a" , Repetition::REQUIRED)); |
435 | |
436 | // 3-level list encoding |
437 | NodePtr item = Int64("item" ); |
438 | NodePtr list(GroupNode::Make("b" , Repetition::REPEATED, {item}, LogicalType::LIST)); |
439 | NodePtr bag(GroupNode::Make("bag" , Repetition::OPTIONAL, {list})); |
440 | fields.push_back(bag); |
441 | |
442 | NodePtr schema = GroupNode::Make(name_, Repetition::REPEATED, fields); |
443 | |
444 | ASSERT_TRUE(schema->Equals(group_)); |
445 | |
446 | // Check that the parent relationship in each node is consitent |
447 | ASSERT_EQ(group_->parent(), nullptr); |
448 | ASSERT_TRUE(check_for_parent_consistency(group_)); |
449 | } |
450 | |
451 | TEST_F(TestSchemaConverter, InvalidRoot) { |
452 | // According to the Parquet specification, the first element in the |
453 | // list<SchemaElement> is a group whose children (and their descendants) |
454 | // contain all of the rest of the flattened schema elements. If the first |
455 | // element is not a group, it is a malformed Parquet file. |
456 | |
457 | SchemaElement elements[2]; |
458 | elements[0] = |
459 | NewPrimitive("not-a-group" , FieldRepetitionType::REQUIRED, Type::INT32, 0); |
460 | ASSERT_THROW(Convert(elements, 2), ParquetException); |
461 | |
462 | // While the Parquet spec indicates that the root group should have REPEATED |
463 | // repetition type, some implementations may return REQUIRED or OPTIONAL |
464 | // groups as the first element. These tests check that this is okay as a |
465 | // practicality matter. |
466 | elements[0] = NewGroup("not-repeated" , FieldRepetitionType::REQUIRED, 1, 0); |
467 | elements[1] = NewPrimitive("a" , FieldRepetitionType::REQUIRED, Type::INT32, 1); |
468 | ASSERT_NO_FATAL_FAILURE(Convert(elements, 2)); |
469 | |
470 | elements[0] = NewGroup("not-repeated" , FieldRepetitionType::OPTIONAL, 1, 0); |
471 | ASSERT_NO_FATAL_FAILURE(Convert(elements, 2)); |
472 | } |
473 | |
474 | TEST_F(TestSchemaConverter, NotEnoughChildren) { |
475 | // Throw a ParquetException, but don't core dump or anything |
476 | SchemaElement elt; |
477 | std::vector<SchemaElement> elements; |
478 | elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0)); |
479 | ASSERT_THROW(Convert(&elements[0], 1), ParquetException); |
480 | } |
481 | |
482 | // ---------------------------------------------------------------------- |
483 | // Schema tree flatten / unflatten |
484 | |
485 | class TestSchemaFlatten : public ::testing::Test { |
486 | public: |
487 | void setUp() { name_ = "parquet_schema" ; } |
488 | |
489 | void Flatten(const GroupNode* schema) { ToParquet(schema, &elements_); } |
490 | |
491 | protected: |
492 | std::string name_; |
493 | std::vector<format::SchemaElement> elements_; |
494 | }; |
495 | |
496 | TEST_F(TestSchemaFlatten, DecimalMetadata) { |
497 | // Checks that DecimalMetadata is only set for DecimalTypes |
498 | NodePtr node = PrimitiveNode::Make("decimal" , Repetition::REQUIRED, Type::INT64, |
499 | LogicalType::DECIMAL, -1, 8, 4); |
500 | NodePtr group = |
501 | GroupNode::Make("group" , Repetition::REPEATED, {node}, LogicalType::LIST); |
502 | Flatten(reinterpret_cast<GroupNode*>(group.get())); |
503 | ASSERT_EQ("decimal" , elements_[1].name); |
504 | ASSERT_TRUE(elements_[1].__isset.precision); |
505 | ASSERT_TRUE(elements_[1].__isset.scale); |
506 | |
507 | elements_.clear(); |
508 | // Not for integers with no logical type |
509 | group = |
510 | GroupNode::Make("group" , Repetition::REPEATED, {Int64("int64" )}, LogicalType::LIST); |
511 | Flatten(reinterpret_cast<GroupNode*>(group.get())); |
512 | ASSERT_EQ("int64" , elements_[1].name); |
513 | ASSERT_FALSE(elements_[0].__isset.precision); |
514 | ASSERT_FALSE(elements_[0].__isset.scale); |
515 | } |
516 | |
517 | TEST_F(TestSchemaFlatten, NestedExample) { |
518 | SchemaElement elt; |
519 | std::vector<SchemaElement> elements; |
520 | elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0)); |
521 | |
522 | // A primitive one |
523 | elements.push_back(NewPrimitive("a" , FieldRepetitionType::REQUIRED, Type::INT32, 1)); |
524 | |
525 | // A group |
526 | elements.push_back(NewGroup("bag" , FieldRepetitionType::OPTIONAL, 1, 2)); |
527 | |
528 | // 3-level list encoding, by hand |
529 | elt = NewGroup("b" , FieldRepetitionType::REPEATED, 1, 3); |
530 | elt.__set_converted_type(ConvertedType::LIST); |
531 | elements.push_back(elt); |
532 | elements.push_back(NewPrimitive("item" , FieldRepetitionType::OPTIONAL, Type::INT64, 4)); |
533 | |
534 | // Construct the schema |
535 | NodeVector fields; |
536 | fields.push_back(Int32("a" , Repetition::REQUIRED)); |
537 | |
538 | // 3-level list encoding |
539 | NodePtr item = Int64("item" ); |
540 | NodePtr list(GroupNode::Make("b" , Repetition::REPEATED, {item}, LogicalType::LIST)); |
541 | NodePtr bag(GroupNode::Make("bag" , Repetition::OPTIONAL, {list})); |
542 | fields.push_back(bag); |
543 | |
544 | NodePtr schema = GroupNode::Make(name_, Repetition::REPEATED, fields); |
545 | |
546 | Flatten(static_cast<GroupNode*>(schema.get())); |
547 | ASSERT_EQ(elements_.size(), elements.size()); |
548 | for (size_t i = 0; i < elements_.size(); i++) { |
549 | ASSERT_EQ(elements_[i], elements[i]); |
550 | } |
551 | } |
552 | |
553 | TEST(TestColumnDescriptor, TestAttrs) { |
554 | NodePtr node = PrimitiveNode::Make("name" , Repetition::OPTIONAL, Type::BYTE_ARRAY, |
555 | LogicalType::UTF8); |
556 | ColumnDescriptor descr(node, 4, 1); |
557 | |
558 | ASSERT_EQ("name" , descr.name()); |
559 | ASSERT_EQ(4, descr.max_definition_level()); |
560 | ASSERT_EQ(1, descr.max_repetition_level()); |
561 | |
562 | ASSERT_EQ(Type::BYTE_ARRAY, descr.physical_type()); |
563 | |
564 | ASSERT_EQ(-1, descr.type_length()); |
565 | |
566 | // Test FIXED_LEN_BYTE_ARRAY |
567 | node = PrimitiveNode::Make("name" , Repetition::OPTIONAL, Type::FIXED_LEN_BYTE_ARRAY, |
568 | LogicalType::DECIMAL, 12, 10, 4); |
569 | descr = ColumnDescriptor(node, 4, 1); |
570 | |
571 | ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, descr.physical_type()); |
572 | ASSERT_EQ(12, descr.type_length()); |
573 | } |
574 | |
575 | class TestSchemaDescriptor : public ::testing::Test { |
576 | public: |
577 | void setUp() {} |
578 | |
579 | protected: |
580 | SchemaDescriptor descr_; |
581 | }; |
582 | |
583 | TEST_F(TestSchemaDescriptor, InitNonGroup) { |
584 | NodePtr node = PrimitiveNode::Make("field" , Repetition::OPTIONAL, Type::INT32); |
585 | |
586 | ASSERT_THROW(descr_.Init(node), ParquetException); |
587 | } |
588 | |
589 | TEST_F(TestSchemaDescriptor, Equals) { |
590 | NodePtr schema; |
591 | |
592 | NodePtr inta = Int32("a" , Repetition::REQUIRED); |
593 | NodePtr intb = Int64("b" , Repetition::OPTIONAL); |
594 | NodePtr intb2 = Int64("b2" , Repetition::OPTIONAL); |
595 | NodePtr intc = ByteArray("c" , Repetition::REPEATED); |
596 | |
597 | NodePtr item1 = Int64("item1" , Repetition::REQUIRED); |
598 | NodePtr item2 = Boolean("item2" , Repetition::OPTIONAL); |
599 | NodePtr item3 = Int32("item3" , Repetition::REPEATED); |
600 | NodePtr list(GroupNode::Make("records" , Repetition::REPEATED, {item1, item2, item3}, |
601 | LogicalType::LIST)); |
602 | |
603 | NodePtr bag(GroupNode::Make("bag" , Repetition::OPTIONAL, {list})); |
604 | NodePtr bag2(GroupNode::Make("bag" , Repetition::REQUIRED, {list})); |
605 | |
606 | SchemaDescriptor descr1; |
607 | descr1.Init(GroupNode::Make("schema" , Repetition::REPEATED, {inta, intb, intc, bag})); |
608 | |
609 | ASSERT_TRUE(descr1.Equals(descr1)); |
610 | |
611 | SchemaDescriptor descr2; |
612 | descr2.Init(GroupNode::Make("schema" , Repetition::REPEATED, {inta, intb, intc, bag2})); |
613 | ASSERT_FALSE(descr1.Equals(descr2)); |
614 | |
615 | SchemaDescriptor descr3; |
616 | descr3.Init(GroupNode::Make("schema" , Repetition::REPEATED, {inta, intb2, intc, bag})); |
617 | ASSERT_FALSE(descr1.Equals(descr3)); |
618 | |
619 | // Robust to name of parent node |
620 | SchemaDescriptor descr4; |
621 | descr4.Init(GroupNode::Make("SCHEMA" , Repetition::REPEATED, {inta, intb, intc, bag})); |
622 | ASSERT_TRUE(descr1.Equals(descr4)); |
623 | |
624 | SchemaDescriptor descr5; |
625 | descr5.Init( |
626 | GroupNode::Make("schema" , Repetition::REPEATED, {inta, intb, intc, bag, intb2})); |
627 | ASSERT_FALSE(descr1.Equals(descr5)); |
628 | |
629 | // Different max repetition / definition levels |
630 | ColumnDescriptor col1(inta, 5, 1); |
631 | ColumnDescriptor col2(inta, 6, 1); |
632 | ColumnDescriptor col3(inta, 5, 2); |
633 | |
634 | ASSERT_TRUE(col1.Equals(col1)); |
635 | ASSERT_FALSE(col1.Equals(col2)); |
636 | ASSERT_FALSE(col1.Equals(col3)); |
637 | } |
638 | |
639 | TEST_F(TestSchemaDescriptor, BuildTree) { |
640 | NodeVector fields; |
641 | NodePtr schema; |
642 | |
643 | NodePtr inta = Int32("a" , Repetition::REQUIRED); |
644 | fields.push_back(inta); |
645 | fields.push_back(Int64("b" , Repetition::OPTIONAL)); |
646 | fields.push_back(ByteArray("c" , Repetition::REPEATED)); |
647 | |
648 | // 3-level list encoding |
649 | NodePtr item1 = Int64("item1" , Repetition::REQUIRED); |
650 | NodePtr item2 = Boolean("item2" , Repetition::OPTIONAL); |
651 | NodePtr item3 = Int32("item3" , Repetition::REPEATED); |
652 | NodePtr list(GroupNode::Make("records" , Repetition::REPEATED, {item1, item2, item3}, |
653 | LogicalType::LIST)); |
654 | NodePtr bag(GroupNode::Make("bag" , Repetition::OPTIONAL, {list})); |
655 | fields.push_back(bag); |
656 | |
657 | schema = GroupNode::Make("schema" , Repetition::REPEATED, fields); |
658 | |
659 | descr_.Init(schema); |
660 | |
661 | int nleaves = 6; |
662 | |
663 | // 6 leaves |
664 | ASSERT_EQ(nleaves, descr_.num_columns()); |
665 | |
666 | // mdef mrep |
667 | // required int32 a 0 0 |
668 | // optional int64 b 1 0 |
669 | // repeated byte_array c 1 1 |
670 | // optional group bag 1 0 |
671 | // repeated group records 2 1 |
672 | // required int64 item1 2 1 |
673 | // optional boolean item2 3 1 |
674 | // repeated int32 item3 3 2 |
675 | int16_t ex_max_def_levels[6] = {0, 1, 1, 2, 3, 3}; |
676 | int16_t ex_max_rep_levels[6] = {0, 0, 1, 1, 1, 2}; |
677 | |
678 | for (int i = 0; i < nleaves; ++i) { |
679 | const ColumnDescriptor* col = descr_.Column(i); |
680 | EXPECT_EQ(ex_max_def_levels[i], col->max_definition_level()) << i; |
681 | EXPECT_EQ(ex_max_rep_levels[i], col->max_repetition_level()) << i; |
682 | } |
683 | |
684 | ASSERT_EQ(descr_.Column(0)->path()->ToDotString(), "a" ); |
685 | ASSERT_EQ(descr_.Column(1)->path()->ToDotString(), "b" ); |
686 | ASSERT_EQ(descr_.Column(2)->path()->ToDotString(), "c" ); |
687 | ASSERT_EQ(descr_.Column(3)->path()->ToDotString(), "bag.records.item1" ); |
688 | ASSERT_EQ(descr_.Column(4)->path()->ToDotString(), "bag.records.item2" ); |
689 | ASSERT_EQ(descr_.Column(5)->path()->ToDotString(), "bag.records.item3" ); |
690 | |
691 | for (int i = 0; i < nleaves; ++i) { |
692 | auto col = descr_.Column(i); |
693 | ASSERT_EQ(i, descr_.ColumnIndex(*col->schema_node())); |
694 | } |
695 | |
696 | // Test non-column nodes find |
697 | NodePtr non_column_alien = Int32("alien" , Repetition::REQUIRED); // other path |
698 | NodePtr non_column_familiar = Int32("a" , Repetition::REPEATED); // other node |
699 | ASSERT_LT(descr_.ColumnIndex(*non_column_alien), 0); |
700 | ASSERT_LT(descr_.ColumnIndex(*non_column_familiar), 0); |
701 | |
702 | ASSERT_EQ(inta.get(), descr_.GetColumnRoot(0)); |
703 | ASSERT_EQ(bag.get(), descr_.GetColumnRoot(3)); |
704 | ASSERT_EQ(bag.get(), descr_.GetColumnRoot(4)); |
705 | ASSERT_EQ(bag.get(), descr_.GetColumnRoot(5)); |
706 | |
707 | ASSERT_EQ(schema.get(), descr_.group_node()); |
708 | |
709 | // Init clears the leaves |
710 | descr_.Init(schema); |
711 | ASSERT_EQ(nleaves, descr_.num_columns()); |
712 | } |
713 | |
714 | static std::string Print(const NodePtr& node) { |
715 | std::stringstream ss; |
716 | PrintSchema(node.get(), ss); |
717 | return ss.str(); |
718 | } |
719 | |
// Builds a small nested schema (a required int32, an optional group wrapping
// a repeated LIST group with two leaves, and an INT32 column annotated as
// DECIMAL with precision 3 and scale 2), renders it through Print() and
// compares the text against the expected string.
720 | TEST(TestSchemaPrinter, Examples) { |
721 | // Test schema 1 |
722 | NodeVector fields; |
723 | fields.push_back(Int32("a" , Repetition::REQUIRED)); |
724 | |
725 | // 3-level list encoding |
726 | NodePtr item1 = Int64("item1" ); |
727 | NodePtr item2 = Boolean("item2" , Repetition::REQUIRED); |
728 | NodePtr list( |
729 | GroupNode::Make("b" , Repetition::REPEATED, {item1, item2}, LogicalType::LIST)); |
730 | NodePtr bag(GroupNode::Make("bag" , Repetition::OPTIONAL, {list})); |
731 | fields.push_back(bag); |
732 | |
733 | fields.push_back(PrimitiveNode::Make("c" , Repetition::REQUIRED, Type::INT32, |
734 | LogicalType::DECIMAL, -1, 3, 2)); |
735 | |
736 | NodePtr schema = GroupNode::Make("schema" , Repetition::REPEATED, fields); |
737 | |
// NOTE(review): the raw string below is compared verbatim with the printer
// output, so every space and newline inside it is significant. Confirm its
// indentation against PrintSchema before reformatting this block.
738 | std::string result = Print(schema); |
739 | std::string expected = R"(message schema { |
740 | required int32 a; |
741 | optional group bag { |
742 | repeated group b (LIST) { |
743 | optional int64 item1; |
744 | required boolean item2; |
745 | } |
746 | } |
747 | required int32 c (DECIMAL(3,2)); |
748 | } |
749 | )" ; |
750 | ASSERT_EQ(expected, result); |
751 | } |
752 | |
753 | } // namespace schema |
754 | } // namespace parquet |
755 | |