1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
#include <gtest/gtest.h>

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iosfwd>
#include <memory>
#include <sstream>
#include <string>
#include <vector>

#include "parquet/exception.h"
#include "parquet/schema-internal.h"
#include "parquet/schema.h"
#include "parquet/thrift.h"
#include "parquet/types.h"
31
32using std::string;
33using std::vector;
34
35namespace parquet {
36
37using format::ConvertedType;
38using format::FieldRepetitionType;
39using format::SchemaElement;
40
41namespace schema {
42
43static inline SchemaElement NewPrimitive(const std::string& name,
44 FieldRepetitionType::type repetition,
45 Type::type type, int id = 0) {
46 SchemaElement result;
47 result.__set_name(name);
48 result.__set_repetition_type(repetition);
49 result.__set_type(static_cast<format::Type::type>(type));
50
51 return result;
52}
53
54static inline SchemaElement NewGroup(const std::string& name,
55 FieldRepetitionType::type repetition,
56 int num_children, int id = 0) {
57 SchemaElement result;
58 result.__set_name(name);
59 result.__set_repetition_type(repetition);
60 result.__set_num_children(num_children);
61
62 return result;
63}
64
65// ----------------------------------------------------------------------
66// ColumnPath
67
68TEST(TestColumnPath, TestAttrs) {
69 ColumnPath path(std::vector<std::string>({"toplevel", "leaf"}));
70
71 ASSERT_EQ(path.ToDotString(), "toplevel.leaf");
72
73 std::shared_ptr<ColumnPath> path_ptr = ColumnPath::FromDotString("toplevel.leaf");
74 ASSERT_EQ(path_ptr->ToDotString(), "toplevel.leaf");
75
76 std::shared_ptr<ColumnPath> extended = path_ptr->extend("anotherlevel");
77 ASSERT_EQ(extended->ToDotString(), "toplevel.leaf.anotherlevel");
78}
79
80// ----------------------------------------------------------------------
81// Primitive node
82
83class TestPrimitiveNode : public ::testing::Test {
84 public:
85 void SetUp() {
86 name_ = "name";
87 id_ = 5;
88 }
89
90 void Convert(const format::SchemaElement* element) {
91 node_ = PrimitiveNode::FromParquet(element, id_);
92 ASSERT_TRUE(node_->is_primitive());
93 prim_node_ = static_cast<const PrimitiveNode*>(node_.get());
94 }
95
96 protected:
97 std::string name_;
98 const PrimitiveNode* prim_node_;
99
100 int id_;
101 std::unique_ptr<Node> node_;
102};
103
104TEST_F(TestPrimitiveNode, Attrs) {
105 PrimitiveNode node1("foo", Repetition::REPEATED, Type::INT32);
106
107 PrimitiveNode node2("bar", Repetition::OPTIONAL, Type::BYTE_ARRAY, LogicalType::UTF8);
108
109 ASSERT_EQ("foo", node1.name());
110
111 ASSERT_TRUE(node1.is_primitive());
112 ASSERT_FALSE(node1.is_group());
113
114 ASSERT_EQ(Repetition::REPEATED, node1.repetition());
115 ASSERT_EQ(Repetition::OPTIONAL, node2.repetition());
116
117 ASSERT_EQ(Node::PRIMITIVE, node1.node_type());
118
119 ASSERT_EQ(Type::INT32, node1.physical_type());
120 ASSERT_EQ(Type::BYTE_ARRAY, node2.physical_type());
121
122 // logical types
123 ASSERT_EQ(LogicalType::NONE, node1.logical_type());
124 ASSERT_EQ(LogicalType::UTF8, node2.logical_type());
125
126 // repetition
127 PrimitiveNode node3("foo", Repetition::REPEATED, Type::INT32);
128 PrimitiveNode node4("foo", Repetition::REQUIRED, Type::INT32);
129 PrimitiveNode node5("foo", Repetition::OPTIONAL, Type::INT32);
130
131 ASSERT_TRUE(node3.is_repeated());
132 ASSERT_FALSE(node3.is_optional());
133
134 ASSERT_TRUE(node4.is_required());
135
136 ASSERT_TRUE(node5.is_optional());
137 ASSERT_FALSE(node5.is_required());
138}
139
140TEST_F(TestPrimitiveNode, FromParquet) {
141 SchemaElement elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL, Type::INT32, 0);
142 ASSERT_NO_FATAL_FAILURE(Convert(&elt));
143 ASSERT_EQ(name_, prim_node_->name());
144 ASSERT_EQ(id_, prim_node_->id());
145 ASSERT_EQ(Repetition::OPTIONAL, prim_node_->repetition());
146 ASSERT_EQ(Type::INT32, prim_node_->physical_type());
147 ASSERT_EQ(LogicalType::NONE, prim_node_->logical_type());
148
149 // Test a logical type
150 elt = NewPrimitive(name_, FieldRepetitionType::REQUIRED, Type::BYTE_ARRAY, 0);
151 elt.__set_converted_type(ConvertedType::UTF8);
152
153 ASSERT_NO_FATAL_FAILURE(Convert(&elt));
154 ASSERT_EQ(Repetition::REQUIRED, prim_node_->repetition());
155 ASSERT_EQ(Type::BYTE_ARRAY, prim_node_->physical_type());
156 ASSERT_EQ(LogicalType::UTF8, prim_node_->logical_type());
157
158 // FIXED_LEN_BYTE_ARRAY
159 elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL, Type::FIXED_LEN_BYTE_ARRAY, 0);
160 elt.__set_type_length(16);
161
162 ASSERT_NO_FATAL_FAILURE(Convert(&elt));
163 ASSERT_EQ(name_, prim_node_->name());
164 ASSERT_EQ(id_, prim_node_->id());
165 ASSERT_EQ(Repetition::OPTIONAL, prim_node_->repetition());
166 ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, prim_node_->physical_type());
167 ASSERT_EQ(16, prim_node_->type_length());
168
169 // ConvertedType::Decimal
170 elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL, Type::FIXED_LEN_BYTE_ARRAY, 0);
171 elt.__set_converted_type(ConvertedType::DECIMAL);
172 elt.__set_type_length(6);
173 elt.__set_scale(2);
174 elt.__set_precision(12);
175
176 ASSERT_NO_FATAL_FAILURE(Convert(&elt));
177 ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, prim_node_->physical_type());
178 ASSERT_EQ(LogicalType::DECIMAL, prim_node_->logical_type());
179 ASSERT_EQ(6, prim_node_->type_length());
180 ASSERT_EQ(2, prim_node_->decimal_metadata().scale);
181 ASSERT_EQ(12, prim_node_->decimal_metadata().precision);
182}
183
184TEST_F(TestPrimitiveNode, Equals) {
185 PrimitiveNode node1("foo", Repetition::REQUIRED, Type::INT32);
186 PrimitiveNode node2("foo", Repetition::REQUIRED, Type::INT64);
187 PrimitiveNode node3("bar", Repetition::REQUIRED, Type::INT32);
188 PrimitiveNode node4("foo", Repetition::OPTIONAL, Type::INT32);
189 PrimitiveNode node5("foo", Repetition::REQUIRED, Type::INT32);
190
191 ASSERT_TRUE(node1.Equals(&node1));
192 ASSERT_FALSE(node1.Equals(&node2));
193 ASSERT_FALSE(node1.Equals(&node3));
194 ASSERT_FALSE(node1.Equals(&node4));
195 ASSERT_TRUE(node1.Equals(&node5));
196
197 PrimitiveNode flba1("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
198 LogicalType::DECIMAL, 12, 4, 2);
199
200 PrimitiveNode flba2("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
201 LogicalType::DECIMAL, 1, 4, 2);
202 flba2.SetTypeLength(12);
203
204 PrimitiveNode flba3("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
205 LogicalType::DECIMAL, 1, 4, 2);
206 flba3.SetTypeLength(16);
207
208 PrimitiveNode flba4("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
209 LogicalType::DECIMAL, 12, 4, 0);
210
211 PrimitiveNode flba5("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
212 LogicalType::NONE, 12, 4, 0);
213
214 ASSERT_TRUE(flba1.Equals(&flba2));
215 ASSERT_FALSE(flba1.Equals(&flba3));
216 ASSERT_FALSE(flba1.Equals(&flba4));
217 ASSERT_FALSE(flba1.Equals(&flba5));
218}
219
220TEST_F(TestPrimitiveNode, PhysicalLogicalMapping) {
221 ASSERT_NO_THROW(
222 PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::INT32, LogicalType::INT_32));
223 ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::BYTE_ARRAY,
224 LogicalType::JSON));
225 ASSERT_THROW(
226 PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::INT32, LogicalType::JSON),
227 ParquetException);
228 ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::INT64,
229 LogicalType::TIMESTAMP_MILLIS));
230 ASSERT_THROW(
231 PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::INT32, LogicalType::INT_64),
232 ParquetException);
233 ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::BYTE_ARRAY,
234 LogicalType::INT_8),
235 ParquetException);
236 ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::BYTE_ARRAY,
237 LogicalType::INTERVAL),
238 ParquetException);
239 ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
240 Type::FIXED_LEN_BYTE_ARRAY, LogicalType::ENUM),
241 ParquetException);
242 ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::BYTE_ARRAY,
243 LogicalType::ENUM));
244 ASSERT_THROW(
245 PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
246 LogicalType::DECIMAL, 0, 2, 4),
247 ParquetException);
248 ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::FLOAT,
249 LogicalType::DECIMAL, 0, 2, 4),
250 ParquetException);
251 ASSERT_THROW(
252 PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
253 LogicalType::DECIMAL, 0, 4, 0),
254 ParquetException);
255 ASSERT_THROW(
256 PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
257 LogicalType::DECIMAL, 10, 0, 4),
258 ParquetException);
259 ASSERT_THROW(
260 PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
261 LogicalType::DECIMAL, 10, 4, -1),
262 ParquetException);
263 ASSERT_THROW(
264 PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
265 LogicalType::DECIMAL, 10, 2, 4),
266 ParquetException);
267 ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
268 Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL,
269 10, 6, 4));
270 ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
271 Type::FIXED_LEN_BYTE_ARRAY, LogicalType::INTERVAL,
272 12));
273 ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
274 Type::FIXED_LEN_BYTE_ARRAY, LogicalType::INTERVAL, 10),
275 ParquetException);
276}
277
278// ----------------------------------------------------------------------
279// Group node
280
281class TestGroupNode : public ::testing::Test {
282 public:
283 NodeVector Fields1() {
284 NodeVector fields;
285
286 fields.push_back(Int32("one", Repetition::REQUIRED));
287 fields.push_back(Int64("two"));
288 fields.push_back(Double("three"));
289
290 return fields;
291 }
292
293 NodeVector Fields2() {
294 // Fields with a duplicate name
295 NodeVector fields;
296
297 fields.push_back(Int32("duplicate", Repetition::REQUIRED));
298 fields.push_back(Int64("unique"));
299 fields.push_back(Double("duplicate"));
300
301 return fields;
302 }
303};
304
305TEST_F(TestGroupNode, Attrs) {
306 NodeVector fields = Fields1();
307
308 GroupNode node1("foo", Repetition::REPEATED, fields);
309 GroupNode node2("bar", Repetition::OPTIONAL, fields, LogicalType::LIST);
310
311 ASSERT_EQ("foo", node1.name());
312
313 ASSERT_TRUE(node1.is_group());
314 ASSERT_FALSE(node1.is_primitive());
315
316 ASSERT_EQ(fields.size(), node1.field_count());
317
318 ASSERT_TRUE(node1.is_repeated());
319 ASSERT_TRUE(node2.is_optional());
320
321 ASSERT_EQ(Repetition::REPEATED, node1.repetition());
322 ASSERT_EQ(Repetition::OPTIONAL, node2.repetition());
323
324 ASSERT_EQ(Node::GROUP, node1.node_type());
325
326 // logical types
327 ASSERT_EQ(LogicalType::NONE, node1.logical_type());
328 ASSERT_EQ(LogicalType::LIST, node2.logical_type());
329}
330
331TEST_F(TestGroupNode, Equals) {
332 NodeVector f1 = Fields1();
333 NodeVector f2 = Fields1();
334
335 GroupNode group1("group", Repetition::REPEATED, f1);
336 GroupNode group2("group", Repetition::REPEATED, f2);
337 GroupNode group3("group2", Repetition::REPEATED, f2);
338
339 // This is copied in the GroupNode ctor, so this is okay
340 f2.push_back(Float("four", Repetition::OPTIONAL));
341 GroupNode group4("group", Repetition::REPEATED, f2);
342 GroupNode group5("group", Repetition::REPEATED, Fields1());
343
344 ASSERT_TRUE(group1.Equals(&group1));
345 ASSERT_TRUE(group1.Equals(&group2));
346 ASSERT_FALSE(group1.Equals(&group3));
347
348 ASSERT_FALSE(group1.Equals(&group4));
349 ASSERT_FALSE(group5.Equals(&group4));
350}
351
352TEST_F(TestGroupNode, FieldIndex) {
353 NodeVector fields = Fields1();
354 GroupNode group("group", Repetition::REQUIRED, fields);
355 for (size_t i = 0; i < fields.size(); i++) {
356 auto field = group.field(static_cast<int>(i));
357 ASSERT_EQ(i, group.FieldIndex(*field));
358 }
359
360 // Test a non field node
361 auto non_field_alien = Int32("alien", Repetition::REQUIRED); // other name
362 auto non_field_familiar = Int32("one", Repetition::REPEATED); // other node
363 ASSERT_LT(group.FieldIndex(*non_field_alien), 0);
364 ASSERT_LT(group.FieldIndex(*non_field_familiar), 0);
365}
366
367TEST_F(TestGroupNode, FieldIndexDuplicateName) {
368 NodeVector fields = Fields2();
369 GroupNode group("group", Repetition::REQUIRED, fields);
370 for (size_t i = 0; i < fields.size(); i++) {
371 auto field = group.field(static_cast<int>(i));
372 ASSERT_EQ(i, group.FieldIndex(*field));
373 }
374}
375
376// ----------------------------------------------------------------------
377// Test convert group
378
379class TestSchemaConverter : public ::testing::Test {
380 public:
381 void setUp() { name_ = "parquet_schema"; }
382
383 void Convert(const parquet::format::SchemaElement* elements, int length) {
384 FlatSchemaConverter converter(elements, length);
385 node_ = converter.Convert();
386 ASSERT_TRUE(node_->is_group());
387 group_ = static_cast<const GroupNode*>(node_.get());
388 }
389
390 protected:
391 std::string name_;
392 const GroupNode* group_;
393 std::unique_ptr<Node> node_;
394};
395
396bool check_for_parent_consistency(const GroupNode* node) {
397 // Each node should have the group as parent
398 for (int i = 0; i < node->field_count(); i++) {
399 const NodePtr& field = node->field(i);
400 if (field->parent() != node) {
401 return false;
402 }
403 if (field->is_group()) {
404 const GroupNode* group = static_cast<GroupNode*>(field.get());
405 if (!check_for_parent_consistency(group)) {
406 return false;
407 }
408 }
409 }
410 return true;
411}
412
413TEST_F(TestSchemaConverter, NestedExample) {
414 SchemaElement elt;
415 std::vector<SchemaElement> elements;
416 elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0));
417
418 // A primitive one
419 elements.push_back(NewPrimitive("a", FieldRepetitionType::REQUIRED, Type::INT32, 1));
420
421 // A group
422 elements.push_back(NewGroup("bag", FieldRepetitionType::OPTIONAL, 1, 2));
423
424 // 3-level list encoding, by hand
425 elt = NewGroup("b", FieldRepetitionType::REPEATED, 1, 3);
426 elt.__set_converted_type(ConvertedType::LIST);
427 elements.push_back(elt);
428 elements.push_back(NewPrimitive("item", FieldRepetitionType::OPTIONAL, Type::INT64, 4));
429
430 ASSERT_NO_FATAL_FAILURE(Convert(&elements[0], static_cast<int>(elements.size())));
431
432 // Construct the expected schema
433 NodeVector fields;
434 fields.push_back(Int32("a", Repetition::REQUIRED));
435
436 // 3-level list encoding
437 NodePtr item = Int64("item");
438 NodePtr list(GroupNode::Make("b", Repetition::REPEATED, {item}, LogicalType::LIST));
439 NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
440 fields.push_back(bag);
441
442 NodePtr schema = GroupNode::Make(name_, Repetition::REPEATED, fields);
443
444 ASSERT_TRUE(schema->Equals(group_));
445
446 // Check that the parent relationship in each node is consitent
447 ASSERT_EQ(group_->parent(), nullptr);
448 ASSERT_TRUE(check_for_parent_consistency(group_));
449}
450
451TEST_F(TestSchemaConverter, InvalidRoot) {
452 // According to the Parquet specification, the first element in the
453 // list<SchemaElement> is a group whose children (and their descendants)
454 // contain all of the rest of the flattened schema elements. If the first
455 // element is not a group, it is a malformed Parquet file.
456
457 SchemaElement elements[2];
458 elements[0] =
459 NewPrimitive("not-a-group", FieldRepetitionType::REQUIRED, Type::INT32, 0);
460 ASSERT_THROW(Convert(elements, 2), ParquetException);
461
462 // While the Parquet spec indicates that the root group should have REPEATED
463 // repetition type, some implementations may return REQUIRED or OPTIONAL
464 // groups as the first element. These tests check that this is okay as a
465 // practicality matter.
466 elements[0] = NewGroup("not-repeated", FieldRepetitionType::REQUIRED, 1, 0);
467 elements[1] = NewPrimitive("a", FieldRepetitionType::REQUIRED, Type::INT32, 1);
468 ASSERT_NO_FATAL_FAILURE(Convert(elements, 2));
469
470 elements[0] = NewGroup("not-repeated", FieldRepetitionType::OPTIONAL, 1, 0);
471 ASSERT_NO_FATAL_FAILURE(Convert(elements, 2));
472}
473
474TEST_F(TestSchemaConverter, NotEnoughChildren) {
475 // Throw a ParquetException, but don't core dump or anything
476 SchemaElement elt;
477 std::vector<SchemaElement> elements;
478 elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0));
479 ASSERT_THROW(Convert(&elements[0], 1), ParquetException);
480}
481
482// ----------------------------------------------------------------------
483// Schema tree flatten / unflatten
484
485class TestSchemaFlatten : public ::testing::Test {
486 public:
487 void setUp() { name_ = "parquet_schema"; }
488
489 void Flatten(const GroupNode* schema) { ToParquet(schema, &elements_); }
490
491 protected:
492 std::string name_;
493 std::vector<format::SchemaElement> elements_;
494};
495
496TEST_F(TestSchemaFlatten, DecimalMetadata) {
497 // Checks that DecimalMetadata is only set for DecimalTypes
498 NodePtr node = PrimitiveNode::Make("decimal", Repetition::REQUIRED, Type::INT64,
499 LogicalType::DECIMAL, -1, 8, 4);
500 NodePtr group =
501 GroupNode::Make("group", Repetition::REPEATED, {node}, LogicalType::LIST);
502 Flatten(reinterpret_cast<GroupNode*>(group.get()));
503 ASSERT_EQ("decimal", elements_[1].name);
504 ASSERT_TRUE(elements_[1].__isset.precision);
505 ASSERT_TRUE(elements_[1].__isset.scale);
506
507 elements_.clear();
508 // Not for integers with no logical type
509 group =
510 GroupNode::Make("group", Repetition::REPEATED, {Int64("int64")}, LogicalType::LIST);
511 Flatten(reinterpret_cast<GroupNode*>(group.get()));
512 ASSERT_EQ("int64", elements_[1].name);
513 ASSERT_FALSE(elements_[0].__isset.precision);
514 ASSERT_FALSE(elements_[0].__isset.scale);
515}
516
517TEST_F(TestSchemaFlatten, NestedExample) {
518 SchemaElement elt;
519 std::vector<SchemaElement> elements;
520 elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0));
521
522 // A primitive one
523 elements.push_back(NewPrimitive("a", FieldRepetitionType::REQUIRED, Type::INT32, 1));
524
525 // A group
526 elements.push_back(NewGroup("bag", FieldRepetitionType::OPTIONAL, 1, 2));
527
528 // 3-level list encoding, by hand
529 elt = NewGroup("b", FieldRepetitionType::REPEATED, 1, 3);
530 elt.__set_converted_type(ConvertedType::LIST);
531 elements.push_back(elt);
532 elements.push_back(NewPrimitive("item", FieldRepetitionType::OPTIONAL, Type::INT64, 4));
533
534 // Construct the schema
535 NodeVector fields;
536 fields.push_back(Int32("a", Repetition::REQUIRED));
537
538 // 3-level list encoding
539 NodePtr item = Int64("item");
540 NodePtr list(GroupNode::Make("b", Repetition::REPEATED, {item}, LogicalType::LIST));
541 NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
542 fields.push_back(bag);
543
544 NodePtr schema = GroupNode::Make(name_, Repetition::REPEATED, fields);
545
546 Flatten(static_cast<GroupNode*>(schema.get()));
547 ASSERT_EQ(elements_.size(), elements.size());
548 for (size_t i = 0; i < elements_.size(); i++) {
549 ASSERT_EQ(elements_[i], elements[i]);
550 }
551}
552
553TEST(TestColumnDescriptor, TestAttrs) {
554 NodePtr node = PrimitiveNode::Make("name", Repetition::OPTIONAL, Type::BYTE_ARRAY,
555 LogicalType::UTF8);
556 ColumnDescriptor descr(node, 4, 1);
557
558 ASSERT_EQ("name", descr.name());
559 ASSERT_EQ(4, descr.max_definition_level());
560 ASSERT_EQ(1, descr.max_repetition_level());
561
562 ASSERT_EQ(Type::BYTE_ARRAY, descr.physical_type());
563
564 ASSERT_EQ(-1, descr.type_length());
565
566 // Test FIXED_LEN_BYTE_ARRAY
567 node = PrimitiveNode::Make("name", Repetition::OPTIONAL, Type::FIXED_LEN_BYTE_ARRAY,
568 LogicalType::DECIMAL, 12, 10, 4);
569 descr = ColumnDescriptor(node, 4, 1);
570
571 ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, descr.physical_type());
572 ASSERT_EQ(12, descr.type_length());
573}
574
575class TestSchemaDescriptor : public ::testing::Test {
576 public:
577 void setUp() {}
578
579 protected:
580 SchemaDescriptor descr_;
581};
582
583TEST_F(TestSchemaDescriptor, InitNonGroup) {
584 NodePtr node = PrimitiveNode::Make("field", Repetition::OPTIONAL, Type::INT32);
585
586 ASSERT_THROW(descr_.Init(node), ParquetException);
587}
588
589TEST_F(TestSchemaDescriptor, Equals) {
590 NodePtr schema;
591
592 NodePtr inta = Int32("a", Repetition::REQUIRED);
593 NodePtr intb = Int64("b", Repetition::OPTIONAL);
594 NodePtr intb2 = Int64("b2", Repetition::OPTIONAL);
595 NodePtr intc = ByteArray("c", Repetition::REPEATED);
596
597 NodePtr item1 = Int64("item1", Repetition::REQUIRED);
598 NodePtr item2 = Boolean("item2", Repetition::OPTIONAL);
599 NodePtr item3 = Int32("item3", Repetition::REPEATED);
600 NodePtr list(GroupNode::Make("records", Repetition::REPEATED, {item1, item2, item3},
601 LogicalType::LIST));
602
603 NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
604 NodePtr bag2(GroupNode::Make("bag", Repetition::REQUIRED, {list}));
605
606 SchemaDescriptor descr1;
607 descr1.Init(GroupNode::Make("schema", Repetition::REPEATED, {inta, intb, intc, bag}));
608
609 ASSERT_TRUE(descr1.Equals(descr1));
610
611 SchemaDescriptor descr2;
612 descr2.Init(GroupNode::Make("schema", Repetition::REPEATED, {inta, intb, intc, bag2}));
613 ASSERT_FALSE(descr1.Equals(descr2));
614
615 SchemaDescriptor descr3;
616 descr3.Init(GroupNode::Make("schema", Repetition::REPEATED, {inta, intb2, intc, bag}));
617 ASSERT_FALSE(descr1.Equals(descr3));
618
619 // Robust to name of parent node
620 SchemaDescriptor descr4;
621 descr4.Init(GroupNode::Make("SCHEMA", Repetition::REPEATED, {inta, intb, intc, bag}));
622 ASSERT_TRUE(descr1.Equals(descr4));
623
624 SchemaDescriptor descr5;
625 descr5.Init(
626 GroupNode::Make("schema", Repetition::REPEATED, {inta, intb, intc, bag, intb2}));
627 ASSERT_FALSE(descr1.Equals(descr5));
628
629 // Different max repetition / definition levels
630 ColumnDescriptor col1(inta, 5, 1);
631 ColumnDescriptor col2(inta, 6, 1);
632 ColumnDescriptor col3(inta, 5, 2);
633
634 ASSERT_TRUE(col1.Equals(col1));
635 ASSERT_FALSE(col1.Equals(col2));
636 ASSERT_FALSE(col1.Equals(col3));
637}
638
639TEST_F(TestSchemaDescriptor, BuildTree) {
640 NodeVector fields;
641 NodePtr schema;
642
643 NodePtr inta = Int32("a", Repetition::REQUIRED);
644 fields.push_back(inta);
645 fields.push_back(Int64("b", Repetition::OPTIONAL));
646 fields.push_back(ByteArray("c", Repetition::REPEATED));
647
648 // 3-level list encoding
649 NodePtr item1 = Int64("item1", Repetition::REQUIRED);
650 NodePtr item2 = Boolean("item2", Repetition::OPTIONAL);
651 NodePtr item3 = Int32("item3", Repetition::REPEATED);
652 NodePtr list(GroupNode::Make("records", Repetition::REPEATED, {item1, item2, item3},
653 LogicalType::LIST));
654 NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
655 fields.push_back(bag);
656
657 schema = GroupNode::Make("schema", Repetition::REPEATED, fields);
658
659 descr_.Init(schema);
660
661 int nleaves = 6;
662
663 // 6 leaves
664 ASSERT_EQ(nleaves, descr_.num_columns());
665
666 // mdef mrep
667 // required int32 a 0 0
668 // optional int64 b 1 0
669 // repeated byte_array c 1 1
670 // optional group bag 1 0
671 // repeated group records 2 1
672 // required int64 item1 2 1
673 // optional boolean item2 3 1
674 // repeated int32 item3 3 2
675 int16_t ex_max_def_levels[6] = {0, 1, 1, 2, 3, 3};
676 int16_t ex_max_rep_levels[6] = {0, 0, 1, 1, 1, 2};
677
678 for (int i = 0; i < nleaves; ++i) {
679 const ColumnDescriptor* col = descr_.Column(i);
680 EXPECT_EQ(ex_max_def_levels[i], col->max_definition_level()) << i;
681 EXPECT_EQ(ex_max_rep_levels[i], col->max_repetition_level()) << i;
682 }
683
684 ASSERT_EQ(descr_.Column(0)->path()->ToDotString(), "a");
685 ASSERT_EQ(descr_.Column(1)->path()->ToDotString(), "b");
686 ASSERT_EQ(descr_.Column(2)->path()->ToDotString(), "c");
687 ASSERT_EQ(descr_.Column(3)->path()->ToDotString(), "bag.records.item1");
688 ASSERT_EQ(descr_.Column(4)->path()->ToDotString(), "bag.records.item2");
689 ASSERT_EQ(descr_.Column(5)->path()->ToDotString(), "bag.records.item3");
690
691 for (int i = 0; i < nleaves; ++i) {
692 auto col = descr_.Column(i);
693 ASSERT_EQ(i, descr_.ColumnIndex(*col->schema_node()));
694 }
695
696 // Test non-column nodes find
697 NodePtr non_column_alien = Int32("alien", Repetition::REQUIRED); // other path
698 NodePtr non_column_familiar = Int32("a", Repetition::REPEATED); // other node
699 ASSERT_LT(descr_.ColumnIndex(*non_column_alien), 0);
700 ASSERT_LT(descr_.ColumnIndex(*non_column_familiar), 0);
701
702 ASSERT_EQ(inta.get(), descr_.GetColumnRoot(0));
703 ASSERT_EQ(bag.get(), descr_.GetColumnRoot(3));
704 ASSERT_EQ(bag.get(), descr_.GetColumnRoot(4));
705 ASSERT_EQ(bag.get(), descr_.GetColumnRoot(5));
706
707 ASSERT_EQ(schema.get(), descr_.group_node());
708
709 // Init clears the leaves
710 descr_.Init(schema);
711 ASSERT_EQ(nleaves, descr_.num_columns());
712}
713
714static std::string Print(const NodePtr& node) {
715 std::stringstream ss;
716 PrintSchema(node.get(), ss);
717 return ss.str();
718}
719
// Prints a schema containing a primitive field, a nested list group and a
// decimal field, and compares the output with the expected text verbatim.
TEST(TestSchemaPrinter, Examples) {
  // Test schema 1
  NodeVector fields;
  fields.push_back(Int32("a", Repetition::REQUIRED));

  // 3-level list encoding
  NodePtr item1 = Int64("item1");
  NodePtr item2 = Boolean("item2", Repetition::REQUIRED);
  NodePtr list(
      GroupNode::Make("b", Repetition::REPEATED, {item1, item2}, LogicalType::LIST));
  NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
  fields.push_back(bag);

  // INT32 annotated as DECIMAL; the expected text below shows it as (3,2)
  fields.push_back(PrimitiveNode::Make("c", Repetition::REQUIRED, Type::INT32,
                                       LogicalType::DECIMAL, -1, 3, 2));

  NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, fields);

  std::string result = Print(schema);
  // The raw string is whitespace-sensitive: it is compared byte-for-byte
  // with the printer output, trailing newline included.
  std::string expected = R"(message schema {
 required int32 a;
 optional group bag {
 repeated group b (LIST) {
 optional int64 item1;
 required boolean item2;
 }
 }
 required int32 c (DECIMAL(3,2));
}
)";
  ASSERT_EQ(expected, result);
}
752
753} // namespace schema
754} // namespace parquet
755