1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#ifndef PARQUET_SCHEMA_UTIL_H
19#define PARQUET_SCHEMA_UTIL_H
20
21#include <string>
22#include <unordered_set>
23#include <vector>
24
25#include "parquet/exception.h"
26#include "parquet/schema.h"
27#include "parquet/types.h"
28
29using parquet::LogicalType;
30using parquet::ParquetException;
31using parquet::SchemaDescriptor;
32using parquet::schema::GroupNode;
33using parquet::schema::Node;
34using parquet::schema::NodePtr;
35
// Returns true when `str` ends with the literal suffix "_tuple".
// Strings shorter than the suffix never match.
inline bool str_endswith_tuple(const std::string& str) {
  const std::string::size_type suffix_len = 6;  // number of characters in "_tuple"
  return str.size() >= suffix_len &&
         str.compare(str.size() - suffix_len, suffix_len, "_tuple") == 0;
}
42
43// Special case mentioned in the format spec:
44// If the name is array or ends in _tuple, this should be a list of struct
45// even for single child elements.
46inline bool HasStructListName(const GroupNode& node) {
47 return (node.name() == "array" || str_endswith_tuple(node.name()));
48}
49
50// TODO(itaiin): This aux. function is to be deleted once repeated structs are supported
51inline bool IsSimpleStruct(const Node* node) {
52 if (!node->is_group()) return false;
53 if (node->is_repeated()) return false;
54 if (node->logical_type() == LogicalType::LIST) return false;
55 // Special case mentioned in the format spec:
56 // If the name is array or ends in _tuple, this should be a list of struct
57 // even for single child elements.
58 auto group = static_cast<const GroupNode*>(node);
59 if (group->field_count() == 1 && HasStructListName(*group)) return false;
60
61 return true;
62}
63
64// Coalesce a list of schema fields indices which are the roots of the
65// columns referred by a list of column indices
66inline bool ColumnIndicesToFieldIndices(const SchemaDescriptor& descr,
67 const std::vector<int>& column_indices,
68 std::vector<int>* out) {
69 const GroupNode* group = descr.group_node();
70 std::unordered_set<int> already_added;
71 out->clear();
72 for (auto& column_idx : column_indices) {
73 auto field_node = descr.GetColumnRoot(column_idx);
74 auto field_idx = group->FieldIndex(*field_node);
75 if (field_idx < 0) {
76 return false;
77 }
78 auto insertion = already_added.insert(field_idx);
79 if (insertion.second) {
80 out->push_back(field_idx);
81 }
82 }
83
84 return true;
85}
86
87#endif // PARQUET_SCHEMA_UTIL_H
88