1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef PARQUET_SCHEMA_UTIL_H |
19 | #define PARQUET_SCHEMA_UTIL_H |
20 | |
21 | #include <string> |
22 | #include <unordered_set> |
23 | #include <vector> |
24 | |
25 | #include "parquet/exception.h" |
26 | #include "parquet/schema.h" |
27 | #include "parquet/types.h" |
28 | |
29 | using parquet::LogicalType; |
30 | using parquet::ParquetException; |
31 | using parquet::SchemaDescriptor; |
32 | using parquet::schema::GroupNode; |
33 | using parquet::schema::Node; |
34 | using parquet::schema::NodePtr; |
35 | |
// Returns true if `str` ends with the literal suffix "_tuple".
// Strings shorter than the suffix never match.
inline bool str_endswith_tuple(const std::string& str) {
  static const char kSuffix[] = "_tuple";
  const std::string::size_type suffix_len = sizeof(kSuffix) - 1;  // drop the NUL

  if (str.size() < suffix_len) {
    return false;
  }
  // Compare only the trailing suffix_len characters against the suffix.
  return str.compare(str.size() - suffix_len, suffix_len, kSuffix) == 0;
}
42 | |
43 | // Special case mentioned in the format spec: |
44 | // If the name is array or ends in _tuple, this should be a list of struct |
45 | // even for single child elements. |
46 | inline bool HasStructListName(const GroupNode& node) { |
47 | return (node.name() == "array" || str_endswith_tuple(node.name())); |
48 | } |
49 | |
50 | // TODO(itaiin): This aux. function is to be deleted once repeated structs are supported |
51 | inline bool IsSimpleStruct(const Node* node) { |
52 | if (!node->is_group()) return false; |
53 | if (node->is_repeated()) return false; |
54 | if (node->logical_type() == LogicalType::LIST) return false; |
55 | // Special case mentioned in the format spec: |
56 | // If the name is array or ends in _tuple, this should be a list of struct |
57 | // even for single child elements. |
58 | auto group = static_cast<const GroupNode*>(node); |
59 | if (group->field_count() == 1 && HasStructListName(*group)) return false; |
60 | |
61 | return true; |
62 | } |
63 | |
64 | // Coalesce a list of schema fields indices which are the roots of the |
65 | // columns referred by a list of column indices |
66 | inline bool ColumnIndicesToFieldIndices(const SchemaDescriptor& descr, |
67 | const std::vector<int>& column_indices, |
68 | std::vector<int>* out) { |
69 | const GroupNode* group = descr.group_node(); |
70 | std::unordered_set<int> already_added; |
71 | out->clear(); |
72 | for (auto& column_idx : column_indices) { |
73 | auto field_node = descr.GetColumnRoot(column_idx); |
74 | auto field_idx = group->FieldIndex(*field_node); |
75 | if (field_idx < 0) { |
76 | return false; |
77 | } |
78 | auto insertion = already_added.insert(field_idx); |
79 | if (insertion.second) { |
80 | out->push_back(field_idx); |
81 | } |
82 | } |
83 | |
84 | return true; |
85 | } |
86 | |
87 | #endif // PARQUET_SCHEMA_UTIL_H |
88 | |