| 1 | // Licensed to the Apache Software Foundation (ASF) under one | 
|---|
| 2 | // or more contributor license agreements.  See the NOTICE file | 
|---|
| 3 | // distributed with this work for additional information | 
|---|
| 4 | // regarding copyright ownership.  The ASF licenses this file | 
|---|
| 5 | // to you under the Apache License, Version 2.0 (the | 
|---|
| 6 | // "License"); you may not use this file except in compliance | 
|---|
| 7 | // with the License.  You may obtain a copy of the License at | 
|---|
| 8 | // | 
|---|
| 9 | //   http://www.apache.org/licenses/LICENSE-2.0 | 
|---|
| 10 | // | 
|---|
| 11 | // Unless required by applicable law or agreed to in writing, | 
|---|
| 12 | // software distributed under the License is distributed on an | 
|---|
| 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | 
|---|
| 14 | // KIND, either express or implied.  See the License for the | 
|---|
| 15 | // specific language governing permissions and limitations | 
|---|
| 16 | // under the License. | 
|---|
| 17 |  | 
|---|
| 18 | #ifndef PARQUET_SCHEMA_UTIL_H | 
|---|
| 19 | #define PARQUET_SCHEMA_UTIL_H | 
|---|
| 20 |  | 
|---|
| 21 | #include <string> | 
|---|
| 22 | #include <unordered_set> | 
|---|
| 23 | #include <vector> | 
|---|
| 24 |  | 
|---|
| 25 | #include "parquet/exception.h" | 
|---|
| 26 | #include "parquet/schema.h" | 
|---|
| 27 | #include "parquet/types.h" | 
|---|
| 28 |  | 
|---|
| 29 | using parquet::LogicalType; | 
|---|
| 30 | using parquet::ParquetException; | 
|---|
| 31 | using parquet::SchemaDescriptor; | 
|---|
| 32 | using parquet::schema::GroupNode; | 
|---|
| 33 | using parquet::schema::Node; | 
|---|
| 34 | using parquet::schema::NodePtr; | 
|---|
| 35 |  | 
|---|
// Returns true when `str` ends with the literal suffix "_tuple".
//
// The comparison is done in place with std::string::compare, so no temporary
// substring is created, and the suffix length is derived from the literal
// rather than being repeated as a magic number.
inline bool str_endswith_tuple(const std::string& str) {
  const char kSuffix[] = "_tuple";
  // sizeof counts the terminating NUL, which is not part of the suffix.
  const std::string::size_type kSuffixLen = sizeof(kSuffix) - 1;
  // Strings shorter than the suffix cannot match; checking first also keeps
  // the start offset below from underflowing.
  if (str.size() < kSuffixLen) {
    return false;
  }
  return str.compare(str.size() - kSuffixLen, kSuffixLen, kSuffix) == 0;
}
|---|
| 42 |  | 
|---|
| 43 | // Special case mentioned in the format spec: | 
|---|
| 44 | //   If the name is array or ends in _tuple, this should be a list of struct | 
|---|
| 45 | //   even for single child elements. | 
|---|
| 46 | inline bool HasStructListName(const GroupNode& node) { | 
|---|
| 47 | return (node.name() == "array"|| str_endswith_tuple(node.name())); | 
|---|
| 48 | } | 
|---|
| 49 |  | 
|---|
| 50 | // TODO(itaiin): This aux. function is to be deleted once repeated structs are supported | 
|---|
| 51 | inline bool IsSimpleStruct(const Node* node) { | 
|---|
| 52 | if (!node->is_group()) return false; | 
|---|
| 53 | if (node->is_repeated()) return false; | 
|---|
| 54 | if (node->logical_type() == LogicalType::LIST) return false; | 
|---|
| 55 | // Special case mentioned in the format spec: | 
|---|
| 56 | //   If the name is array or ends in _tuple, this should be a list of struct | 
|---|
| 57 | //   even for single child elements. | 
|---|
| 58 | auto group = static_cast<const GroupNode*>(node); | 
|---|
| 59 | if (group->field_count() == 1 && HasStructListName(*group)) return false; | 
|---|
| 60 |  | 
|---|
| 61 | return true; | 
|---|
| 62 | } | 
|---|
| 63 |  | 
|---|
| 64 | // Coalesce a list of schema fields indices which are the roots of the | 
|---|
| 65 | // columns referred by a list of column indices | 
|---|
| 66 | inline bool ColumnIndicesToFieldIndices(const SchemaDescriptor& descr, | 
|---|
| 67 | const std::vector<int>& column_indices, | 
|---|
| 68 | std::vector<int>* out) { | 
|---|
| 69 | const GroupNode* group = descr.group_node(); | 
|---|
| 70 | std::unordered_set<int> already_added; | 
|---|
| 71 | out->clear(); | 
|---|
| 72 | for (auto& column_idx : column_indices) { | 
|---|
| 73 | auto field_node = descr.GetColumnRoot(column_idx); | 
|---|
| 74 | auto field_idx = group->FieldIndex(*field_node); | 
|---|
| 75 | if (field_idx < 0) { | 
|---|
| 76 | return false; | 
|---|
| 77 | } | 
|---|
| 78 | auto insertion = already_added.insert(field_idx); | 
|---|
| 79 | if (insertion.second) { | 
|---|
| 80 | out->push_back(field_idx); | 
|---|
| 81 | } | 
|---|
| 82 | } | 
|---|
| 83 |  | 
|---|
| 84 | return true; | 
|---|
| 85 | } | 
|---|
| 86 |  | 
|---|
| 87 | #endif  // PARQUET_SCHEMA_UTIL_H | 
|---|
| 88 |  | 
|---|