1 | #pragma once |
2 | |
3 | #include "config_formats.h" |
4 | #if USE_PROTOBUF |
5 | |
#include <algorithm>
#include <memory>
#include <unordered_map>
#include <vector>
#include <Core/Types.h>
#include <boost/blank.hpp>
#include <google/protobuf/descriptor.h>
#include <google/protobuf/descriptor.pb.h>
13 | |
14 | namespace google |
15 | { |
16 | namespace protobuf |
17 | { |
18 | class Descriptor; |
19 | class FieldDescriptor; |
20 | } |
21 | } |
22 | |
23 | |
24 | namespace DB |
25 | { |
26 | namespace ProtobufColumnMatcher |
27 | { |
28 | struct DefaultTraits |
29 | { |
30 | using MessageData = boost::blank; |
31 | using FieldData = boost::blank; |
32 | }; |
33 | |
34 | template <typename Traits = DefaultTraits> |
35 | struct Message; |
36 | |
37 | /// Represents a field in a protobuf message. |
38 | template <typename Traits = DefaultTraits> |
39 | struct Field |
40 | { |
41 | const google::protobuf::FieldDescriptor * field_descriptor = nullptr; |
42 | |
43 | /// Same as field_descriptor->number(). |
44 | UInt32 field_number = 0; |
45 | |
46 | /// Index of a column; either 'column_index' or 'nested_message' is set. |
47 | size_t column_index = -1; |
48 | std::unique_ptr<Message<Traits>> nested_message; |
49 | |
50 | typename Traits::FieldData data; |
51 | }; |
52 | |
53 | /// Represents a protobuf message. |
54 | template <typename Traits> |
55 | struct Message |
56 | { |
57 | std::vector<Field<Traits>> fields; |
58 | |
59 | /// Points to the parent message if this is a nested message. |
60 | Message * parent = nullptr; |
61 | size_t index_in_parent = -1; |
62 | |
63 | typename Traits::MessageData data; |
64 | }; |
65 | |
66 | /// Utility function finding matching columns for each protobuf field. |
67 | template <typename Traits = DefaultTraits> |
68 | static std::unique_ptr<Message<Traits>> matchColumns( |
69 | const std::vector<String> & column_names, |
70 | const google::protobuf::Descriptor * message_type); |
71 | |
72 | template <typename Traits = DefaultTraits> |
73 | static std::unique_ptr<Message<Traits>> matchColumns( |
74 | const std::vector<String> & column_names, |
75 | const google::protobuf::Descriptor * message_type, |
76 | std::vector<const google::protobuf::FieldDescriptor *> & field_descriptors_without_match); |
77 | |
78 | namespace details |
79 | { |
/// Called when the matching produced no fields at all. Never returns;
/// the definition lives outside this header (presumably it throws an exception).
[[noreturn]] void throwNoCommonColumns();
81 | |
82 | class ColumnNameMatcher |
83 | { |
84 | public: |
85 | ColumnNameMatcher(const std::vector<String> & column_names); |
86 | size_t findColumn(const String & field_name); |
87 | |
88 | private: |
89 | std::unordered_map<String, size_t> column_name_to_index_map; |
90 | std::vector<bool> column_usage; |
91 | }; |
92 | |
93 | template <typename Traits> |
94 | std::unique_ptr<Message<Traits>> matchColumnsRecursive( |
95 | ColumnNameMatcher & name_matcher, |
96 | const google::protobuf::Descriptor * message_type, |
97 | const String & field_name_prefix, |
98 | std::vector<const google::protobuf::FieldDescriptor *> * field_descriptors_without_match) |
99 | { |
100 | auto message = std::make_unique<Message<Traits>>(); |
101 | for (int i = 0; i != message_type->field_count(); ++i) |
102 | { |
103 | const google::protobuf::FieldDescriptor * field_descriptor = message_type->field(i); |
104 | if ((field_descriptor->type() == google::protobuf::FieldDescriptor::TYPE_MESSAGE) |
105 | || (field_descriptor->type() == google::protobuf::FieldDescriptor::TYPE_GROUP)) |
106 | { |
107 | auto nested_message = matchColumnsRecursive<Traits>( |
108 | name_matcher, |
109 | field_descriptor->message_type(), |
110 | field_name_prefix + field_descriptor->name() + "." , |
111 | field_descriptors_without_match); |
112 | if (nested_message) |
113 | { |
114 | message->fields.emplace_back(); |
115 | auto & current_field = message->fields.back(); |
116 | current_field.field_number = field_descriptor->number(); |
117 | current_field.field_descriptor = field_descriptor; |
118 | current_field.nested_message = std::move(nested_message); |
119 | current_field.nested_message->parent = message.get(); |
120 | } |
121 | } |
122 | else |
123 | { |
124 | size_t column_index = name_matcher.findColumn(field_name_prefix + field_descriptor->name()); |
125 | if (column_index == static_cast<size_t>(-1)) |
126 | { |
127 | if (field_descriptors_without_match) |
128 | field_descriptors_without_match->emplace_back(field_descriptor); |
129 | } |
130 | else |
131 | { |
132 | message->fields.emplace_back(); |
133 | auto & current_field = message->fields.back(); |
134 | current_field.field_number = field_descriptor->number(); |
135 | current_field.field_descriptor = field_descriptor; |
136 | current_field.column_index = column_index; |
137 | } |
138 | } |
139 | } |
140 | |
141 | if (message->fields.empty()) |
142 | return nullptr; |
143 | |
144 | // Columns should be sorted by field_number, it's necessary for writing protobufs and useful reading protobufs. |
145 | std::sort(message->fields.begin(), message->fields.end(), [](const Field<Traits> & left, const Field<Traits> & right) |
146 | { |
147 | return left.field_number < right.field_number; |
148 | }); |
149 | |
150 | for (size_t i = 0; i != message->fields.size(); ++i) |
151 | { |
152 | auto & field = message->fields[i]; |
153 | if (field.nested_message) |
154 | field.nested_message->index_in_parent = i; |
155 | } |
156 | |
157 | return message; |
158 | } |
159 | } |
160 | |
161 | template <typename Data> |
162 | static std::unique_ptr<Message<Data>> matchColumnsImpl( |
163 | const std::vector<String> & column_names, |
164 | const google::protobuf::Descriptor * message_type, |
165 | std::vector<const google::protobuf::FieldDescriptor *> * field_descriptors_without_match) |
166 | { |
167 | details::ColumnNameMatcher name_matcher(column_names); |
168 | auto message = details::matchColumnsRecursive<Data>(name_matcher, message_type, "" , field_descriptors_without_match); |
169 | if (!message) |
170 | details::throwNoCommonColumns(); |
171 | return message; |
172 | } |
173 | |
174 | template <typename Data> |
175 | static std::unique_ptr<Message<Data>> matchColumns( |
176 | const std::vector<String> & column_names, |
177 | const google::protobuf::Descriptor * message_type) |
178 | { |
179 | return matchColumnsImpl<Data>(column_names, message_type, nullptr); |
180 | } |
181 | |
182 | template <typename Data> |
183 | static std::unique_ptr<Message<Data>> matchColumns( |
184 | const std::vector<String> & column_names, |
185 | const google::protobuf::Descriptor * message_type, |
186 | std::vector<const google::protobuf::FieldDescriptor *> & field_descriptors_without_match) |
187 | { |
188 | return matchColumnsImpl<Data>(column_names, message_type, &field_descriptors_without_match); |
189 | } |
190 | } |
191 | |
192 | } |
193 | #endif |
194 | |