1#pragma once
2
3#include "config_formats.h"
4#if USE_PROTOBUF
5
#include <algorithm>
#include <memory>
#include <unordered_map>
#include <vector>
#include <Core/Types.h>
#include <boost/blank.hpp>
#include <google/protobuf/descriptor.h>
#include <google/protobuf/descriptor.pb.h>
13
/// Forward declarations of the protobuf descriptor types that the matcher
/// below refers to by pointer only.
namespace google
{
namespace protobuf
{
    class FieldDescriptor;
    class Descriptor;
}
}
22
23
24namespace DB
25{
26namespace ProtobufColumnMatcher
27{
28 struct DefaultTraits
29 {
30 using MessageData = boost::blank;
31 using FieldData = boost::blank;
32 };
33
34 template <typename Traits = DefaultTraits>
35 struct Message;
36
37 /// Represents a field in a protobuf message.
38 template <typename Traits = DefaultTraits>
39 struct Field
40 {
41 const google::protobuf::FieldDescriptor * field_descriptor = nullptr;
42
43 /// Same as field_descriptor->number().
44 UInt32 field_number = 0;
45
46 /// Index of a column; either 'column_index' or 'nested_message' is set.
47 size_t column_index = -1;
48 std::unique_ptr<Message<Traits>> nested_message;
49
50 typename Traits::FieldData data;
51 };
52
53 /// Represents a protobuf message.
54 template <typename Traits>
55 struct Message
56 {
57 std::vector<Field<Traits>> fields;
58
59 /// Points to the parent message if this is a nested message.
60 Message * parent = nullptr;
61 size_t index_in_parent = -1;
62
63 typename Traits::MessageData data;
64 };
65
66 /// Utility function finding matching columns for each protobuf field.
67 template <typename Traits = DefaultTraits>
68 static std::unique_ptr<Message<Traits>> matchColumns(
69 const std::vector<String> & column_names,
70 const google::protobuf::Descriptor * message_type);
71
72 template <typename Traits = DefaultTraits>
73 static std::unique_ptr<Message<Traits>> matchColumns(
74 const std::vector<String> & column_names,
75 const google::protobuf::Descriptor * message_type,
76 std::vector<const google::protobuf::FieldDescriptor *> & field_descriptors_without_match);
77
78 namespace details
79 {
80 [[noreturn]] void throwNoCommonColumns();
81
82 class ColumnNameMatcher
83 {
84 public:
85 ColumnNameMatcher(const std::vector<String> & column_names);
86 size_t findColumn(const String & field_name);
87
88 private:
89 std::unordered_map<String, size_t> column_name_to_index_map;
90 std::vector<bool> column_usage;
91 };
92
93 template <typename Traits>
94 std::unique_ptr<Message<Traits>> matchColumnsRecursive(
95 ColumnNameMatcher & name_matcher,
96 const google::protobuf::Descriptor * message_type,
97 const String & field_name_prefix,
98 std::vector<const google::protobuf::FieldDescriptor *> * field_descriptors_without_match)
99 {
100 auto message = std::make_unique<Message<Traits>>();
101 for (int i = 0; i != message_type->field_count(); ++i)
102 {
103 const google::protobuf::FieldDescriptor * field_descriptor = message_type->field(i);
104 if ((field_descriptor->type() == google::protobuf::FieldDescriptor::TYPE_MESSAGE)
105 || (field_descriptor->type() == google::protobuf::FieldDescriptor::TYPE_GROUP))
106 {
107 auto nested_message = matchColumnsRecursive<Traits>(
108 name_matcher,
109 field_descriptor->message_type(),
110 field_name_prefix + field_descriptor->name() + ".",
111 field_descriptors_without_match);
112 if (nested_message)
113 {
114 message->fields.emplace_back();
115 auto & current_field = message->fields.back();
116 current_field.field_number = field_descriptor->number();
117 current_field.field_descriptor = field_descriptor;
118 current_field.nested_message = std::move(nested_message);
119 current_field.nested_message->parent = message.get();
120 }
121 }
122 else
123 {
124 size_t column_index = name_matcher.findColumn(field_name_prefix + field_descriptor->name());
125 if (column_index == static_cast<size_t>(-1))
126 {
127 if (field_descriptors_without_match)
128 field_descriptors_without_match->emplace_back(field_descriptor);
129 }
130 else
131 {
132 message->fields.emplace_back();
133 auto & current_field = message->fields.back();
134 current_field.field_number = field_descriptor->number();
135 current_field.field_descriptor = field_descriptor;
136 current_field.column_index = column_index;
137 }
138 }
139 }
140
141 if (message->fields.empty())
142 return nullptr;
143
144 // Columns should be sorted by field_number, it's necessary for writing protobufs and useful reading protobufs.
145 std::sort(message->fields.begin(), message->fields.end(), [](const Field<Traits> & left, const Field<Traits> & right)
146 {
147 return left.field_number < right.field_number;
148 });
149
150 for (size_t i = 0; i != message->fields.size(); ++i)
151 {
152 auto & field = message->fields[i];
153 if (field.nested_message)
154 field.nested_message->index_in_parent = i;
155 }
156
157 return message;
158 }
159 }
160
161 template <typename Data>
162 static std::unique_ptr<Message<Data>> matchColumnsImpl(
163 const std::vector<String> & column_names,
164 const google::protobuf::Descriptor * message_type,
165 std::vector<const google::protobuf::FieldDescriptor *> * field_descriptors_without_match)
166 {
167 details::ColumnNameMatcher name_matcher(column_names);
168 auto message = details::matchColumnsRecursive<Data>(name_matcher, message_type, "", field_descriptors_without_match);
169 if (!message)
170 details::throwNoCommonColumns();
171 return message;
172 }
173
174 template <typename Data>
175 static std::unique_ptr<Message<Data>> matchColumns(
176 const std::vector<String> & column_names,
177 const google::protobuf::Descriptor * message_type)
178 {
179 return matchColumnsImpl<Data>(column_names, message_type, nullptr);
180 }
181
182 template <typename Data>
183 static std::unique_ptr<Message<Data>> matchColumns(
184 const std::vector<String> & column_names,
185 const google::protobuf::Descriptor * message_type,
186 std::vector<const google::protobuf::FieldDescriptor *> & field_descriptors_without_match)
187 {
188 return matchColumnsImpl<Data>(column_names, message_type, &field_descriptors_without_match);
189 }
190}
191
192}
193#endif
194