| 1 | #pragma once |
| 2 | |
| 3 | #include "config_formats.h" |
| 4 | #if USE_PROTOBUF |
| 5 | |
#include <algorithm>
#include <memory>
#include <unordered_map>
#include <vector>
#include <Core/Types.h>
#include <boost/blank.hpp>
#include <google/protobuf/descriptor.h>
#include <google/protobuf/descriptor.pb.h>
| 13 | |
| 14 | namespace google |
| 15 | { |
| 16 | namespace protobuf |
| 17 | { |
| 18 | class Descriptor; |
| 19 | class FieldDescriptor; |
| 20 | } |
| 21 | } |
| 22 | |
| 23 | |
| 24 | namespace DB |
| 25 | { |
| 26 | namespace ProtobufColumnMatcher |
| 27 | { |
| 28 | struct DefaultTraits |
| 29 | { |
| 30 | using MessageData = boost::blank; |
| 31 | using FieldData = boost::blank; |
| 32 | }; |
| 33 | |
| 34 | template <typename Traits = DefaultTraits> |
| 35 | struct Message; |
| 36 | |
| 37 | /// Represents a field in a protobuf message. |
| 38 | template <typename Traits = DefaultTraits> |
| 39 | struct Field |
| 40 | { |
| 41 | const google::protobuf::FieldDescriptor * field_descriptor = nullptr; |
| 42 | |
| 43 | /// Same as field_descriptor->number(). |
| 44 | UInt32 field_number = 0; |
| 45 | |
| 46 | /// Index of a column; either 'column_index' or 'nested_message' is set. |
| 47 | size_t column_index = -1; |
| 48 | std::unique_ptr<Message<Traits>> nested_message; |
| 49 | |
| 50 | typename Traits::FieldData data; |
| 51 | }; |
| 52 | |
| 53 | /// Represents a protobuf message. |
| 54 | template <typename Traits> |
| 55 | struct Message |
| 56 | { |
| 57 | std::vector<Field<Traits>> fields; |
| 58 | |
| 59 | /// Points to the parent message if this is a nested message. |
| 60 | Message * parent = nullptr; |
| 61 | size_t index_in_parent = -1; |
| 62 | |
| 63 | typename Traits::MessageData data; |
| 64 | }; |
| 65 | |
| 66 | /// Utility function finding matching columns for each protobuf field. |
| 67 | template <typename Traits = DefaultTraits> |
| 68 | static std::unique_ptr<Message<Traits>> matchColumns( |
| 69 | const std::vector<String> & column_names, |
| 70 | const google::protobuf::Descriptor * message_type); |
| 71 | |
| 72 | template <typename Traits = DefaultTraits> |
| 73 | static std::unique_ptr<Message<Traits>> matchColumns( |
| 74 | const std::vector<String> & column_names, |
| 75 | const google::protobuf::Descriptor * message_type, |
| 76 | std::vector<const google::protobuf::FieldDescriptor *> & field_descriptors_without_match); |
| 77 | |
| 78 | namespace details |
| 79 | { |
| 80 | [[noreturn]] void throwNoCommonColumns(); |
| 81 | |
| 82 | class ColumnNameMatcher |
| 83 | { |
| 84 | public: |
| 85 | ColumnNameMatcher(const std::vector<String> & column_names); |
| 86 | size_t findColumn(const String & field_name); |
| 87 | |
| 88 | private: |
| 89 | std::unordered_map<String, size_t> column_name_to_index_map; |
| 90 | std::vector<bool> column_usage; |
| 91 | }; |
| 92 | |
| 93 | template <typename Traits> |
| 94 | std::unique_ptr<Message<Traits>> matchColumnsRecursive( |
| 95 | ColumnNameMatcher & name_matcher, |
| 96 | const google::protobuf::Descriptor * message_type, |
| 97 | const String & field_name_prefix, |
| 98 | std::vector<const google::protobuf::FieldDescriptor *> * field_descriptors_without_match) |
| 99 | { |
| 100 | auto message = std::make_unique<Message<Traits>>(); |
| 101 | for (int i = 0; i != message_type->field_count(); ++i) |
| 102 | { |
| 103 | const google::protobuf::FieldDescriptor * field_descriptor = message_type->field(i); |
| 104 | if ((field_descriptor->type() == google::protobuf::FieldDescriptor::TYPE_MESSAGE) |
| 105 | || (field_descriptor->type() == google::protobuf::FieldDescriptor::TYPE_GROUP)) |
| 106 | { |
| 107 | auto nested_message = matchColumnsRecursive<Traits>( |
| 108 | name_matcher, |
| 109 | field_descriptor->message_type(), |
| 110 | field_name_prefix + field_descriptor->name() + "." , |
| 111 | field_descriptors_without_match); |
| 112 | if (nested_message) |
| 113 | { |
| 114 | message->fields.emplace_back(); |
| 115 | auto & current_field = message->fields.back(); |
| 116 | current_field.field_number = field_descriptor->number(); |
| 117 | current_field.field_descriptor = field_descriptor; |
| 118 | current_field.nested_message = std::move(nested_message); |
| 119 | current_field.nested_message->parent = message.get(); |
| 120 | } |
| 121 | } |
| 122 | else |
| 123 | { |
| 124 | size_t column_index = name_matcher.findColumn(field_name_prefix + field_descriptor->name()); |
| 125 | if (column_index == static_cast<size_t>(-1)) |
| 126 | { |
| 127 | if (field_descriptors_without_match) |
| 128 | field_descriptors_without_match->emplace_back(field_descriptor); |
| 129 | } |
| 130 | else |
| 131 | { |
| 132 | message->fields.emplace_back(); |
| 133 | auto & current_field = message->fields.back(); |
| 134 | current_field.field_number = field_descriptor->number(); |
| 135 | current_field.field_descriptor = field_descriptor; |
| 136 | current_field.column_index = column_index; |
| 137 | } |
| 138 | } |
| 139 | } |
| 140 | |
| 141 | if (message->fields.empty()) |
| 142 | return nullptr; |
| 143 | |
| 144 | // Columns should be sorted by field_number, it's necessary for writing protobufs and useful reading protobufs. |
| 145 | std::sort(message->fields.begin(), message->fields.end(), [](const Field<Traits> & left, const Field<Traits> & right) |
| 146 | { |
| 147 | return left.field_number < right.field_number; |
| 148 | }); |
| 149 | |
| 150 | for (size_t i = 0; i != message->fields.size(); ++i) |
| 151 | { |
| 152 | auto & field = message->fields[i]; |
| 153 | if (field.nested_message) |
| 154 | field.nested_message->index_in_parent = i; |
| 155 | } |
| 156 | |
| 157 | return message; |
| 158 | } |
| 159 | } |
| 160 | |
| 161 | template <typename Data> |
| 162 | static std::unique_ptr<Message<Data>> matchColumnsImpl( |
| 163 | const std::vector<String> & column_names, |
| 164 | const google::protobuf::Descriptor * message_type, |
| 165 | std::vector<const google::protobuf::FieldDescriptor *> * field_descriptors_without_match) |
| 166 | { |
| 167 | details::ColumnNameMatcher name_matcher(column_names); |
| 168 | auto message = details::matchColumnsRecursive<Data>(name_matcher, message_type, "" , field_descriptors_without_match); |
| 169 | if (!message) |
| 170 | details::throwNoCommonColumns(); |
| 171 | return message; |
| 172 | } |
| 173 | |
| 174 | template <typename Data> |
| 175 | static std::unique_ptr<Message<Data>> matchColumns( |
| 176 | const std::vector<String> & column_names, |
| 177 | const google::protobuf::Descriptor * message_type) |
| 178 | { |
| 179 | return matchColumnsImpl<Data>(column_names, message_type, nullptr); |
| 180 | } |
| 181 | |
| 182 | template <typename Data> |
| 183 | static std::unique_ptr<Message<Data>> matchColumns( |
| 184 | const std::vector<String> & column_names, |
| 185 | const google::protobuf::Descriptor * message_type, |
| 186 | std::vector<const google::protobuf::FieldDescriptor *> & field_descriptors_without_match) |
| 187 | { |
| 188 | return matchColumnsImpl<Data>(column_names, message_type, &field_descriptors_without_match); |
| 189 | } |
| 190 | } |
| 191 | |
| 192 | } |
| 193 | #endif |
| 194 | |