1 | #include <IO/WriteBufferFromString.h> |
2 | #include <Formats/FormatSettings.h> |
3 | #include <Formats/ProtobufReader.h> |
4 | #include <Formats/ProtobufWriter.h> |
5 | #include <DataTypes/DataTypeEnum.h> |
6 | #include <DataTypes/DataTypeFactory.h> |
7 | #include <Parsers/IAST.h> |
8 | #include <Parsers/ASTFunction.h> |
9 | #include <Parsers/ASTLiteral.h> |
10 | #include <Common/typeid_cast.h> |
11 | #include <Common/assert_cast.h> |
12 | #include <Common/UTF8Helpers.h> |
13 | #include <Poco/UTF8Encoding.h> |
14 | |
15 | #include <limits> |
16 | |
17 | |
18 | namespace DB |
19 | { |
20 | |
/// Error codes referenced in this file. These are declarations only (extern).
/// BAD_TYPE_OF_FIELD is used by checkOverflow / castToName / castToValue below, so it is
/// declared here explicitly instead of relying on a declaration pulled in by some header.
namespace ErrorCodes
{
    extern const int BAD_TYPE_OF_FIELD;
    extern const int SYNTAX_ERROR;
    extern const int EMPTY_DATA_PASSED;
    extern const int UNEXPECTED_AST_STRUCTURE;
    extern const int ARGUMENT_OUT_OF_BOUND;
}
28 | |
29 | |
30 | template <typename FieldType> struct EnumName; |
31 | template <> struct EnumName<Int8> { static constexpr auto value = "Enum8" ; }; |
32 | template <> struct EnumName<Int16> { static constexpr auto value = "Enum16" ; }; |
33 | |
34 | |
35 | template <typename Type> |
36 | const char * DataTypeEnum<Type>::getFamilyName() const |
37 | { |
38 | return EnumName<FieldType>::value; |
39 | } |
40 | |
41 | |
42 | template <typename Type> |
43 | std::string DataTypeEnum<Type>::generateName(const Values & values) |
44 | { |
45 | WriteBufferFromOwnString out; |
46 | |
47 | writeString(EnumName<FieldType>::value, out); |
48 | writeChar('(', out); |
49 | |
50 | auto first = true; |
51 | for (const auto & name_and_value : values) |
52 | { |
53 | if (!first) |
54 | writeString(", " , out); |
55 | |
56 | first = false; |
57 | |
58 | writeQuotedString(name_and_value.first, out); |
59 | writeString(" = " , out); |
60 | writeText(name_and_value.second, out); |
61 | } |
62 | |
63 | writeChar(')', out); |
64 | |
65 | return out.str(); |
66 | } |
67 | |
68 | template <typename Type> |
69 | void DataTypeEnum<Type>::fillMaps() |
70 | { |
71 | for (const auto & name_and_value : values) |
72 | { |
73 | const auto inserted_value = name_to_value_map.insert( |
74 | { StringRef{name_and_value.first}, name_and_value.second }); |
75 | |
76 | if (!inserted_value.second) |
77 | throw Exception{"Duplicate names in enum: '" + name_and_value.first + "' = " + toString(name_and_value.second) |
78 | + " and " + toString(inserted_value.first->getMapped()), |
79 | ErrorCodes::SYNTAX_ERROR}; |
80 | |
81 | const auto inserted_name = value_to_name_map.insert( |
82 | { name_and_value.second, StringRef{name_and_value.first} }); |
83 | |
84 | if (!inserted_name.second) |
85 | throw Exception{"Duplicate values in enum: '" + name_and_value.first + "' = " + toString(name_and_value.second) |
86 | + " and '" + toString((*inserted_name.first).first) + "'" , |
87 | ErrorCodes::SYNTAX_ERROR}; |
88 | } |
89 | } |
90 | |
91 | template <typename Type> |
92 | DataTypeEnum<Type>::DataTypeEnum(const Values & values_) : values{values_} |
93 | { |
94 | if (values.empty()) |
95 | throw Exception{"DataTypeEnum enumeration cannot be empty" , ErrorCodes::EMPTY_DATA_PASSED}; |
96 | |
97 | std::sort(std::begin(values), std::end(values), [] (auto & left, auto & right) |
98 | { |
99 | return left.second < right.second; |
100 | }); |
101 | |
102 | fillMaps(); |
103 | type_name = generateName(values); |
104 | } |
105 | |
106 | template <typename Type> |
107 | void DataTypeEnum<Type>::serializeBinary(const Field & field, WriteBuffer & ostr) const |
108 | { |
109 | const FieldType x = get<NearestFieldType<FieldType>>(field); |
110 | writeBinary(x, ostr); |
111 | } |
112 | |
113 | template <typename Type> |
114 | void DataTypeEnum<Type>::deserializeBinary(Field & field, ReadBuffer & istr) const |
115 | { |
116 | FieldType x; |
117 | readBinary(x, istr); |
118 | field = castToNearestFieldType(x); |
119 | } |
120 | |
121 | template <typename Type> |
122 | void DataTypeEnum<Type>::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const |
123 | { |
124 | writeBinary(assert_cast<const ColumnType &>(column).getData()[row_num], ostr); |
125 | } |
126 | |
127 | template <typename Type> |
128 | void DataTypeEnum<Type>::deserializeBinary(IColumn & column, ReadBuffer & istr) const |
129 | { |
130 | typename ColumnType::ValueType x; |
131 | readBinary(x, istr); |
132 | assert_cast<ColumnType &>(column).getData().push_back(x); |
133 | } |
134 | |
135 | template <typename Type> |
136 | void DataTypeEnum<Type>::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const |
137 | { |
138 | writeString(getNameForValue(assert_cast<const ColumnType &>(column).getData()[row_num]), ostr); |
139 | } |
140 | |
141 | template <typename Type> |
142 | void DataTypeEnum<Type>::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const |
143 | { |
144 | writeEscapedString(getNameForValue(assert_cast<const ColumnType &>(column).getData()[row_num]), ostr); |
145 | } |
146 | |
147 | template <typename Type> |
148 | void DataTypeEnum<Type>::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const |
149 | { |
150 | /// NOTE It would be nice to do without creating a temporary object - at least extract std::string out. |
151 | std::string field_name; |
152 | readEscapedString(field_name, istr); |
153 | assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name))); |
154 | } |
155 | |
156 | template <typename Type> |
157 | void DataTypeEnum<Type>::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const |
158 | { |
159 | writeQuotedString(getNameForValue(assert_cast<const ColumnType &>(column).getData()[row_num]), ostr); |
160 | } |
161 | |
162 | template <typename Type> |
163 | void DataTypeEnum<Type>::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const |
164 | { |
165 | std::string field_name; |
166 | readQuotedStringWithSQLStyle(field_name, istr); |
167 | assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name))); |
168 | } |
169 | |
170 | template <typename Type> |
171 | void DataTypeEnum<Type>::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const |
172 | { |
173 | std::string field_name; |
174 | readString(field_name, istr); |
175 | assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name))); |
176 | } |
177 | |
178 | template <typename Type> |
179 | void DataTypeEnum<Type>::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const |
180 | { |
181 | writeJSONString(getNameForValue(assert_cast<const ColumnType &>(column).getData()[row_num]), ostr, settings); |
182 | } |
183 | |
184 | template <typename Type> |
185 | void DataTypeEnum<Type>::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const |
186 | { |
187 | writeXMLString(getNameForValue(assert_cast<const ColumnType &>(column).getData()[row_num]), ostr); |
188 | } |
189 | |
190 | template <typename Type> |
191 | void DataTypeEnum<Type>::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const |
192 | { |
193 | std::string field_name; |
194 | readJSONString(field_name, istr); |
195 | assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name))); |
196 | } |
197 | |
198 | template <typename Type> |
199 | void DataTypeEnum<Type>::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const |
200 | { |
201 | writeCSVString(getNameForValue(assert_cast<const ColumnType &>(column).getData()[row_num]), ostr); |
202 | } |
203 | |
204 | template <typename Type> |
205 | void DataTypeEnum<Type>::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const |
206 | { |
207 | std::string field_name; |
208 | readCSVString(field_name, istr, settings.csv); |
209 | assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name))); |
210 | } |
211 | |
212 | template <typename Type> |
213 | void DataTypeEnum<Type>::serializeBinaryBulk( |
214 | const IColumn & column, WriteBuffer & ostr, const size_t offset, size_t limit) const |
215 | { |
216 | const auto & x = typeid_cast<const ColumnType &>(column).getData(); |
217 | const auto size = x.size(); |
218 | |
219 | if (limit == 0 || offset + limit > size) |
220 | limit = size - offset; |
221 | |
222 | ostr.write(reinterpret_cast<const char *>(&x[offset]), sizeof(FieldType) * limit); |
223 | } |
224 | |
225 | template <typename Type> |
226 | void DataTypeEnum<Type>::deserializeBinaryBulk( |
227 | IColumn & column, ReadBuffer & istr, const size_t limit, const double /*avg_value_size_hint*/) const |
228 | { |
229 | auto & x = typeid_cast<ColumnType &>(column).getData(); |
230 | const auto initial_size = x.size(); |
231 | x.resize(initial_size + limit); |
232 | const auto size = istr.readBig(reinterpret_cast<char*>(&x[initial_size]), sizeof(FieldType) * limit); |
233 | x.resize(initial_size + size / sizeof(FieldType)); |
234 | } |
235 | |
236 | template <typename Type> |
237 | void DataTypeEnum<Type>::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const |
238 | { |
239 | if (value_index) |
240 | return; |
241 | protobuf.prepareEnumMapping(values); |
242 | value_index = static_cast<bool>(protobuf.writeEnum(assert_cast<const ColumnType &>(column).getData()[row_num])); |
243 | } |
244 | |
245 | template<typename Type> |
246 | void DataTypeEnum<Type>::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const |
247 | { |
248 | protobuf.prepareEnumMapping(values); |
249 | row_added = false; |
250 | Type value; |
251 | if (!protobuf.readEnum(value)) |
252 | return; |
253 | |
254 | auto & container = assert_cast<ColumnType &>(column).getData(); |
255 | if (allow_add_row) |
256 | { |
257 | container.emplace_back(value); |
258 | row_added = true; |
259 | } |
260 | else |
261 | container.back() = value; |
262 | } |
263 | |
264 | template <typename Type> |
265 | Field DataTypeEnum<Type>::getDefault() const |
266 | { |
267 | return values.front().second; |
268 | } |
269 | |
270 | template <typename Type> |
271 | void DataTypeEnum<Type>::insertDefaultInto(IColumn & column) const |
272 | { |
273 | assert_cast<ColumnType &>(column).getData().push_back(values.front().second); |
274 | } |
275 | |
276 | template <typename Type> |
277 | bool DataTypeEnum<Type>::equals(const IDataType & rhs) const |
278 | { |
279 | return typeid(rhs) == typeid(*this) && type_name == static_cast<const DataTypeEnum<Type> &>(rhs).type_name; |
280 | } |
281 | |
282 | |
283 | template <typename Type> |
284 | bool DataTypeEnum<Type>::textCanContainOnlyValidUTF8() const |
285 | { |
286 | for (const auto & elem : values) |
287 | { |
288 | const char * pos = elem.first.data(); |
289 | const char * end = pos + elem.first.size(); |
290 | while (pos < end) |
291 | { |
292 | size_t length = UTF8::seqLength(*pos); |
293 | if (pos + length > end) |
294 | return false; |
295 | |
296 | if (Poco::UTF8Encoding::isLegal(reinterpret_cast<const unsigned char *>(pos), length)) |
297 | pos += length; |
298 | else |
299 | return false; |
300 | } |
301 | } |
302 | return true; |
303 | } |
304 | |
305 | template <typename Type> |
306 | static void checkOverflow(Int64 value) |
307 | { |
308 | if (!(std::numeric_limits<Type>::min() <= value && value <= std::numeric_limits<Type>::max())) |
309 | throw Exception("DataTypeEnum: Unexpected value " + toString(value), ErrorCodes::BAD_TYPE_OF_FIELD); |
310 | } |
311 | |
312 | template <typename Type> |
313 | Field DataTypeEnum<Type>::castToName(const Field & value_or_name) const |
314 | { |
315 | if (value_or_name.getType() == Field::Types::String) |
316 | { |
317 | getValue(value_or_name.get<String>()); /// Check correctness |
318 | return value_or_name.get<String>(); |
319 | } |
320 | else if (value_or_name.getType() == Field::Types::Int64) |
321 | { |
322 | Int64 value = value_or_name.get<Int64>(); |
323 | checkOverflow<Type>(value); |
324 | return getNameForValue(static_cast<Type>(value)).toString(); |
325 | } |
326 | else |
327 | throw Exception(String("DataTypeEnum: Unsupported type of field " ) + value_or_name.getTypeName(), ErrorCodes::BAD_TYPE_OF_FIELD); |
328 | } |
329 | |
330 | template <typename Type> |
331 | Field DataTypeEnum<Type>::castToValue(const Field & value_or_name) const |
332 | { |
333 | if (value_or_name.getType() == Field::Types::String) |
334 | { |
335 | return getValue(value_or_name.get<String>()); |
336 | } |
337 | else if (value_or_name.getType() == Field::Types::Int64 |
338 | || value_or_name.getType() == Field::Types::UInt64) |
339 | { |
340 | Int64 value = value_or_name.get<Int64>(); |
341 | checkOverflow<Type>(value); |
342 | getNameForValue(static_cast<Type>(value)); /// Check correctness |
343 | return value; |
344 | } |
345 | else |
346 | throw Exception(String("DataTypeEnum: Unsupported type of field " ) + value_or_name.getTypeName(), ErrorCodes::BAD_TYPE_OF_FIELD); |
347 | } |
348 | |
349 | |
/// Explicit instantiations.
/// The member functions above are defined in this file, so the two supported
/// specializations (Int8 and Int16 underlying types) are instantiated here.
template class DataTypeEnum<Int8>;
template class DataTypeEnum<Int16>;
353 | |
354 | static void checkASTStructure(const ASTPtr & child) |
355 | { |
356 | const auto * func = child->as<ASTFunction>(); |
357 | if (!func |
358 | || func->name != "equals" |
359 | || func->parameters |
360 | || !func->arguments |
361 | || func->arguments->children.size() != 2) |
362 | throw Exception("Elements of Enum data type must be of form: 'name' = number, where name is string literal and number is an integer" , |
363 | ErrorCodes::UNEXPECTED_AST_STRUCTURE); |
364 | } |
365 | |
366 | template <typename DataTypeEnum> |
367 | static DataTypePtr createExact(const String & /*type_name*/, const ASTPtr & arguments) |
368 | { |
369 | if (!arguments || arguments->children.empty()) |
370 | throw Exception("Enum data type cannot be empty" , ErrorCodes::EMPTY_DATA_PASSED); |
371 | |
372 | typename DataTypeEnum::Values values; |
373 | values.reserve(arguments->children.size()); |
374 | |
375 | using FieldType = typename DataTypeEnum::FieldType; |
376 | |
377 | /// Children must be functions 'equals' with string literal as left argument and numeric literal as right argument. |
378 | for (const ASTPtr & child : arguments->children) |
379 | { |
380 | checkASTStructure(child); |
381 | |
382 | const auto * func = child->as<ASTFunction>(); |
383 | const auto * name_literal = func->arguments->children[0]->as<ASTLiteral>(); |
384 | const auto * value_literal = func->arguments->children[1]->as<ASTLiteral>(); |
385 | |
386 | if (!name_literal |
387 | || !value_literal |
388 | || name_literal->value.getType() != Field::Types::String |
389 | || (value_literal->value.getType() != Field::Types::UInt64 && value_literal->value.getType() != Field::Types::Int64)) |
390 | throw Exception("Elements of Enum data type must be of form: 'name' = number, where name is string literal and number is an integer" , |
391 | ErrorCodes::UNEXPECTED_AST_STRUCTURE); |
392 | |
393 | const String & field_name = name_literal->value.get<String>(); |
394 | const auto value = value_literal->value.get<NearestFieldType<FieldType>>(); |
395 | |
396 | if (value > std::numeric_limits<FieldType>::max() || value < std::numeric_limits<FieldType>::min()) |
397 | throw Exception{"Value " + toString(value) + " for element '" + field_name + "' exceeds range of " + EnumName<FieldType>::value, |
398 | ErrorCodes::ARGUMENT_OUT_OF_BOUND}; |
399 | |
400 | values.emplace_back(field_name, value); |
401 | } |
402 | |
403 | return std::make_shared<DataTypeEnum>(values); |
404 | } |
405 | |
406 | static DataTypePtr create(const String & type_name, const ASTPtr & arguments) |
407 | { |
408 | if (!arguments || arguments->children.empty()) |
409 | throw Exception("Enum data type cannot be empty" , ErrorCodes::EMPTY_DATA_PASSED); |
410 | |
411 | /// Children must be functions 'equals' with string literal as left argument and numeric literal as right argument. |
412 | for (const ASTPtr & child : arguments->children) |
413 | { |
414 | checkASTStructure(child); |
415 | |
416 | const auto * func = child->as<ASTFunction>(); |
417 | const auto * value_literal = func->arguments->children[1]->as<ASTLiteral>(); |
418 | |
419 | if (!value_literal |
420 | || (value_literal->value.getType() != Field::Types::UInt64 && value_literal->value.getType() != Field::Types::Int64)) |
421 | throw Exception("Elements of Enum data type must be of form: 'name' = number, where name is string literal and number is an integer" , |
422 | ErrorCodes::UNEXPECTED_AST_STRUCTURE); |
423 | |
424 | Int64 value = value_literal->value.get<Int64>(); |
425 | |
426 | if (value > std::numeric_limits<Int8>::max() || value < std::numeric_limits<Int8>::min()) |
427 | return createExact<DataTypeEnum16>(type_name, arguments); |
428 | } |
429 | |
430 | return createExact<DataTypeEnum8>(type_name, arguments); |
431 | } |
432 | |
433 | void registerDataTypeEnum(DataTypeFactory & factory) |
434 | { |
435 | factory.registerDataType("Enum8" , createExact<DataTypeEnum<Int8>>); |
436 | factory.registerDataType("Enum16" , createExact<DataTypeEnum<Int16>>); |
437 | factory.registerDataType("Enum" , create); |
438 | } |
439 | |
440 | } |
441 | |