1#include <IO/WriteBufferFromString.h>
2#include <Formats/FormatSettings.h>
3#include <Formats/ProtobufReader.h>
4#include <Formats/ProtobufWriter.h>
5#include <DataTypes/DataTypeEnum.h>
6#include <DataTypes/DataTypeFactory.h>
7#include <Parsers/IAST.h>
8#include <Parsers/ASTFunction.h>
9#include <Parsers/ASTLiteral.h>
10#include <Common/typeid_cast.h>
11#include <Common/assert_cast.h>
12#include <Common/UTF8Helpers.h>
13#include <Poco/UTF8Encoding.h>
14
15#include <limits>
16
17
18namespace DB
19{
20
/// Error codes thrown from this file. They are only declared here (extern); the definitions live elsewhere.
namespace ErrorCodes
{
    /// Used by checkOverflow, castToName and castToValue below.
    extern const int BAD_TYPE_OF_FIELD;
    extern const int SYNTAX_ERROR;
    extern const int EMPTY_DATA_PASSED;
    extern const int UNEXPECTED_AST_STRUCTURE;
    extern const int ARGUMENT_OUT_OF_BOUND;
}
28
29
30template <typename FieldType> struct EnumName;
31template <> struct EnumName<Int8> { static constexpr auto value = "Enum8"; };
32template <> struct EnumName<Int16> { static constexpr auto value = "Enum16"; };
33
34
35template <typename Type>
36const char * DataTypeEnum<Type>::getFamilyName() const
37{
38 return EnumName<FieldType>::value;
39}
40
41
42template <typename Type>
43std::string DataTypeEnum<Type>::generateName(const Values & values)
44{
45 WriteBufferFromOwnString out;
46
47 writeString(EnumName<FieldType>::value, out);
48 writeChar('(', out);
49
50 auto first = true;
51 for (const auto & name_and_value : values)
52 {
53 if (!first)
54 writeString(", ", out);
55
56 first = false;
57
58 writeQuotedString(name_and_value.first, out);
59 writeString(" = ", out);
60 writeText(name_and_value.second, out);
61 }
62
63 writeChar(')', out);
64
65 return out.str();
66}
67
68template <typename Type>
69void DataTypeEnum<Type>::fillMaps()
70{
71 for (const auto & name_and_value : values)
72 {
73 const auto inserted_value = name_to_value_map.insert(
74 { StringRef{name_and_value.first}, name_and_value.second });
75
76 if (!inserted_value.second)
77 throw Exception{"Duplicate names in enum: '" + name_and_value.first + "' = " + toString(name_and_value.second)
78 + " and " + toString(inserted_value.first->getMapped()),
79 ErrorCodes::SYNTAX_ERROR};
80
81 const auto inserted_name = value_to_name_map.insert(
82 { name_and_value.second, StringRef{name_and_value.first} });
83
84 if (!inserted_name.second)
85 throw Exception{"Duplicate values in enum: '" + name_and_value.first + "' = " + toString(name_and_value.second)
86 + " and '" + toString((*inserted_name.first).first) + "'",
87 ErrorCodes::SYNTAX_ERROR};
88 }
89}
90
91template <typename Type>
92DataTypeEnum<Type>::DataTypeEnum(const Values & values_) : values{values_}
93{
94 if (values.empty())
95 throw Exception{"DataTypeEnum enumeration cannot be empty", ErrorCodes::EMPTY_DATA_PASSED};
96
97 std::sort(std::begin(values), std::end(values), [] (auto & left, auto & right)
98 {
99 return left.second < right.second;
100 });
101
102 fillMaps();
103 type_name = generateName(values);
104}
105
106template <typename Type>
107void DataTypeEnum<Type>::serializeBinary(const Field & field, WriteBuffer & ostr) const
108{
109 const FieldType x = get<NearestFieldType<FieldType>>(field);
110 writeBinary(x, ostr);
111}
112
113template <typename Type>
114void DataTypeEnum<Type>::deserializeBinary(Field & field, ReadBuffer & istr) const
115{
116 FieldType x;
117 readBinary(x, istr);
118 field = castToNearestFieldType(x);
119}
120
121template <typename Type>
122void DataTypeEnum<Type>::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const
123{
124 writeBinary(assert_cast<const ColumnType &>(column).getData()[row_num], ostr);
125}
126
127template <typename Type>
128void DataTypeEnum<Type>::deserializeBinary(IColumn & column, ReadBuffer & istr) const
129{
130 typename ColumnType::ValueType x;
131 readBinary(x, istr);
132 assert_cast<ColumnType &>(column).getData().push_back(x);
133}
134
135template <typename Type>
136void DataTypeEnum<Type>::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
137{
138 writeString(getNameForValue(assert_cast<const ColumnType &>(column).getData()[row_num]), ostr);
139}
140
141template <typename Type>
142void DataTypeEnum<Type>::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
143{
144 writeEscapedString(getNameForValue(assert_cast<const ColumnType &>(column).getData()[row_num]), ostr);
145}
146
147template <typename Type>
148void DataTypeEnum<Type>::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
149{
150 /// NOTE It would be nice to do without creating a temporary object - at least extract std::string out.
151 std::string field_name;
152 readEscapedString(field_name, istr);
153 assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
154}
155
156template <typename Type>
157void DataTypeEnum<Type>::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
158{
159 writeQuotedString(getNameForValue(assert_cast<const ColumnType &>(column).getData()[row_num]), ostr);
160}
161
162template <typename Type>
163void DataTypeEnum<Type>::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
164{
165 std::string field_name;
166 readQuotedStringWithSQLStyle(field_name, istr);
167 assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
168}
169
170template <typename Type>
171void DataTypeEnum<Type>::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
172{
173 std::string field_name;
174 readString(field_name, istr);
175 assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
176}
177
178template <typename Type>
179void DataTypeEnum<Type>::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
180{
181 writeJSONString(getNameForValue(assert_cast<const ColumnType &>(column).getData()[row_num]), ostr, settings);
182}
183
184template <typename Type>
185void DataTypeEnum<Type>::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
186{
187 writeXMLString(getNameForValue(assert_cast<const ColumnType &>(column).getData()[row_num]), ostr);
188}
189
190template <typename Type>
191void DataTypeEnum<Type>::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
192{
193 std::string field_name;
194 readJSONString(field_name, istr);
195 assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
196}
197
198template <typename Type>
199void DataTypeEnum<Type>::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
200{
201 writeCSVString(getNameForValue(assert_cast<const ColumnType &>(column).getData()[row_num]), ostr);
202}
203
204template <typename Type>
205void DataTypeEnum<Type>::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
206{
207 std::string field_name;
208 readCSVString(field_name, istr, settings.csv);
209 assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name)));
210}
211
212template <typename Type>
213void DataTypeEnum<Type>::serializeBinaryBulk(
214 const IColumn & column, WriteBuffer & ostr, const size_t offset, size_t limit) const
215{
216 const auto & x = typeid_cast<const ColumnType &>(column).getData();
217 const auto size = x.size();
218
219 if (limit == 0 || offset + limit > size)
220 limit = size - offset;
221
222 ostr.write(reinterpret_cast<const char *>(&x[offset]), sizeof(FieldType) * limit);
223}
224
225template <typename Type>
226void DataTypeEnum<Type>::deserializeBinaryBulk(
227 IColumn & column, ReadBuffer & istr, const size_t limit, const double /*avg_value_size_hint*/) const
228{
229 auto & x = typeid_cast<ColumnType &>(column).getData();
230 const auto initial_size = x.size();
231 x.resize(initial_size + limit);
232 const auto size = istr.readBig(reinterpret_cast<char*>(&x[initial_size]), sizeof(FieldType) * limit);
233 x.resize(initial_size + size / sizeof(FieldType));
234}
235
236template <typename Type>
237void DataTypeEnum<Type>::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const
238{
239 if (value_index)
240 return;
241 protobuf.prepareEnumMapping(values);
242 value_index = static_cast<bool>(protobuf.writeEnum(assert_cast<const ColumnType &>(column).getData()[row_num]));
243}
244
245template<typename Type>
246void DataTypeEnum<Type>::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const
247{
248 protobuf.prepareEnumMapping(values);
249 row_added = false;
250 Type value;
251 if (!protobuf.readEnum(value))
252 return;
253
254 auto & container = assert_cast<ColumnType &>(column).getData();
255 if (allow_add_row)
256 {
257 container.emplace_back(value);
258 row_added = true;
259 }
260 else
261 container.back() = value;
262}
263
264template <typename Type>
265Field DataTypeEnum<Type>::getDefault() const
266{
267 return values.front().second;
268}
269
270template <typename Type>
271void DataTypeEnum<Type>::insertDefaultInto(IColumn & column) const
272{
273 assert_cast<ColumnType &>(column).getData().push_back(values.front().second);
274}
275
276template <typename Type>
277bool DataTypeEnum<Type>::equals(const IDataType & rhs) const
278{
279 return typeid(rhs) == typeid(*this) && type_name == static_cast<const DataTypeEnum<Type> &>(rhs).type_name;
280}
281
282
283template <typename Type>
284bool DataTypeEnum<Type>::textCanContainOnlyValidUTF8() const
285{
286 for (const auto & elem : values)
287 {
288 const char * pos = elem.first.data();
289 const char * end = pos + elem.first.size();
290 while (pos < end)
291 {
292 size_t length = UTF8::seqLength(*pos);
293 if (pos + length > end)
294 return false;
295
296 if (Poco::UTF8Encoding::isLegal(reinterpret_cast<const unsigned char *>(pos), length))
297 pos += length;
298 else
299 return false;
300 }
301 }
302 return true;
303}
304
305template <typename Type>
306static void checkOverflow(Int64 value)
307{
308 if (!(std::numeric_limits<Type>::min() <= value && value <= std::numeric_limits<Type>::max()))
309 throw Exception("DataTypeEnum: Unexpected value " + toString(value), ErrorCodes::BAD_TYPE_OF_FIELD);
310}
311
312template <typename Type>
313Field DataTypeEnum<Type>::castToName(const Field & value_or_name) const
314{
315 if (value_or_name.getType() == Field::Types::String)
316 {
317 getValue(value_or_name.get<String>()); /// Check correctness
318 return value_or_name.get<String>();
319 }
320 else if (value_or_name.getType() == Field::Types::Int64)
321 {
322 Int64 value = value_or_name.get<Int64>();
323 checkOverflow<Type>(value);
324 return getNameForValue(static_cast<Type>(value)).toString();
325 }
326 else
327 throw Exception(String("DataTypeEnum: Unsupported type of field ") + value_or_name.getTypeName(), ErrorCodes::BAD_TYPE_OF_FIELD);
328}
329
330template <typename Type>
331Field DataTypeEnum<Type>::castToValue(const Field & value_or_name) const
332{
333 if (value_or_name.getType() == Field::Types::String)
334 {
335 return getValue(value_or_name.get<String>());
336 }
337 else if (value_or_name.getType() == Field::Types::Int64
338 || value_or_name.getType() == Field::Types::UInt64)
339 {
340 Int64 value = value_or_name.get<Int64>();
341 checkOverflow<Type>(value);
342 getNameForValue(static_cast<Type>(value)); /// Check correctness
343 return value;
344 }
345 else
346 throw Exception(String("DataTypeEnum: Unsupported type of field ") + value_or_name.getTypeName(), ErrorCodes::BAD_TYPE_OF_FIELD);
347}
348
349
350/// Explicit instantiations.
351template class DataTypeEnum<Int8>;
352template class DataTypeEnum<Int16>;
353
354static void checkASTStructure(const ASTPtr & child)
355{
356 const auto * func = child->as<ASTFunction>();
357 if (!func
358 || func->name != "equals"
359 || func->parameters
360 || !func->arguments
361 || func->arguments->children.size() != 2)
362 throw Exception("Elements of Enum data type must be of form: 'name' = number, where name is string literal and number is an integer",
363 ErrorCodes::UNEXPECTED_AST_STRUCTURE);
364}
365
366template <typename DataTypeEnum>
367static DataTypePtr createExact(const String & /*type_name*/, const ASTPtr & arguments)
368{
369 if (!arguments || arguments->children.empty())
370 throw Exception("Enum data type cannot be empty", ErrorCodes::EMPTY_DATA_PASSED);
371
372 typename DataTypeEnum::Values values;
373 values.reserve(arguments->children.size());
374
375 using FieldType = typename DataTypeEnum::FieldType;
376
377 /// Children must be functions 'equals' with string literal as left argument and numeric literal as right argument.
378 for (const ASTPtr & child : arguments->children)
379 {
380 checkASTStructure(child);
381
382 const auto * func = child->as<ASTFunction>();
383 const auto * name_literal = func->arguments->children[0]->as<ASTLiteral>();
384 const auto * value_literal = func->arguments->children[1]->as<ASTLiteral>();
385
386 if (!name_literal
387 || !value_literal
388 || name_literal->value.getType() != Field::Types::String
389 || (value_literal->value.getType() != Field::Types::UInt64 && value_literal->value.getType() != Field::Types::Int64))
390 throw Exception("Elements of Enum data type must be of form: 'name' = number, where name is string literal and number is an integer",
391 ErrorCodes::UNEXPECTED_AST_STRUCTURE);
392
393 const String & field_name = name_literal->value.get<String>();
394 const auto value = value_literal->value.get<NearestFieldType<FieldType>>();
395
396 if (value > std::numeric_limits<FieldType>::max() || value < std::numeric_limits<FieldType>::min())
397 throw Exception{"Value " + toString(value) + " for element '" + field_name + "' exceeds range of " + EnumName<FieldType>::value,
398 ErrorCodes::ARGUMENT_OUT_OF_BOUND};
399
400 values.emplace_back(field_name, value);
401 }
402
403 return std::make_shared<DataTypeEnum>(values);
404}
405
406static DataTypePtr create(const String & type_name, const ASTPtr & arguments)
407{
408 if (!arguments || arguments->children.empty())
409 throw Exception("Enum data type cannot be empty", ErrorCodes::EMPTY_DATA_PASSED);
410
411 /// Children must be functions 'equals' with string literal as left argument and numeric literal as right argument.
412 for (const ASTPtr & child : arguments->children)
413 {
414 checkASTStructure(child);
415
416 const auto * func = child->as<ASTFunction>();
417 const auto * value_literal = func->arguments->children[1]->as<ASTLiteral>();
418
419 if (!value_literal
420 || (value_literal->value.getType() != Field::Types::UInt64 && value_literal->value.getType() != Field::Types::Int64))
421 throw Exception("Elements of Enum data type must be of form: 'name' = number, where name is string literal and number is an integer",
422 ErrorCodes::UNEXPECTED_AST_STRUCTURE);
423
424 Int64 value = value_literal->value.get<Int64>();
425
426 if (value > std::numeric_limits<Int8>::max() || value < std::numeric_limits<Int8>::min())
427 return createExact<DataTypeEnum16>(type_name, arguments);
428 }
429
430 return createExact<DataTypeEnum8>(type_name, arguments);
431}
432
433void registerDataTypeEnum(DataTypeFactory & factory)
434{
435 factory.registerDataType("Enum8", createExact<DataTypeEnum<Int8>>);
436 factory.registerDataType("Enum16", createExact<DataTypeEnum<Int16>>);
437 factory.registerDataType("Enum", create);
438}
439
440}
441