1 | #include "config_core.h" |
2 | #if USE_ICU |
3 | |
4 | #include <Functions/IFunction.h> |
5 | #include <Functions/FunctionFactory.h> |
6 | #include <Functions/FunctionHelpers.h> |
7 | #include <IO/WriteHelpers.h> |
8 | #include <DataTypes/DataTypeString.h> |
9 | #include <Columns/ColumnString.h> |
10 | #include <Columns/ColumnConst.h> |
11 | #include <Common/typeid_cast.h> |
12 | #include <Common/ObjectPool.h> |
13 | #include <ext/range.h> |
14 | |
15 | #include <unicode/ucnv.h> |
16 | #include <string> |
17 | #include <memory> |
18 | |
19 | |
20 | namespace DB |
21 | { |
22 | |
/// Error codes referenced in this file. They are only declared here (extern); the definitions live elsewhere.
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
    extern const int LOGICAL_ERROR;
    /// Used by getReturnTypeImpl; declared explicitly so this file does not rely on another header declaring it.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int CANNOT_CREATE_CHARSET_CONVERTER;
    extern const int CANNOT_CONVERT_CHARSET;
    extern const int ILLEGAL_COLUMN;
}
31 | |
32 | |
/** convertCharset(s, from, to)
  *
  * Assuming that string 's' contains bytes in charset 'from',
  *  returns another string whose bytes represent the same content in charset 'to'.
  * 'from' and 'to' must be constants.
  *
  * When bytes are illegal in the 'from' charset or are not representable in the 'to' charset,
  *  the behavior is implementation-specific.
  */
42 | class FunctionConvertCharset : public IFunction |
43 | { |
44 | private: |
45 | struct Converter : private boost::noncopyable |
46 | { |
47 | UConverter * impl; |
48 | |
49 | explicit Converter(const String & charset) |
50 | { |
51 | UErrorCode status = U_ZERO_ERROR; |
52 | impl = ucnv_open(charset.data(), &status); |
53 | |
54 | if (U_SUCCESS(status)) |
55 | ucnv_setToUCallBack(impl, |
56 | UCNV_TO_U_CALLBACK_SUBSTITUTE, |
57 | nullptr, |
58 | nullptr, nullptr, |
59 | &status); |
60 | |
61 | if (U_SUCCESS(status)) |
62 | ucnv_setFromUCallBack(impl, |
63 | UCNV_FROM_U_CALLBACK_SUBSTITUTE, |
64 | nullptr, |
65 | nullptr, nullptr, |
66 | &status); |
67 | |
68 | if (!U_SUCCESS(status)) |
69 | throw Exception("Cannot create UConverter with charset " + charset + ", error: " + String(u_errorName(status)), |
70 | ErrorCodes::CANNOT_CREATE_CHARSET_CONVERTER); |
71 | } |
72 | |
73 | ~Converter() |
74 | { |
75 | ucnv_close(impl); |
76 | } |
77 | }; |
78 | |
79 | /// Separate converter is created for each thread. |
80 | using Pool = ObjectPoolMap<Converter, String>; |
81 | |
82 | Pool::Pointer getConverter(const String & charset) |
83 | { |
84 | static Pool pool; |
85 | return pool.get(charset, [&charset] { return new Converter(charset); }); |
86 | } |
87 | |
88 | void convert(const String & from_charset, const String & to_charset, |
89 | const ColumnString::Chars & from_chars, const ColumnString::Offsets & from_offsets, |
90 | ColumnString::Chars & to_chars, ColumnString::Offsets & to_offsets) |
91 | { |
92 | auto converter_from = getConverter(from_charset); |
93 | auto converter_to = getConverter(to_charset); |
94 | |
95 | ColumnString::Offset current_from_offset = 0; |
96 | ColumnString::Offset current_to_offset = 0; |
97 | |
98 | size_t size = from_offsets.size(); |
99 | to_offsets.resize(size); |
100 | |
101 | PODArray<UChar> uchars; |
102 | |
103 | for (size_t i = 0; i < size; ++i) |
104 | { |
105 | size_t from_string_size = from_offsets[i] - current_from_offset - 1; |
106 | |
107 | /// We assume that empty string is empty in every charset. |
108 | if (0 != from_string_size) |
109 | { |
110 | /// reset state of converter |
111 | ucnv_reset(converter_from->impl); |
112 | ucnv_reset(converter_to->impl); |
113 | |
114 | /// maximum number of code points is number of bytes in input string plus one for terminating zero |
115 | uchars.resize(from_string_size + 1); |
116 | |
117 | UErrorCode status = U_ZERO_ERROR; |
118 | int32_t res = ucnv_toUChars( |
119 | converter_from->impl, |
120 | uchars.data(), uchars.size(), |
121 | reinterpret_cast<const char *>(&from_chars[current_from_offset]), from_string_size, |
122 | &status); |
123 | |
124 | if (!U_SUCCESS(status)) |
125 | throw Exception("Cannot convert from charset " + from_charset + ", error: " + String(u_errorName(status)), |
126 | ErrorCodes::CANNOT_CONVERT_CHARSET); |
127 | |
128 | auto max_to_char_size = ucnv_getMaxCharSize(converter_to->impl); |
129 | auto max_to_size = UCNV_GET_MAX_BYTES_FOR_STRING(res, max_to_char_size); |
130 | |
131 | to_chars.resize(current_to_offset + max_to_size); |
132 | |
133 | res = ucnv_fromUChars( |
134 | converter_to->impl, |
135 | reinterpret_cast<char *>(&to_chars[current_to_offset]), max_to_size, |
136 | uchars.data(), res, |
137 | &status); |
138 | |
139 | if (!U_SUCCESS(status)) |
140 | throw Exception("Cannot convert to charset " + to_charset + ", error: " + String(u_errorName(status)), |
141 | ErrorCodes::CANNOT_CONVERT_CHARSET); |
142 | |
143 | current_to_offset += res; |
144 | } |
145 | |
146 | if (to_chars.size() < current_to_offset + 1) |
147 | to_chars.resize(current_to_offset + 1); |
148 | |
149 | to_chars[current_to_offset] = 0; |
150 | |
151 | ++current_to_offset; |
152 | to_offsets[i] = current_to_offset; |
153 | |
154 | current_from_offset = from_offsets[i]; |
155 | } |
156 | |
157 | to_chars.resize(current_to_offset); |
158 | } |
159 | |
160 | public: |
161 | static constexpr auto name = "convertCharset" ; |
162 | static FunctionPtr create(const Context &) { return std::make_shared<FunctionConvertCharset>(); } |
163 | |
164 | String getName() const override |
165 | { |
166 | return name; |
167 | } |
168 | |
169 | size_t getNumberOfArguments() const override { return 3; } |
170 | |
171 | DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override |
172 | { |
173 | for (size_t i : ext::range(0, 3)) |
174 | if (!isString(arguments[i])) |
175 | throw Exception("Illegal type " + arguments[i]->getName() + " of argument of function " + getName() |
176 | + ", must be String" , ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
177 | |
178 | return std::make_shared<DataTypeString>(); |
179 | } |
180 | |
181 | bool useDefaultImplementationForConstants() const override { return true; } |
182 | ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; } |
183 | |
184 | void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override |
185 | { |
186 | const ColumnWithTypeAndName & arg_from = block.getByPosition(arguments[0]); |
187 | const ColumnWithTypeAndName & arg_charset_from = block.getByPosition(arguments[1]); |
188 | const ColumnWithTypeAndName & arg_charset_to = block.getByPosition(arguments[2]); |
189 | |
190 | const ColumnConst * col_charset_from = checkAndGetColumnConstStringOrFixedString(arg_charset_from.column.get()); |
191 | const ColumnConst * col_charset_to = checkAndGetColumnConstStringOrFixedString(arg_charset_to.column.get()); |
192 | |
193 | if (!col_charset_from || !col_charset_to) |
194 | throw Exception("2nd and 3rd arguments of function " + getName() + " (source charset and destination charset) must be constant strings." , |
195 | ErrorCodes::ILLEGAL_COLUMN); |
196 | |
197 | String charset_from = col_charset_from->getValue<String>(); |
198 | String charset_to = col_charset_to->getValue<String>(); |
199 | |
200 | if (const ColumnString * col_from = checkAndGetColumn<ColumnString>(arg_from.column.get())) |
201 | { |
202 | auto col_to = ColumnString::create(); |
203 | convert(charset_from, charset_to, col_from->getChars(), col_from->getOffsets(), col_to->getChars(), col_to->getOffsets()); |
204 | block.getByPosition(result).column = std::move(col_to); |
205 | } |
206 | else |
207 | throw Exception("Illegal column passed as first argument of function " + getName() + " (must be ColumnString)." , |
208 | ErrorCodes::ILLEGAL_COLUMN); |
209 | } |
210 | }; |
211 | |
212 | |
213 | void registerFunctionConvertCharset(FunctionFactory & factory) |
214 | { |
215 | factory.registerFunction<FunctionConvertCharset>(); |
216 | } |
217 | |
218 | } |
219 | |
220 | #endif |
221 | |