1#include "config_core.h"
2#if USE_ICU
3
4#include <Functions/IFunction.h>
5#include <Functions/FunctionFactory.h>
6#include <Functions/FunctionHelpers.h>
7#include <IO/WriteHelpers.h>
8#include <DataTypes/DataTypeString.h>
9#include <Columns/ColumnString.h>
10#include <Columns/ColumnConst.h>
11#include <Common/typeid_cast.h>
12#include <Common/ObjectPool.h>
13#include <ext/range.h>
14
15#include <unicode/ucnv.h>
16#include <string>
17#include <memory>
18
19
20namespace DB
21{
22
/// Error codes referenced by this translation unit; the definitions live elsewhere in the project.
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
    extern const int LOGICAL_ERROR;
    /// Thrown by getReturnTypeImpl when an argument is not a String.
    /// Declared here explicitly so the file does not depend on another header declaring it.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int CANNOT_CREATE_CHARSET_CONVERTER;
    extern const int CANNOT_CONVERT_CHARSET;
    extern const int ILLEGAL_COLUMN;
}
31
32
33/** convertCharset(s, from, to)
34 *
35 * Assuming string 's' contains bytes in charset 'from',
36 * returns another string with bytes, representing same content in charset 'to'.
37 * from and to must be constants.
38 *
39 * When bytes are illegal in 'from' charset or are not representable in 'to' charset,
40 * behavior is implementation specific.
41 */
42class FunctionConvertCharset : public IFunction
43{
44private:
45 struct Converter : private boost::noncopyable
46 {
47 UConverter * impl;
48
49 explicit Converter(const String & charset)
50 {
51 UErrorCode status = U_ZERO_ERROR;
52 impl = ucnv_open(charset.data(), &status);
53
54 if (U_SUCCESS(status))
55 ucnv_setToUCallBack(impl,
56 UCNV_TO_U_CALLBACK_SUBSTITUTE,
57 nullptr,
58 nullptr, nullptr,
59 &status);
60
61 if (U_SUCCESS(status))
62 ucnv_setFromUCallBack(impl,
63 UCNV_FROM_U_CALLBACK_SUBSTITUTE,
64 nullptr,
65 nullptr, nullptr,
66 &status);
67
68 if (!U_SUCCESS(status))
69 throw Exception("Cannot create UConverter with charset " + charset + ", error: " + String(u_errorName(status)),
70 ErrorCodes::CANNOT_CREATE_CHARSET_CONVERTER);
71 }
72
73 ~Converter()
74 {
75 ucnv_close(impl);
76 }
77 };
78
79 /// Separate converter is created for each thread.
80 using Pool = ObjectPoolMap<Converter, String>;
81
82 Pool::Pointer getConverter(const String & charset)
83 {
84 static Pool pool;
85 return pool.get(charset, [&charset] { return new Converter(charset); });
86 }
87
88 void convert(const String & from_charset, const String & to_charset,
89 const ColumnString::Chars & from_chars, const ColumnString::Offsets & from_offsets,
90 ColumnString::Chars & to_chars, ColumnString::Offsets & to_offsets)
91 {
92 auto converter_from = getConverter(from_charset);
93 auto converter_to = getConverter(to_charset);
94
95 ColumnString::Offset current_from_offset = 0;
96 ColumnString::Offset current_to_offset = 0;
97
98 size_t size = from_offsets.size();
99 to_offsets.resize(size);
100
101 PODArray<UChar> uchars;
102
103 for (size_t i = 0; i < size; ++i)
104 {
105 size_t from_string_size = from_offsets[i] - current_from_offset - 1;
106
107 /// We assume that empty string is empty in every charset.
108 if (0 != from_string_size)
109 {
110 /// reset state of converter
111 ucnv_reset(converter_from->impl);
112 ucnv_reset(converter_to->impl);
113
114 /// maximum number of code points is number of bytes in input string plus one for terminating zero
115 uchars.resize(from_string_size + 1);
116
117 UErrorCode status = U_ZERO_ERROR;
118 int32_t res = ucnv_toUChars(
119 converter_from->impl,
120 uchars.data(), uchars.size(),
121 reinterpret_cast<const char *>(&from_chars[current_from_offset]), from_string_size,
122 &status);
123
124 if (!U_SUCCESS(status))
125 throw Exception("Cannot convert from charset " + from_charset + ", error: " + String(u_errorName(status)),
126 ErrorCodes::CANNOT_CONVERT_CHARSET);
127
128 auto max_to_char_size = ucnv_getMaxCharSize(converter_to->impl);
129 auto max_to_size = UCNV_GET_MAX_BYTES_FOR_STRING(res, max_to_char_size);
130
131 to_chars.resize(current_to_offset + max_to_size);
132
133 res = ucnv_fromUChars(
134 converter_to->impl,
135 reinterpret_cast<char *>(&to_chars[current_to_offset]), max_to_size,
136 uchars.data(), res,
137 &status);
138
139 if (!U_SUCCESS(status))
140 throw Exception("Cannot convert to charset " + to_charset + ", error: " + String(u_errorName(status)),
141 ErrorCodes::CANNOT_CONVERT_CHARSET);
142
143 current_to_offset += res;
144 }
145
146 if (to_chars.size() < current_to_offset + 1)
147 to_chars.resize(current_to_offset + 1);
148
149 to_chars[current_to_offset] = 0;
150
151 ++current_to_offset;
152 to_offsets[i] = current_to_offset;
153
154 current_from_offset = from_offsets[i];
155 }
156
157 to_chars.resize(current_to_offset);
158 }
159
160public:
161 static constexpr auto name = "convertCharset";
162 static FunctionPtr create(const Context &) { return std::make_shared<FunctionConvertCharset>(); }
163
164 String getName() const override
165 {
166 return name;
167 }
168
169 size_t getNumberOfArguments() const override { return 3; }
170
171 DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
172 {
173 for (size_t i : ext::range(0, 3))
174 if (!isString(arguments[i]))
175 throw Exception("Illegal type " + arguments[i]->getName() + " of argument of function " + getName()
176 + ", must be String", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
177
178 return std::make_shared<DataTypeString>();
179 }
180
181 bool useDefaultImplementationForConstants() const override { return true; }
182 ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; }
183
184 void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
185 {
186 const ColumnWithTypeAndName & arg_from = block.getByPosition(arguments[0]);
187 const ColumnWithTypeAndName & arg_charset_from = block.getByPosition(arguments[1]);
188 const ColumnWithTypeAndName & arg_charset_to = block.getByPosition(arguments[2]);
189
190 const ColumnConst * col_charset_from = checkAndGetColumnConstStringOrFixedString(arg_charset_from.column.get());
191 const ColumnConst * col_charset_to = checkAndGetColumnConstStringOrFixedString(arg_charset_to.column.get());
192
193 if (!col_charset_from || !col_charset_to)
194 throw Exception("2nd and 3rd arguments of function " + getName() + " (source charset and destination charset) must be constant strings.",
195 ErrorCodes::ILLEGAL_COLUMN);
196
197 String charset_from = col_charset_from->getValue<String>();
198 String charset_to = col_charset_to->getValue<String>();
199
200 if (const ColumnString * col_from = checkAndGetColumn<ColumnString>(arg_from.column.get()))
201 {
202 auto col_to = ColumnString::create();
203 convert(charset_from, charset_to, col_from->getChars(), col_from->getOffsets(), col_to->getChars(), col_to->getOffsets());
204 block.getByPosition(result).column = std::move(col_to);
205 }
206 else
207 throw Exception("Illegal column passed as first argument of function " + getName() + " (must be ColumnString).",
208 ErrorCodes::ILLEGAL_COLUMN);
209 }
210};
211
212
213void registerFunctionConvertCharset(FunctionFactory & factory)
214{
215 factory.registerFunction<FunctionConvertCharset>();
216}
217
218}
219
220#endif
221