| 1 | #include "config_core.h" | 
|---|
| 2 | #if USE_ICU | 
|---|
| 3 |  | 
|---|
| 4 | #include <Functions/IFunction.h> | 
|---|
| 5 | #include <Functions/FunctionFactory.h> | 
|---|
| 6 | #include <Functions/FunctionHelpers.h> | 
|---|
| 7 | #include <IO/WriteHelpers.h> | 
|---|
| 8 | #include <DataTypes/DataTypeString.h> | 
|---|
| 9 | #include <Columns/ColumnString.h> | 
|---|
| 10 | #include <Columns/ColumnConst.h> | 
|---|
| 11 | #include <Common/typeid_cast.h> | 
|---|
| 12 | #include <Common/ObjectPool.h> | 
|---|
| 13 | #include <ext/range.h> | 
|---|
| 14 |  | 
|---|
| 15 | #include <unicode/ucnv.h> | 
|---|
| 16 | #include <string> | 
|---|
| 17 | #include <memory> | 
|---|
| 18 |  | 
|---|
| 19 |  | 
|---|
| 20 | namespace DB | 
|---|
| 21 | { | 
|---|
| 22 |  | 
|---|
/// Error codes referenced in this file. These are declarations only;
/// the definitions live elsewhere in the project.
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
    extern const int LOGICAL_ERROR;
    /// Used by getReturnTypeImpl(); declared here explicitly so the file does not
    /// depend on another header happening to declare it.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int CANNOT_CREATE_CHARSET_CONVERTER;
    extern const int CANNOT_CONVERT_CHARSET;
    extern const int ILLEGAL_COLUMN;
}
|---|
| 31 |  | 
|---|
| 32 |  | 
|---|
| 33 | /** convertCharset(s, from, to) | 
|---|
| 34 | * | 
|---|
| 35 | * Assuming string 's' contains bytes in charset 'from', | 
|---|
| 36 | *  returns another string with bytes, representing same content in charset 'to'. | 
|---|
| 37 | * from and to must be constants. | 
|---|
| 38 | * | 
|---|
| 39 | * When bytes are illegal in 'from' charset or are not representable in 'to' charset, | 
|---|
| 40 | *  behavior is implementation specific. | 
|---|
| 41 | */ | 
|---|
| 42 | class FunctionConvertCharset : public IFunction | 
|---|
| 43 | { | 
|---|
| 44 | private: | 
|---|
| 45 | struct Converter : private boost::noncopyable | 
|---|
| 46 | { | 
|---|
| 47 | UConverter * impl; | 
|---|
| 48 |  | 
|---|
| 49 | explicit Converter(const String & charset) | 
|---|
| 50 | { | 
|---|
| 51 | UErrorCode status = U_ZERO_ERROR; | 
|---|
| 52 | impl = ucnv_open(charset.data(), &status); | 
|---|
| 53 |  | 
|---|
| 54 | if (U_SUCCESS(status)) | 
|---|
| 55 | ucnv_setToUCallBack(impl, | 
|---|
| 56 | UCNV_TO_U_CALLBACK_SUBSTITUTE, | 
|---|
| 57 | nullptr, | 
|---|
| 58 | nullptr, nullptr, | 
|---|
| 59 | &status); | 
|---|
| 60 |  | 
|---|
| 61 | if (U_SUCCESS(status)) | 
|---|
| 62 | ucnv_setFromUCallBack(impl, | 
|---|
| 63 | UCNV_FROM_U_CALLBACK_SUBSTITUTE, | 
|---|
| 64 | nullptr, | 
|---|
| 65 | nullptr, nullptr, | 
|---|
| 66 | &status); | 
|---|
| 67 |  | 
|---|
| 68 | if (!U_SUCCESS(status)) | 
|---|
| 69 | throw Exception( "Cannot create UConverter with charset "+ charset + ", error: "+ String(u_errorName(status)), | 
|---|
| 70 | ErrorCodes::CANNOT_CREATE_CHARSET_CONVERTER); | 
|---|
| 71 | } | 
|---|
| 72 |  | 
|---|
| 73 | ~Converter() | 
|---|
| 74 | { | 
|---|
| 75 | ucnv_close(impl); | 
|---|
| 76 | } | 
|---|
| 77 | }; | 
|---|
| 78 |  | 
|---|
| 79 | /// Separate converter is created for each thread. | 
|---|
| 80 | using Pool = ObjectPoolMap<Converter, String>; | 
|---|
| 81 |  | 
|---|
| 82 | Pool::Pointer getConverter(const String & charset) | 
|---|
| 83 | { | 
|---|
| 84 | static Pool pool; | 
|---|
| 85 | return pool.get(charset, [&charset] { return new Converter(charset); }); | 
|---|
| 86 | } | 
|---|
| 87 |  | 
|---|
| 88 | void convert(const String & from_charset, const String & to_charset, | 
|---|
| 89 | const ColumnString::Chars & from_chars, const ColumnString::Offsets & from_offsets, | 
|---|
| 90 | ColumnString::Chars & to_chars, ColumnString::Offsets & to_offsets) | 
|---|
| 91 | { | 
|---|
| 92 | auto converter_from = getConverter(from_charset); | 
|---|
| 93 | auto converter_to = getConverter(to_charset); | 
|---|
| 94 |  | 
|---|
| 95 | ColumnString::Offset current_from_offset = 0; | 
|---|
| 96 | ColumnString::Offset current_to_offset = 0; | 
|---|
| 97 |  | 
|---|
| 98 | size_t size = from_offsets.size(); | 
|---|
| 99 | to_offsets.resize(size); | 
|---|
| 100 |  | 
|---|
| 101 | PODArray<UChar> uchars; | 
|---|
| 102 |  | 
|---|
| 103 | for (size_t i = 0; i < size; ++i) | 
|---|
| 104 | { | 
|---|
| 105 | size_t from_string_size = from_offsets[i] - current_from_offset - 1; | 
|---|
| 106 |  | 
|---|
| 107 | /// We assume that empty string is empty in every charset. | 
|---|
| 108 | if (0 != from_string_size) | 
|---|
| 109 | { | 
|---|
| 110 | /// reset state of converter | 
|---|
| 111 | ucnv_reset(converter_from->impl); | 
|---|
| 112 | ucnv_reset(converter_to->impl); | 
|---|
| 113 |  | 
|---|
| 114 | /// maximum number of code points is number of bytes in input string plus one for terminating zero | 
|---|
| 115 | uchars.resize(from_string_size + 1); | 
|---|
| 116 |  | 
|---|
| 117 | UErrorCode status = U_ZERO_ERROR; | 
|---|
| 118 | int32_t res = ucnv_toUChars( | 
|---|
| 119 | converter_from->impl, | 
|---|
| 120 | uchars.data(), uchars.size(), | 
|---|
| 121 | reinterpret_cast<const char *>(&from_chars[current_from_offset]), from_string_size, | 
|---|
| 122 | &status); | 
|---|
| 123 |  | 
|---|
| 124 | if (!U_SUCCESS(status)) | 
|---|
| 125 | throw Exception( "Cannot convert from charset "+ from_charset + ", error: "+ String(u_errorName(status)), | 
|---|
| 126 | ErrorCodes::CANNOT_CONVERT_CHARSET); | 
|---|
| 127 |  | 
|---|
| 128 | auto max_to_char_size = ucnv_getMaxCharSize(converter_to->impl); | 
|---|
| 129 | auto max_to_size = UCNV_GET_MAX_BYTES_FOR_STRING(res, max_to_char_size); | 
|---|
| 130 |  | 
|---|
| 131 | to_chars.resize(current_to_offset + max_to_size); | 
|---|
| 132 |  | 
|---|
| 133 | res = ucnv_fromUChars( | 
|---|
| 134 | converter_to->impl, | 
|---|
| 135 | reinterpret_cast<char *>(&to_chars[current_to_offset]), max_to_size, | 
|---|
| 136 | uchars.data(), res, | 
|---|
| 137 | &status); | 
|---|
| 138 |  | 
|---|
| 139 | if (!U_SUCCESS(status)) | 
|---|
| 140 | throw Exception( "Cannot convert to charset "+ to_charset + ", error: "+ String(u_errorName(status)), | 
|---|
| 141 | ErrorCodes::CANNOT_CONVERT_CHARSET); | 
|---|
| 142 |  | 
|---|
| 143 | current_to_offset += res; | 
|---|
| 144 | } | 
|---|
| 145 |  | 
|---|
| 146 | if (to_chars.size() < current_to_offset + 1) | 
|---|
| 147 | to_chars.resize(current_to_offset + 1); | 
|---|
| 148 |  | 
|---|
| 149 | to_chars[current_to_offset] = 0; | 
|---|
| 150 |  | 
|---|
| 151 | ++current_to_offset; | 
|---|
| 152 | to_offsets[i] = current_to_offset; | 
|---|
| 153 |  | 
|---|
| 154 | current_from_offset = from_offsets[i]; | 
|---|
| 155 | } | 
|---|
| 156 |  | 
|---|
| 157 | to_chars.resize(current_to_offset); | 
|---|
| 158 | } | 
|---|
| 159 |  | 
|---|
| 160 | public: | 
|---|
| 161 | static constexpr auto name = "convertCharset"; | 
|---|
| 162 | static FunctionPtr create(const Context &) { return std::make_shared<FunctionConvertCharset>(); } | 
|---|
| 163 |  | 
|---|
| 164 | String getName() const override | 
|---|
| 165 | { | 
|---|
| 166 | return name; | 
|---|
| 167 | } | 
|---|
| 168 |  | 
|---|
| 169 | size_t getNumberOfArguments() const override { return 3; } | 
|---|
| 170 |  | 
|---|
| 171 | DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override | 
|---|
| 172 | { | 
|---|
| 173 | for (size_t i : ext::range(0, 3)) | 
|---|
| 174 | if (!isString(arguments[i])) | 
|---|
| 175 | throw Exception( "Illegal type "+ arguments[i]->getName() + " of argument of function "+ getName() | 
|---|
| 176 | + ", must be String", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); | 
|---|
| 177 |  | 
|---|
| 178 | return std::make_shared<DataTypeString>(); | 
|---|
| 179 | } | 
|---|
| 180 |  | 
|---|
| 181 | bool useDefaultImplementationForConstants() const override { return true; } | 
|---|
| 182 | ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; } | 
|---|
| 183 |  | 
|---|
| 184 | void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override | 
|---|
| 185 | { | 
|---|
| 186 | const ColumnWithTypeAndName & arg_from = block.getByPosition(arguments[0]); | 
|---|
| 187 | const ColumnWithTypeAndName & arg_charset_from = block.getByPosition(arguments[1]); | 
|---|
| 188 | const ColumnWithTypeAndName & arg_charset_to = block.getByPosition(arguments[2]); | 
|---|
| 189 |  | 
|---|
| 190 | const ColumnConst * col_charset_from = checkAndGetColumnConstStringOrFixedString(arg_charset_from.column.get()); | 
|---|
| 191 | const ColumnConst * col_charset_to = checkAndGetColumnConstStringOrFixedString(arg_charset_to.column.get()); | 
|---|
| 192 |  | 
|---|
| 193 | if (!col_charset_from || !col_charset_to) | 
|---|
| 194 | throw Exception( "2nd and 3rd arguments of function "+ getName() + " (source charset and destination charset) must be constant strings.", | 
|---|
| 195 | ErrorCodes::ILLEGAL_COLUMN); | 
|---|
| 196 |  | 
|---|
| 197 | String charset_from = col_charset_from->getValue<String>(); | 
|---|
| 198 | String charset_to = col_charset_to->getValue<String>(); | 
|---|
| 199 |  | 
|---|
| 200 | if (const ColumnString * col_from = checkAndGetColumn<ColumnString>(arg_from.column.get())) | 
|---|
| 201 | { | 
|---|
| 202 | auto col_to = ColumnString::create(); | 
|---|
| 203 | convert(charset_from, charset_to, col_from->getChars(), col_from->getOffsets(), col_to->getChars(), col_to->getOffsets()); | 
|---|
| 204 | block.getByPosition(result).column = std::move(col_to); | 
|---|
| 205 | } | 
|---|
| 206 | else | 
|---|
| 207 | throw Exception( "Illegal column passed as first argument of function "+ getName() + " (must be ColumnString).", | 
|---|
| 208 | ErrorCodes::ILLEGAL_COLUMN); | 
|---|
| 209 | } | 
|---|
| 210 | }; | 
|---|
| 211 |  | 
|---|
| 212 |  | 
|---|
| 213 | void registerFunctionConvertCharset(FunctionFactory & factory) | 
|---|
| 214 | { | 
|---|
| 215 | factory.registerFunction<FunctionConvertCharset>(); | 
|---|
| 216 | } | 
|---|
| 217 |  | 
|---|
| 218 | } | 
|---|
| 219 |  | 
|---|
| 220 | #endif | 
|---|
| 221 |  | 
|---|