| 1 | #include "config_core.h" |
| 2 | #if USE_ICU |
| 3 | |
| 4 | #include <Functions/IFunction.h> |
| 5 | #include <Functions/FunctionFactory.h> |
| 6 | #include <Functions/FunctionHelpers.h> |
| 7 | #include <IO/WriteHelpers.h> |
| 8 | #include <DataTypes/DataTypeString.h> |
| 9 | #include <Columns/ColumnString.h> |
| 10 | #include <Columns/ColumnConst.h> |
| 11 | #include <Common/typeid_cast.h> |
| 12 | #include <Common/ObjectPool.h> |
| 13 | #include <ext/range.h> |
| 14 | |
| 15 | #include <unicode/ucnv.h> |
| 16 | #include <string> |
| 17 | #include <memory> |
| 18 | |
| 19 | |
| 20 | namespace DB |
| 21 | { |
| 22 | |
/// Error codes referenced in this file. The definitions live elsewhere in the project;
/// only declarations are needed here.
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
    extern const int LOGICAL_ERROR;
    extern const int CANNOT_CREATE_CHARSET_CONVERTER;
    extern const int CANNOT_CONVERT_CHARSET;
    extern const int ILLEGAL_COLUMN;
    /// Used by getReturnTypeImpl; declared explicitly so this file does not rely on
    /// another header happening to declare it.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
| 31 | |
| 32 | |
| 33 | /** convertCharset(s, from, to) |
| 34 | * |
| 35 | * Assuming string 's' contains bytes in charset 'from', |
| 36 | * returns another string with bytes, representing same content in charset 'to'. |
| 37 | * from and to must be constants. |
| 38 | * |
| 39 | * When bytes are illegal in 'from' charset or are not representable in 'to' charset, |
| 40 | * behavior is implementation specific. |
| 41 | */ |
| 42 | class FunctionConvertCharset : public IFunction |
| 43 | { |
| 44 | private: |
| 45 | struct Converter : private boost::noncopyable |
| 46 | { |
| 47 | UConverter * impl; |
| 48 | |
| 49 | explicit Converter(const String & charset) |
| 50 | { |
| 51 | UErrorCode status = U_ZERO_ERROR; |
| 52 | impl = ucnv_open(charset.data(), &status); |
| 53 | |
| 54 | if (U_SUCCESS(status)) |
| 55 | ucnv_setToUCallBack(impl, |
| 56 | UCNV_TO_U_CALLBACK_SUBSTITUTE, |
| 57 | nullptr, |
| 58 | nullptr, nullptr, |
| 59 | &status); |
| 60 | |
| 61 | if (U_SUCCESS(status)) |
| 62 | ucnv_setFromUCallBack(impl, |
| 63 | UCNV_FROM_U_CALLBACK_SUBSTITUTE, |
| 64 | nullptr, |
| 65 | nullptr, nullptr, |
| 66 | &status); |
| 67 | |
| 68 | if (!U_SUCCESS(status)) |
| 69 | throw Exception("Cannot create UConverter with charset " + charset + ", error: " + String(u_errorName(status)), |
| 70 | ErrorCodes::CANNOT_CREATE_CHARSET_CONVERTER); |
| 71 | } |
| 72 | |
| 73 | ~Converter() |
| 74 | { |
| 75 | ucnv_close(impl); |
| 76 | } |
| 77 | }; |
| 78 | |
| 79 | /// Separate converter is created for each thread. |
| 80 | using Pool = ObjectPoolMap<Converter, String>; |
| 81 | |
| 82 | Pool::Pointer getConverter(const String & charset) |
| 83 | { |
| 84 | static Pool pool; |
| 85 | return pool.get(charset, [&charset] { return new Converter(charset); }); |
| 86 | } |
| 87 | |
| 88 | void convert(const String & from_charset, const String & to_charset, |
| 89 | const ColumnString::Chars & from_chars, const ColumnString::Offsets & from_offsets, |
| 90 | ColumnString::Chars & to_chars, ColumnString::Offsets & to_offsets) |
| 91 | { |
| 92 | auto converter_from = getConverter(from_charset); |
| 93 | auto converter_to = getConverter(to_charset); |
| 94 | |
| 95 | ColumnString::Offset current_from_offset = 0; |
| 96 | ColumnString::Offset current_to_offset = 0; |
| 97 | |
| 98 | size_t size = from_offsets.size(); |
| 99 | to_offsets.resize(size); |
| 100 | |
| 101 | PODArray<UChar> uchars; |
| 102 | |
| 103 | for (size_t i = 0; i < size; ++i) |
| 104 | { |
| 105 | size_t from_string_size = from_offsets[i] - current_from_offset - 1; |
| 106 | |
| 107 | /// We assume that empty string is empty in every charset. |
| 108 | if (0 != from_string_size) |
| 109 | { |
| 110 | /// reset state of converter |
| 111 | ucnv_reset(converter_from->impl); |
| 112 | ucnv_reset(converter_to->impl); |
| 113 | |
| 114 | /// maximum number of code points is number of bytes in input string plus one for terminating zero |
| 115 | uchars.resize(from_string_size + 1); |
| 116 | |
| 117 | UErrorCode status = U_ZERO_ERROR; |
| 118 | int32_t res = ucnv_toUChars( |
| 119 | converter_from->impl, |
| 120 | uchars.data(), uchars.size(), |
| 121 | reinterpret_cast<const char *>(&from_chars[current_from_offset]), from_string_size, |
| 122 | &status); |
| 123 | |
| 124 | if (!U_SUCCESS(status)) |
| 125 | throw Exception("Cannot convert from charset " + from_charset + ", error: " + String(u_errorName(status)), |
| 126 | ErrorCodes::CANNOT_CONVERT_CHARSET); |
| 127 | |
| 128 | auto max_to_char_size = ucnv_getMaxCharSize(converter_to->impl); |
| 129 | auto max_to_size = UCNV_GET_MAX_BYTES_FOR_STRING(res, max_to_char_size); |
| 130 | |
| 131 | to_chars.resize(current_to_offset + max_to_size); |
| 132 | |
| 133 | res = ucnv_fromUChars( |
| 134 | converter_to->impl, |
| 135 | reinterpret_cast<char *>(&to_chars[current_to_offset]), max_to_size, |
| 136 | uchars.data(), res, |
| 137 | &status); |
| 138 | |
| 139 | if (!U_SUCCESS(status)) |
| 140 | throw Exception("Cannot convert to charset " + to_charset + ", error: " + String(u_errorName(status)), |
| 141 | ErrorCodes::CANNOT_CONVERT_CHARSET); |
| 142 | |
| 143 | current_to_offset += res; |
| 144 | } |
| 145 | |
| 146 | if (to_chars.size() < current_to_offset + 1) |
| 147 | to_chars.resize(current_to_offset + 1); |
| 148 | |
| 149 | to_chars[current_to_offset] = 0; |
| 150 | |
| 151 | ++current_to_offset; |
| 152 | to_offsets[i] = current_to_offset; |
| 153 | |
| 154 | current_from_offset = from_offsets[i]; |
| 155 | } |
| 156 | |
| 157 | to_chars.resize(current_to_offset); |
| 158 | } |
| 159 | |
| 160 | public: |
| 161 | static constexpr auto name = "convertCharset" ; |
| 162 | static FunctionPtr create(const Context &) { return std::make_shared<FunctionConvertCharset>(); } |
| 163 | |
| 164 | String getName() const override |
| 165 | { |
| 166 | return name; |
| 167 | } |
| 168 | |
| 169 | size_t getNumberOfArguments() const override { return 3; } |
| 170 | |
| 171 | DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override |
| 172 | { |
| 173 | for (size_t i : ext::range(0, 3)) |
| 174 | if (!isString(arguments[i])) |
| 175 | throw Exception("Illegal type " + arguments[i]->getName() + " of argument of function " + getName() |
| 176 | + ", must be String" , ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
| 177 | |
| 178 | return std::make_shared<DataTypeString>(); |
| 179 | } |
| 180 | |
| 181 | bool useDefaultImplementationForConstants() const override { return true; } |
| 182 | ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; } |
| 183 | |
| 184 | void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override |
| 185 | { |
| 186 | const ColumnWithTypeAndName & arg_from = block.getByPosition(arguments[0]); |
| 187 | const ColumnWithTypeAndName & arg_charset_from = block.getByPosition(arguments[1]); |
| 188 | const ColumnWithTypeAndName & arg_charset_to = block.getByPosition(arguments[2]); |
| 189 | |
| 190 | const ColumnConst * col_charset_from = checkAndGetColumnConstStringOrFixedString(arg_charset_from.column.get()); |
| 191 | const ColumnConst * col_charset_to = checkAndGetColumnConstStringOrFixedString(arg_charset_to.column.get()); |
| 192 | |
| 193 | if (!col_charset_from || !col_charset_to) |
| 194 | throw Exception("2nd and 3rd arguments of function " + getName() + " (source charset and destination charset) must be constant strings." , |
| 195 | ErrorCodes::ILLEGAL_COLUMN); |
| 196 | |
| 197 | String charset_from = col_charset_from->getValue<String>(); |
| 198 | String charset_to = col_charset_to->getValue<String>(); |
| 199 | |
| 200 | if (const ColumnString * col_from = checkAndGetColumn<ColumnString>(arg_from.column.get())) |
| 201 | { |
| 202 | auto col_to = ColumnString::create(); |
| 203 | convert(charset_from, charset_to, col_from->getChars(), col_from->getOffsets(), col_to->getChars(), col_to->getOffsets()); |
| 204 | block.getByPosition(result).column = std::move(col_to); |
| 205 | } |
| 206 | else |
| 207 | throw Exception("Illegal column passed as first argument of function " + getName() + " (must be ColumnString)." , |
| 208 | ErrorCodes::ILLEGAL_COLUMN); |
| 209 | } |
| 210 | }; |
| 211 | |
| 212 | |
| 213 | void registerFunctionConvertCharset(FunctionFactory & factory) |
| 214 | { |
| 215 | factory.registerFunction<FunctionConvertCharset>(); |
| 216 | } |
| 217 | |
| 218 | } |
| 219 | |
| 220 | #endif |
| 221 | |