| 1 | #include "config_core.h" | 
| 2 | #if USE_ICU | 
| 3 |  | 
| 4 | #include <Functions/IFunction.h> | 
| 5 | #include <Functions/FunctionFactory.h> | 
| 6 | #include <Functions/FunctionHelpers.h> | 
| 7 | #include <IO/WriteHelpers.h> | 
| 8 | #include <DataTypes/DataTypeString.h> | 
| 9 | #include <Columns/ColumnString.h> | 
| 10 | #include <Columns/ColumnConst.h> | 
| 11 | #include <Common/typeid_cast.h> | 
| 12 | #include <Common/ObjectPool.h> | 
| 13 | #include <ext/range.h> | 
| 14 |  | 
| 15 | #include <unicode/ucnv.h> | 
| 16 | #include <string> | 
| 17 | #include <memory> | 
| 18 |  | 
| 19 |  | 
| 20 | namespace DB | 
| 21 | { | 
| 22 |  | 
/// Error codes referenced in this translation unit; they are only declared here
/// (`extern`), their definitions live elsewhere.
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
    extern const int LOGICAL_ERROR;
    /// Used by getReturnTypeImpl; declared explicitly so this file does not depend on
    /// another header happening to declare it.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int CANNOT_CREATE_CHARSET_CONVERTER;
    extern const int CANNOT_CONVERT_CHARSET;
    extern const int ILLEGAL_COLUMN;
}
| 31 |  | 
| 32 |  | 
| 33 | /** convertCharset(s, from, to) | 
| 34 |   * | 
| 35 |   * Assuming string 's' contains bytes in charset 'from', | 
| 36 |   *  returns another string with bytes, representing same content in charset 'to'. | 
| 37 |   * from and to must be constants. | 
| 38 |   * | 
| 39 |   * When bytes are illegal in 'from' charset or are not representable in 'to' charset, | 
| 40 |   *  behavior is implementation specific. | 
| 41 |   */ | 
| 42 | class FunctionConvertCharset : public IFunction | 
| 43 | { | 
| 44 | private: | 
| 45 |     struct Converter : private boost::noncopyable | 
| 46 |     { | 
| 47 |         UConverter * impl; | 
| 48 |  | 
| 49 |         explicit Converter(const String & charset) | 
| 50 |         { | 
| 51 |             UErrorCode status = U_ZERO_ERROR; | 
| 52 |             impl = ucnv_open(charset.data(), &status); | 
| 53 |  | 
| 54 |             if (U_SUCCESS(status)) | 
| 55 |                 ucnv_setToUCallBack(impl, | 
| 56 |                     UCNV_TO_U_CALLBACK_SUBSTITUTE, | 
| 57 |                     nullptr, | 
| 58 |                     nullptr, nullptr, | 
| 59 |                     &status); | 
| 60 |  | 
| 61 |             if (U_SUCCESS(status)) | 
| 62 |                 ucnv_setFromUCallBack(impl, | 
| 63 |                     UCNV_FROM_U_CALLBACK_SUBSTITUTE, | 
| 64 |                     nullptr, | 
| 65 |                     nullptr, nullptr, | 
| 66 |                     &status); | 
| 67 |  | 
| 68 |             if (!U_SUCCESS(status)) | 
| 69 |                 throw Exception("Cannot create UConverter with charset "  + charset + ", error: "  + String(u_errorName(status)), | 
| 70 |                     ErrorCodes::CANNOT_CREATE_CHARSET_CONVERTER); | 
| 71 |         } | 
| 72 |  | 
| 73 |         ~Converter() | 
| 74 |         { | 
| 75 |             ucnv_close(impl); | 
| 76 |         } | 
| 77 |     }; | 
| 78 |  | 
| 79 |     /// Separate converter is created for each thread. | 
| 80 |     using Pool = ObjectPoolMap<Converter, String>; | 
| 81 |  | 
| 82 |     Pool::Pointer getConverter(const String & charset) | 
| 83 |     { | 
| 84 |         static Pool pool; | 
| 85 |         return pool.get(charset, [&charset] { return new Converter(charset); }); | 
| 86 |     } | 
| 87 |  | 
| 88 |     void convert(const String & from_charset, const String & to_charset, | 
| 89 |         const ColumnString::Chars & from_chars, const ColumnString::Offsets & from_offsets, | 
| 90 |         ColumnString::Chars & to_chars, ColumnString::Offsets & to_offsets) | 
| 91 |     { | 
| 92 |         auto converter_from = getConverter(from_charset); | 
| 93 |         auto converter_to = getConverter(to_charset); | 
| 94 |  | 
| 95 |         ColumnString::Offset current_from_offset = 0; | 
| 96 |         ColumnString::Offset current_to_offset = 0; | 
| 97 |  | 
| 98 |         size_t size = from_offsets.size(); | 
| 99 |         to_offsets.resize(size); | 
| 100 |  | 
| 101 |         PODArray<UChar> uchars; | 
| 102 |  | 
| 103 |         for (size_t i = 0; i < size; ++i) | 
| 104 |         { | 
| 105 |             size_t from_string_size = from_offsets[i] - current_from_offset - 1; | 
| 106 |  | 
| 107 |             /// We assume that empty string is empty in every charset. | 
| 108 |             if (0 != from_string_size) | 
| 109 |             { | 
| 110 |                 /// reset state of converter | 
| 111 |                 ucnv_reset(converter_from->impl); | 
| 112 |                 ucnv_reset(converter_to->impl); | 
| 113 |  | 
| 114 |                 /// maximum number of code points is number of bytes in input string plus one for terminating zero | 
| 115 |                 uchars.resize(from_string_size + 1); | 
| 116 |  | 
| 117 |                 UErrorCode status = U_ZERO_ERROR; | 
| 118 |                 int32_t res = ucnv_toUChars( | 
| 119 |                     converter_from->impl, | 
| 120 |                     uchars.data(), uchars.size(), | 
| 121 |                     reinterpret_cast<const char *>(&from_chars[current_from_offset]), from_string_size, | 
| 122 |                     &status); | 
| 123 |  | 
| 124 |                 if (!U_SUCCESS(status)) | 
| 125 |                     throw Exception("Cannot convert from charset "  + from_charset + ", error: "  + String(u_errorName(status)), | 
| 126 |                         ErrorCodes::CANNOT_CONVERT_CHARSET); | 
| 127 |  | 
| 128 |                 auto max_to_char_size = ucnv_getMaxCharSize(converter_to->impl); | 
| 129 |                 auto max_to_size = UCNV_GET_MAX_BYTES_FOR_STRING(res, max_to_char_size); | 
| 130 |  | 
| 131 |                 to_chars.resize(current_to_offset + max_to_size); | 
| 132 |  | 
| 133 |                 res = ucnv_fromUChars( | 
| 134 |                     converter_to->impl, | 
| 135 |                     reinterpret_cast<char *>(&to_chars[current_to_offset]), max_to_size, | 
| 136 |                     uchars.data(), res, | 
| 137 |                     &status); | 
| 138 |  | 
| 139 |                 if (!U_SUCCESS(status)) | 
| 140 |                     throw Exception("Cannot convert to charset "  + to_charset + ", error: "  + String(u_errorName(status)), | 
| 141 |                         ErrorCodes::CANNOT_CONVERT_CHARSET); | 
| 142 |  | 
| 143 |                 current_to_offset += res; | 
| 144 |             } | 
| 145 |  | 
| 146 |             if (to_chars.size() < current_to_offset + 1) | 
| 147 |                 to_chars.resize(current_to_offset + 1); | 
| 148 |  | 
| 149 |             to_chars[current_to_offset] = 0; | 
| 150 |  | 
| 151 |             ++current_to_offset; | 
| 152 |             to_offsets[i] = current_to_offset; | 
| 153 |  | 
| 154 |             current_from_offset = from_offsets[i]; | 
| 155 |         } | 
| 156 |  | 
| 157 |         to_chars.resize(current_to_offset); | 
| 158 |     } | 
| 159 |  | 
| 160 | public: | 
| 161 |     static constexpr auto name = "convertCharset" ; | 
| 162 |     static FunctionPtr create(const Context &) { return std::make_shared<FunctionConvertCharset>(); } | 
| 163 |  | 
| 164 |     String getName() const override | 
| 165 |     { | 
| 166 |         return name; | 
| 167 |     } | 
| 168 |  | 
| 169 |     size_t getNumberOfArguments() const override { return 3; } | 
| 170 |  | 
| 171 |     DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override | 
| 172 |     { | 
| 173 |         for (size_t i : ext::range(0, 3)) | 
| 174 |             if (!isString(arguments[i])) | 
| 175 |                 throw Exception("Illegal type "  + arguments[i]->getName() + " of argument of function "  + getName() | 
| 176 |                     + ", must be String" , ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); | 
| 177 |  | 
| 178 |         return std::make_shared<DataTypeString>(); | 
| 179 |     } | 
| 180 |  | 
| 181 |     bool useDefaultImplementationForConstants() const override { return true; } | 
| 182 |     ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; } | 
| 183 |  | 
| 184 |     void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override | 
| 185 |     { | 
| 186 |         const ColumnWithTypeAndName & arg_from = block.getByPosition(arguments[0]); | 
| 187 |         const ColumnWithTypeAndName & arg_charset_from = block.getByPosition(arguments[1]); | 
| 188 |         const ColumnWithTypeAndName & arg_charset_to = block.getByPosition(arguments[2]); | 
| 189 |  | 
| 190 |         const ColumnConst * col_charset_from = checkAndGetColumnConstStringOrFixedString(arg_charset_from.column.get()); | 
| 191 |         const ColumnConst * col_charset_to = checkAndGetColumnConstStringOrFixedString(arg_charset_to.column.get()); | 
| 192 |  | 
| 193 |         if (!col_charset_from || !col_charset_to) | 
| 194 |             throw Exception("2nd and 3rd arguments of function "  + getName() + " (source charset and destination charset) must be constant strings." , | 
| 195 |                 ErrorCodes::ILLEGAL_COLUMN); | 
| 196 |  | 
| 197 |         String charset_from = col_charset_from->getValue<String>(); | 
| 198 |         String charset_to = col_charset_to->getValue<String>(); | 
| 199 |  | 
| 200 |         if (const ColumnString * col_from = checkAndGetColumn<ColumnString>(arg_from.column.get())) | 
| 201 |         { | 
| 202 |             auto col_to = ColumnString::create(); | 
| 203 |             convert(charset_from, charset_to, col_from->getChars(), col_from->getOffsets(), col_to->getChars(), col_to->getOffsets()); | 
| 204 |             block.getByPosition(result).column = std::move(col_to); | 
| 205 |         } | 
| 206 |         else | 
| 207 |             throw Exception("Illegal column passed as first argument of function "  + getName() + " (must be ColumnString)." , | 
| 208 |                 ErrorCodes::ILLEGAL_COLUMN); | 
| 209 |     } | 
| 210 | }; | 
| 211 |  | 
| 212 |  | 
| 213 | void registerFunctionConvertCharset(FunctionFactory & factory) | 
| 214 | { | 
| 215 |     factory.registerFunction<FunctionConvertCharset>(); | 
| 216 | } | 
| 217 |  | 
| 218 | } | 
| 219 |  | 
| 220 | #endif | 
| 221 |  |