| 1 | #include <DataTypes/DataTypeString.h> |
| 2 | #include <Functions/FunctionFactory.h> |
| 3 | #include <Functions/FunctionStringToString.h> |
| 4 | #include <IO/WriteBufferFromVector.h> |
| 5 | #include <IO/WriteBufferValidUTF8.h> |
| 6 | #include <IO/WriteHelpers.h> |
| 7 | #include <Poco/UTF8Encoding.h> |
| 8 | |
| 9 | #include <string_view> |
| 10 | |
| 11 | #ifdef __SSE2__ |
| 12 | # include <emmintrin.h> |
| 13 | #endif |
| 14 | |
| 15 | namespace DB |
| 16 | { |
| 17 | namespace ErrorCodes |
| 18 | { |
| 19 | extern const int ILLEGAL_COLUMN; |
| 20 | } |
| 21 | |
| 22 | extern const UInt8 length_of_utf8_sequence[256]; |
| 23 | |
| 24 | struct ToValidUTF8Impl |
| 25 | { |
| 26 | static void toValidUTF8One(const char * begin, const char * end, WriteBuffer & write_buffer) |
| 27 | { |
| 28 | static constexpr std::string_view replacement = "\xEF\xBF\xBD" ; |
| 29 | |
| 30 | const char * p = begin; |
| 31 | const char * valid_start = begin; |
| 32 | |
| 33 | /// The last recorded character was `replacement`. |
| 34 | bool just_put_replacement = false; |
| 35 | |
| 36 | auto put_valid = [&write_buffer, &just_put_replacement](const char * data, size_t len) |
| 37 | { |
| 38 | if (len == 0) |
| 39 | return; |
| 40 | just_put_replacement = false; |
| 41 | write_buffer.write(data, len); |
| 42 | }; |
| 43 | |
| 44 | auto put_replacement = [&write_buffer, &just_put_replacement]() |
| 45 | { |
| 46 | if (just_put_replacement) |
| 47 | return; |
| 48 | just_put_replacement = true; |
| 49 | write_buffer.write(replacement.data(), replacement.size()); |
| 50 | }; |
| 51 | |
| 52 | while (p < end) |
| 53 | { |
| 54 | #ifdef __SSE2__ |
| 55 | /// Fast skip of ASCII |
| 56 | static constexpr size_t SIMD_BYTES = 16; |
| 57 | const char * simd_end = p + (end - p) / SIMD_BYTES * SIMD_BYTES; |
| 58 | |
| 59 | while (p < simd_end && !_mm_movemask_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(p)))) |
| 60 | p += SIMD_BYTES; |
| 61 | |
| 62 | if (!(p < end)) |
| 63 | break; |
| 64 | #endif |
| 65 | |
| 66 | size_t len = length_of_utf8_sequence[static_cast<unsigned char>(*p)]; |
| 67 | |
| 68 | if (len > 4) |
| 69 | { |
| 70 | /// Invalid start of sequence. Skip one byte. |
| 71 | put_valid(valid_start, p - valid_start); |
| 72 | put_replacement(); |
| 73 | ++p; |
| 74 | valid_start = p; |
| 75 | } |
| 76 | else if (p + len > end) |
| 77 | { |
| 78 | /// Sequence was not fully written to this buffer. |
| 79 | break; |
| 80 | } |
| 81 | else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<const unsigned char *>(p), len)) |
| 82 | { |
| 83 | /// Valid sequence. |
| 84 | p += len; |
| 85 | } |
| 86 | else |
| 87 | { |
| 88 | /// Invalid sequence. Skip just first byte. |
| 89 | put_valid(valid_start, p - valid_start); |
| 90 | put_replacement(); |
| 91 | ++p; |
| 92 | valid_start = p; |
| 93 | } |
| 94 | } |
| 95 | |
| 96 | put_valid(valid_start, p - valid_start); |
| 97 | |
| 98 | if (p != end) |
| 99 | put_replacement(); |
| 100 | } |
| 101 | |
| 102 | static void vector( |
| 103 | const ColumnString::Chars & data, |
| 104 | const ColumnString::Offsets & offsets, |
| 105 | ColumnString::Chars & res_data, |
| 106 | ColumnString::Offsets & res_offsets) |
| 107 | { |
| 108 | const size_t offsets_size = offsets.size(); |
| 109 | /// It can be larger than that, but we believe it is unlikely to happen. |
| 110 | res_data.resize(data.size()); |
| 111 | res_offsets.resize(offsets_size); |
| 112 | |
| 113 | size_t prev_offset = 0; |
| 114 | WriteBufferFromVector<ColumnString::Chars> write_buffer(res_data); |
| 115 | for (size_t i = 0; i < offsets_size; ++i) |
| 116 | { |
| 117 | const char * haystack_data = reinterpret_cast<const char *>(&data[prev_offset]); |
| 118 | const size_t haystack_size = offsets[i] - prev_offset - 1; |
| 119 | toValidUTF8One(haystack_data, haystack_data + haystack_size, write_buffer); |
| 120 | writeChar(0, write_buffer); |
| 121 | res_offsets[i] = write_buffer.count(); |
| 122 | prev_offset = offsets[i]; |
| 123 | } |
| 124 | write_buffer.finish(); |
| 125 | } |
| 126 | |
| 127 | [[noreturn]] static void vector_fixed(const ColumnString::Chars &, size_t, ColumnString::Chars &) |
| 128 | { |
| 129 | throw Exception("Column of type FixedString is not supported by toValidUTF8 function" , ErrorCodes::ILLEGAL_COLUMN); |
| 130 | } |
| 131 | }; |
| 132 | |
/// Name tag type: exposes the function's name as a compile-time string constant.
struct NameToValidUTF8
{
    static constexpr const char * name = "toValidUTF8";
};
| 137 | using FunctionToValidUTF8 = FunctionStringToString<ToValidUTF8Impl, NameToValidUTF8>; |
| 138 | |
| 139 | void registerFunctionToValidUTF8(FunctionFactory & factory) |
| 140 | { |
| 141 | factory.registerFunction<FunctionToValidUTF8>(); |
| 142 | } |
| 143 | |
| 144 | } |
| 145 | |