| 1 | #include <Poco/UTF8Encoding.h> |
| 2 | #include <IO/WriteBufferValidUTF8.h> |
| 3 | #include <Core/Types.h> |
| 4 | |
| 5 | #ifdef __SSE2__ |
| 6 | #include <emmintrin.h> |
| 7 | #endif |
| 8 | |
| 9 | |
| 10 | namespace DB |
| 11 | { |
| 12 | |
/// Out-of-class definition of the static class constant (its declaration lives in the class header, not shown here).
/// NOTE(review): presumably the default value of the constructor's `size` argument, in bytes - confirm in the header.
const size_t WriteBufferValidUTF8::DEFAULT_SIZE = 4096;
| 14 | |
/** Index into the table below with the first byte of a UTF-8 sequence to
  * get the total length of that sequence in bytes (the first byte included).
  * Bytes 0x00..0xBF all map to 1, so the table alone does not distinguish
  * ASCII from a stray continuation byte (0x80..0xBF); validity has to be
  * checked separately by the user of the table.
  * Note that *legal* UTF-8 sequences can't be 5 or 6 bytes long. Those entries
  * are left as-is for anyone who may want to do such conversion, which was
  * allowed in earlier algorithms.
  */
extern const UInt8 length_of_utf8_sequence[256] =
{
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /// 0x00..0x1F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /// 0x20..0x3F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /// 0x40..0x5F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /// 0x60..0x7F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /// 0x80..0x9F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /// 0xA0..0xBF
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,   /// 0xC0..0xDF
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6    /// 0xE0..0xEF: 3, 0xF0..0xF7: 4, 0xF8..0xFB: 5, 0xFC..0xFF: 6
};
| 32 | |
| 33 | |
| 34 | WriteBufferValidUTF8::WriteBufferValidUTF8( |
| 35 | WriteBuffer & output_buffer_, bool group_replacements_, const char * replacement_, size_t size) |
| 36 | : BufferWithOwnMemory<WriteBuffer>(std::max(static_cast<size_t>(32), size)), output_buffer(output_buffer_), |
| 37 | group_replacements(group_replacements_), replacement(replacement_) |
| 38 | { |
| 39 | } |
| 40 | |
| 41 | |
| 42 | inline void WriteBufferValidUTF8::putReplacement() |
| 43 | { |
| 44 | if (replacement.empty() || (group_replacements && just_put_replacement)) |
| 45 | return; |
| 46 | |
| 47 | just_put_replacement = true; |
| 48 | output_buffer.write(replacement.data(), replacement.size()); |
| 49 | } |
| 50 | |
| 51 | |
| 52 | inline void WriteBufferValidUTF8::putValid(char *data, size_t len) |
| 53 | { |
| 54 | if (len == 0) |
| 55 | return; |
| 56 | |
| 57 | just_put_replacement = false; |
| 58 | output_buffer.write(data, len); |
| 59 | } |
| 60 | |
| 61 | |
| 62 | void WriteBufferValidUTF8::nextImpl() |
| 63 | { |
| 64 | char * p = memory.data(); |
| 65 | char * valid_start = p; |
| 66 | |
| 67 | while (p < pos) |
| 68 | { |
| 69 | #ifdef __SSE2__ |
| 70 | /// Fast skip of ASCII |
| 71 | static constexpr size_t SIMD_BYTES = 16; |
| 72 | const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES; |
| 73 | |
| 74 | while (p < simd_end && !_mm_movemask_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(p)))) |
| 75 | p += SIMD_BYTES; |
| 76 | |
| 77 | if (!(p < pos)) |
| 78 | break; |
| 79 | #endif |
| 80 | |
| 81 | size_t len = length_of_utf8_sequence[static_cast<unsigned char>(*p)]; |
| 82 | |
| 83 | if (len > 4) |
| 84 | { |
| 85 | /// Invalid start of sequence. Skip one byte. |
| 86 | putValid(valid_start, p - valid_start); |
| 87 | putReplacement(); |
| 88 | ++p; |
| 89 | valid_start = p; |
| 90 | } |
| 91 | else if (p + len > pos) |
| 92 | { |
| 93 | /// Sequence was not fully written to this buffer. |
| 94 | break; |
| 95 | } |
| 96 | else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<unsigned char *>(p), len)) |
| 97 | { |
| 98 | /// Valid sequence. |
| 99 | p += len; |
| 100 | } |
| 101 | else |
| 102 | { |
| 103 | /// Invalid sequence. Skip just first byte. |
| 104 | putValid(valid_start, p - valid_start); |
| 105 | putReplacement(); |
| 106 | ++p; |
| 107 | valid_start = p; |
| 108 | } |
| 109 | } |
| 110 | |
| 111 | putValid(valid_start, p - valid_start); |
| 112 | |
| 113 | size_t cnt = pos - p; |
| 114 | |
| 115 | /// Shift unfinished sequence to start of buffer. |
| 116 | for (size_t i = 0; i < cnt; ++i) |
| 117 | memory[i] = p[i]; |
| 118 | |
| 119 | working_buffer = Buffer(&memory[cnt], memory.data() + memory.size()); |
| 120 | |
| 121 | /// Propagate next() to the output buffer |
| 122 | output_buffer.next(); |
| 123 | } |
| 124 | |
| 125 | |
| 126 | void WriteBufferValidUTF8::finish() |
| 127 | { |
| 128 | /// Write all complete sequences from buffer. |
| 129 | nextImpl(); |
| 130 | |
| 131 | /// If unfinished sequence at end, then write replacement. |
| 132 | if (working_buffer.begin() != memory.data()) |
| 133 | putReplacement(); |
| 134 | } |
| 135 | |
| 136 | |
/// Writes out whatever is still buffered by calling finish().
/// An exception thrown by finish() is caught here and handed to tryLogCurrentException()
/// instead of being allowed to leave the destructor.
WriteBufferValidUTF8::~WriteBufferValidUTF8()
{
    try
    {
        finish();
    }
    catch (...)
    {
        /// __PRETTY_FUNCTION__ identifies this destructor in the log entry.
        tryLogCurrentException(__PRETTY_FUNCTION__);
    }
}
| 148 | |
| 149 | } |
| 150 | |