1 | #include <Poco/UTF8Encoding.h> |
2 | #include <IO/WriteBufferValidUTF8.h> |
3 | #include <Core/Types.h> |
4 | |
5 | #ifdef __SSE2__ |
6 | #include <emmintrin.h> |
7 | #endif |
8 | |
9 | |
10 | namespace DB |
11 | { |
12 | |
/// Out-of-class definition of the static constant WriteBufferValidUTF8::DEFAULT_SIZE (4096).
/// NOTE(review): presumably the default for the constructor's `size` argument, in bytes — confirm against the header.
const size_t WriteBufferValidUTF8::DEFAULT_SIZE = 4096;
14 | |
15 | /** Index into the table below with the first byte of a UTF-8 sequence to |
16 | * get the number of trailing bytes that are supposed to follow it. |
17 | * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is |
18 | * left as-is for anyone who may want to do such conversion, which was |
19 | * allowed in earlier algorithms. |
20 | */ |
21 | extern const UInt8 length_of_utf8_sequence[256] = |
22 | { |
23 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
24 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
25 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
26 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
27 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
28 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
29 | 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
30 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6 |
31 | }; |
32 | |
33 | |
34 | WriteBufferValidUTF8::WriteBufferValidUTF8( |
35 | WriteBuffer & output_buffer_, bool group_replacements_, const char * replacement_, size_t size) |
36 | : BufferWithOwnMemory<WriteBuffer>(std::max(static_cast<size_t>(32), size)), output_buffer(output_buffer_), |
37 | group_replacements(group_replacements_), replacement(replacement_) |
38 | { |
39 | } |
40 | |
41 | |
42 | inline void WriteBufferValidUTF8::putReplacement() |
43 | { |
44 | if (replacement.empty() || (group_replacements && just_put_replacement)) |
45 | return; |
46 | |
47 | just_put_replacement = true; |
48 | output_buffer.write(replacement.data(), replacement.size()); |
49 | } |
50 | |
51 | |
52 | inline void WriteBufferValidUTF8::putValid(char *data, size_t len) |
53 | { |
54 | if (len == 0) |
55 | return; |
56 | |
57 | just_put_replacement = false; |
58 | output_buffer.write(data, len); |
59 | } |
60 | |
61 | |
62 | void WriteBufferValidUTF8::nextImpl() |
63 | { |
64 | char * p = memory.data(); |
65 | char * valid_start = p; |
66 | |
67 | while (p < pos) |
68 | { |
69 | #ifdef __SSE2__ |
70 | /// Fast skip of ASCII |
71 | static constexpr size_t SIMD_BYTES = 16; |
72 | const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES; |
73 | |
74 | while (p < simd_end && !_mm_movemask_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(p)))) |
75 | p += SIMD_BYTES; |
76 | |
77 | if (!(p < pos)) |
78 | break; |
79 | #endif |
80 | |
81 | size_t len = length_of_utf8_sequence[static_cast<unsigned char>(*p)]; |
82 | |
83 | if (len > 4) |
84 | { |
85 | /// Invalid start of sequence. Skip one byte. |
86 | putValid(valid_start, p - valid_start); |
87 | putReplacement(); |
88 | ++p; |
89 | valid_start = p; |
90 | } |
91 | else if (p + len > pos) |
92 | { |
93 | /// Sequence was not fully written to this buffer. |
94 | break; |
95 | } |
96 | else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<unsigned char *>(p), len)) |
97 | { |
98 | /// Valid sequence. |
99 | p += len; |
100 | } |
101 | else |
102 | { |
103 | /// Invalid sequence. Skip just first byte. |
104 | putValid(valid_start, p - valid_start); |
105 | putReplacement(); |
106 | ++p; |
107 | valid_start = p; |
108 | } |
109 | } |
110 | |
111 | putValid(valid_start, p - valid_start); |
112 | |
113 | size_t cnt = pos - p; |
114 | |
115 | /// Shift unfinished sequence to start of buffer. |
116 | for (size_t i = 0; i < cnt; ++i) |
117 | memory[i] = p[i]; |
118 | |
119 | working_buffer = Buffer(&memory[cnt], memory.data() + memory.size()); |
120 | |
121 | /// Propagate next() to the output buffer |
122 | output_buffer.next(); |
123 | } |
124 | |
125 | |
126 | void WriteBufferValidUTF8::finish() |
127 | { |
128 | /// Write all complete sequences from buffer. |
129 | nextImpl(); |
130 | |
131 | /// If unfinished sequence at end, then write replacement. |
132 | if (working_buffer.begin() != memory.data()) |
133 | putReplacement(); |
134 | } |
135 | |
136 | |
/// Calls finish() on destruction.
/// Any exception thrown by it is caught here and passed to tryLogCurrentException,
/// so nothing propagates out of the destructor.
WriteBufferValidUTF8::~WriteBufferValidUTF8()
{
    try
    {
        finish();
    }
    catch (...)
    {
        tryLogCurrentException(__PRETTY_FUNCTION__);
    }
}
148 | |
149 | } |
150 | |