1#include <Poco/UTF8Encoding.h>
2#include <IO/WriteBufferValidUTF8.h>
3#include <Core/Types.h>
4
5#ifdef __SSE2__
6 #include <emmintrin.h>
7#endif
8
9
10namespace DB
11{
12
/// Out-of-class definition of the static constant WriteBufferValidUTF8::DEFAULT_SIZE.
/// NOTE(review): presumably the default for the constructor's `size` argument (internal
/// buffer size in bytes) — confirm against the class declaration, which is not in this file.
const size_t WriteBufferValidUTF8::DEFAULT_SIZE = 4096;
14
/** Index into the table below with the first byte of a UTF-8 sequence to
 * get the total length of that sequence in bytes, the first byte included
 * (this is NOT the number of trailing bytes: ASCII bytes map to 1, not 0).
 * Bytes 0x80..0xBF are listed with length 1 as well, so looking one of them
 * up as a first byte yields a one-byte candidate sequence.
 * Note that *legal* UTF-8 sequences are at most 4 bytes long. The entries
 * 5 and 6 (first bytes 0xF8..0xFF) are left as-is for anyone who may want
 * to handle the longer forms that were allowed in earlier algorithms;
 * nextImpl() in this file treats any length above 4 as an invalid start byte.
 */
extern const UInt8 length_of_utf8_sequence[256] =
{
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6
};
32
33
34WriteBufferValidUTF8::WriteBufferValidUTF8(
35 WriteBuffer & output_buffer_, bool group_replacements_, const char * replacement_, size_t size)
36 : BufferWithOwnMemory<WriteBuffer>(std::max(static_cast<size_t>(32), size)), output_buffer(output_buffer_),
37 group_replacements(group_replacements_), replacement(replacement_)
38{
39}
40
41
42inline void WriteBufferValidUTF8::putReplacement()
43{
44 if (replacement.empty() || (group_replacements && just_put_replacement))
45 return;
46
47 just_put_replacement = true;
48 output_buffer.write(replacement.data(), replacement.size());
49}
50
51
52inline void WriteBufferValidUTF8::putValid(char *data, size_t len)
53{
54 if (len == 0)
55 return;
56
57 just_put_replacement = false;
58 output_buffer.write(data, len);
59}
60
61
62void WriteBufferValidUTF8::nextImpl()
63{
64 char * p = memory.data();
65 char * valid_start = p;
66
67 while (p < pos)
68 {
69#ifdef __SSE2__
70 /// Fast skip of ASCII
71 static constexpr size_t SIMD_BYTES = 16;
72 const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES;
73
74 while (p < simd_end && !_mm_movemask_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))))
75 p += SIMD_BYTES;
76
77 if (!(p < pos))
78 break;
79#endif
80
81 size_t len = length_of_utf8_sequence[static_cast<unsigned char>(*p)];
82
83 if (len > 4)
84 {
85 /// Invalid start of sequence. Skip one byte.
86 putValid(valid_start, p - valid_start);
87 putReplacement();
88 ++p;
89 valid_start = p;
90 }
91 else if (p + len > pos)
92 {
93 /// Sequence was not fully written to this buffer.
94 break;
95 }
96 else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<unsigned char *>(p), len))
97 {
98 /// Valid sequence.
99 p += len;
100 }
101 else
102 {
103 /// Invalid sequence. Skip just first byte.
104 putValid(valid_start, p - valid_start);
105 putReplacement();
106 ++p;
107 valid_start = p;
108 }
109 }
110
111 putValid(valid_start, p - valid_start);
112
113 size_t cnt = pos - p;
114
115 /// Shift unfinished sequence to start of buffer.
116 for (size_t i = 0; i < cnt; ++i)
117 memory[i] = p[i];
118
119 working_buffer = Buffer(&memory[cnt], memory.data() + memory.size());
120
121 /// Propagate next() to the output buffer
122 output_buffer.next();
123}
124
125
126void WriteBufferValidUTF8::finish()
127{
128 /// Write all complete sequences from buffer.
129 nextImpl();
130
131 /// If unfinished sequence at end, then write replacement.
132 if (working_buffer.begin() != memory.data())
133 putReplacement();
134}
135
136
/// Flushes the remaining buffered data by calling finish().
/// Any exception raised while doing so is caught here so that it cannot leave the destructor.
WriteBufferValidUTF8::~WriteBufferValidUTF8()
{
    try
    {
        finish();
    }
    catch (...)
    {
        /// Hand the in-flight exception to tryLogCurrentException together with this
        /// function's name (presumably it logs the exception; the helper is defined elsewhere).
        tryLogCurrentException(__PRETTY_FUNCTION__);
    }
}
148
149}
150