WriteBufferValidUTF8.cpp source code [ClickHouse/dbms/src/IO/WriteBufferValidUTF8.cpp]

1	#include <Poco/UTF8Encoding.h>
2	#include <IO/WriteBufferValidUTF8.h>
3	#include <Core/Types.h>
4
5	#ifdef __SSE2__
6	#include <emmintrin.h>
7	#endif
8
9
10	namespace DB
11	{
12
13	const size_t WriteBufferValidUTF8::DEFAULT_SIZE = `4096`;
14
15	/* Index into the table below with the first byte of a UTF-8 sequence to*
16	* get the number of trailing bytes that are supposed to follow it.
17	* Note that legal UTF-8 values can't have 4 or 5-bytes. The table is
18	* left as-is for anyone who may want to do such conversion, which was
19	* allowed in earlier algorithms.
20	*/
21	extern const UInt8 length_of_utf8_sequence[`256`] =
22	{
23	`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`, `1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,
24	`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`, `1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,
25	`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`, `1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,
26	`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`, `1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,
27	`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`, `1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,
28	`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`, `1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,
29	`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`, `2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,
30	`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`, `4`,`4`,`4`,`4`,`4`,`4`,`4`,`4`,`5`,`5`,`5`,`5`,`6`,`6`,`6`,`6`
31	};
32
33
34	WriteBufferValidUTF8::WriteBufferValidUTF8(
35	WriteBuffer & output_buffer_, bool group_replacements_, const char * replacement_, size_t size)
36	: BufferWithOwnMemory<WriteBuffer>(std::max(static_cast<size_t>(`32`), size)), output_buffer(output_buffer_),
37	group_replacements(group_replacements_), replacement (replacement_)
38	{
39	}
40
41
42	inline void WriteBufferValidUTF8::putReplacement()
43	{
44	if (replacement.empty() \|\| (group_replacements && just_put_replacement))
45	return;
46
47	just_put_replacement = true;
48	output_buffer.write(replacement.data(), replacement.size());
49	}
50
51
52	inline void WriteBufferValidUTF8::putValid(char *data, size_t len)
53	{
54	if (len == `0`)
55	return;
56
57	just_put_replacement = false;
58	output_buffer.write(data, len);
59	}
60
61
62	void WriteBufferValidUTF8::nextImpl()
63	{
64	char * p = memory.data();
65	char * valid_start = p;
66
67	while (p < pos)
68	{
69	#ifdef __SSE2__
70	/// Fast skip of ASCII
71	static constexpr size_t SIMD_BYTES = `16`;
72	const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES;
73
74	while (p < simd_end && !_mm_movemask_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))))
75	p += SIMD_BYTES;
76
77	if (!(p < pos))
78	break;
79	#endif
80
81	size_t len = length_of_utf8_sequence[static_cast<unsigned char>(*p)];
82
83	if (len > `4`)
84	{
85	/// Invalid start of sequence. Skip one byte.
86	putValid(valid_start, p - valid_start);
87	putReplacement();
88	++p;
89	valid_start = p;
90	}
91	else if (p + len > pos)
92	{
93	/// Sequence was not fully written to this buffer.
94	break;
95	}
96	else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<unsigned char *>(p), len))
97	{
98	/// Valid sequence.
99	p += len;
100	}
101	else
102	{
103	/// Invalid sequence. Skip just first byte.
104	putValid(valid_start, p - valid_start);
105	putReplacement();
106	++p;
107	valid_start = p;
108	}
109	}
110
111	putValid(valid_start, p - valid_start);
112
113	size_t cnt = pos - p;
114
115	/// Shift unfinished sequence to start of buffer.
116	for (size_t i = `0`; i < cnt; ++i)
117	memory [i] = p[i];
118
119	working_buffer = Buffer (&memory [cnt], memory.data() + memory.size());
120
121	/// Propagate next() to the output buffer
122	output_buffer.next();
123	}
124
125
126	void WriteBufferValidUTF8::finish()
127	{
128	/// Write all complete sequences from buffer.
129	nextImpl();
130
131	/// If unfinished sequence at end, then write replacement.
132	if (working_buffer.begin() != memory.data())
133	putReplacement();
134	}
135
136
137	WriteBufferValidUTF8::~WriteBufferValidUTF8()
138	{
139	try
140	{
141	finish();
142	}
143	catch (...)
144	{
145	tryLogCurrentException(__PRETTY_FUNCTION__);
146	}
147	}
148
149	}
150

Browse the source code of ClickHouse/dbms/src/IO/WriteBufferValidUTF8.cpp