isValidUTF8.cpp source code [ClickHouse/dbms/src/Functions/isValidUTF8.cpp]

1	#include <DataTypes/DataTypeString.h>
2	#include <Functions/FunctionFactory.h>
3	#include <Functions/FunctionStringOrArrayToT.h>
4
5	#include <cstring>
6
7	#ifdef __SSE4_1__
8	# include <emmintrin.h>
9	# include <smmintrin.h>
10	# include <tmmintrin.h>
11	#endif
12
13	namespace DB
14	{
15	/// inspired by https://github.com/cyb70289/utf8/
16	struct ValidUTF8Impl
17	{
18	/*
19	MIT License
20
21	Copyright (c) 2019 Yibo Cai
22
23	Permission is hereby granted, free of charge, to any person obtaining a copy
24	of this software and associated documentation files (the "Software"), to deal
25	in the Software without restriction, including without limitation the rights
26	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
27	copies of the Software, and to permit persons to whom the Software is
28	furnished to do so, subject to the following conditions:
29
30	The above copyright notice and this permission notice shall be included in all
31	copies or substantial portions of the Software.
32
33	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
37	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
38	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
39	SOFTWARE.
40	*/
41
42	/*
43	* http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
44	*
45	* Table 3-7. Well-Formed UTF-8 Byte Sequences
46	*
47	* +--------------------+------------+-------------+------------+-------------+
48	* \| Code Points \| First Byte \| Second Byte \| Third Byte \| Fourth Byte \|
49	* +--------------------+------------+-------------+------------+-------------+
50	* \| U+0000..U+007F \| 00..7F \| \| \| \|
51	* +--------------------+------------+-------------+------------+-------------+
52	* \| U+0080..U+07FF \| C2..DF \| 80..BF \| \| \|
53	* +--------------------+------------+-------------+------------+-------------+
54	* \| U+0800..U+0FFF \| E0 \| A0..BF \| 80..BF \| \|
55	* +--------------------+------------+-------------+------------+-------------+
56	* \| U+1000..U+CFFF \| E1..EC \| 80..BF \| 80..BF \| \|
57	* +--------------------+------------+-------------+------------+-------------+
58	* \| U+D000..U+D7FF \| ED \| 80..9F \| 80..BF \| \|
59	* +--------------------+------------+-------------+------------+-------------+
60	* \| U+E000..U+FFFF \| EE..EF \| 80..BF \| 80..BF \| \|
61	* +--------------------+------------+-------------+------------+-------------+
62	* \| U+10000..U+3FFFF \| F0 \| 90..BF \| 80..BF \| 80..BF \|
63	* +--------------------+------------+-------------+------------+-------------+
64	* \| U+40000..U+FFFFF \| F1..F3 \| 80..BF \| 80..BF \| 80..BF \|
65	* +--------------------+------------+-------------+------------+-------------+
66	* \| U+100000..U+10FFFF \| F4 \| 80..8F \| 80..BF \| 80..BF \|
67	* +--------------------+------------+-------------+------------+-------------+
68	*/
69
70	static inline UInt8 isValidUTF8Naive(const UInt8 * data, UInt64 len)
71	{
72	while (len)
73	{
74	int bytes;
75	const UInt8 byte1 = data[`0`];
76	/ 00..7F /
77	if (byte1 <= `0x7F`)
78	{
79	bytes = `1`;
80	}
81	/ C2..DF, 80..BF /
82	else if (len >= `2` && byte1 >= `0xC2` && byte1 <= `0xDF` && static_cast<Int8>(data[`1`]) <= static_cast<Int8>(`0xBF`))
83	{
84	bytes = `2`;
85	}
86	else if (len >= `3`)
87	{
88	const UInt8 byte2 = data[`1`];
89	bool byte2_ok = static_cast<Int8>(byte2) <= static_cast<Int8>(`0xBF`);
90	bool byte3_ok = static_cast<Int8>(data[`2`]) <= static_cast<Int8>(`0xBF`);
91
92	if (byte2_ok && byte3_ok &&
93	/ E0, A0..BF, 80..BF /
94	((byte1 == `0xE0` && byte2 >= `0xA0`) \|\|
95	/ E1..EC, 80..BF, 80..BF /
96	(byte1 >= `0xE1` && byte1 <= `0xEC`) \|\|
97	/ ED, 80..9F, 80..BF /
98	(byte1 == `0xED` && byte2 <= `0x9F`) \|\|
99	/ EE..EF, 80..BF, 80..BF /
100	(byte1 >= `0xEE` && byte1 <= `0xEF`)))
101	{
102	bytes = `3`;
103	}
104	else if (len >= `4`)
105	{
106	bool byte4_ok = static_cast<Int8>(data[`3`]) <= static_cast<Int8>(`0xBF`);
107	if (byte2_ok && byte3_ok && byte4_ok &&
108	/ F0, 90..BF, 80..BF, 80..BF /
109	((byte1 == `0xF0` && byte2 >= `0x90`) \|\|
110	/ F1..F3, 80..BF, 80..BF, 80..BF /
111	(byte1 >= `0xF1` && byte1 <= `0xF3`) \|\|
112	/ F4, 80..8F, 80..BF, 80..BF /
113	(byte1 == `0xF4` && byte2 <= `0x8F`)))
114	{
115	bytes = `4`;
116	}
117	else
118	{
119	return false;
120	}
121	}
122	else
123	{
124	return false;
125	}
126	}
127	else
128	{
129	return false;
130	}
131	len -= bytes;
132	data += bytes;
133	}
134	return true;
135	}
136
137	#ifndef __SSE4_1__
138	static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len) { return isValidUTF8Naive(data, len); }
139	#else
140	static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len)
141	{
142	/*
143	* Map high nibble of "First Byte" to legal character length minus 1
144	* 0x00 ~ 0xBF --> 0
145	* 0xC0 ~ 0xDF --> 1
146	* 0xE0 ~ 0xEF --> 2
147	* 0xF0 ~ 0xFF --> 3
148	*/
149	const __m128i first_len_tbl = _mm_setr_epi8(`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `1`, `2`, `3`);
150
151	/ Map "First Byte" to 8-th item of range table (0xC2 ~ 0xF4) /
152	const __m128i first_range_tbl = _mm_setr_epi8(`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `8`, `8`, `8`, `8`);
153
154	/*
155	* Range table, map range index to min and max values
156	*/
157	const __m128i range_min_tbl
158	= _mm_setr_epi8(`0x00`, `0x80`, `0x80`, `0x80`, `0xA0`, `0x80`, `0x90`, `0x80`, `0xC2`, `0x7F`, `0x7F`, `0x7F`, `0x7F`, `0x7F`, `0x7F`, `0x7F`);
159
160	const __m128i range_max_tbl
161	= _mm_setr_epi8(`0x7F`, `0xBF`, `0xBF`, `0xBF`, `0xBF`, `0x9F`, `0xBF`, `0x8F`, `0xF4`, `0x80`, `0x80`, `0x80`, `0x80`, `0x80`, `0x80`, `0x80`);
162
163	/*
164	* Tables for fast handling of four special First Bytes(E0,ED,F0,F4), after
165	* which the Second Byte are not 80~BF. It contains "range index adjustment".
166	* +------------+---------------+------------------+----------------+
167	* \| First Byte \| original range\| range adjustment \| adjusted range \|
168	* +------------+---------------+------------------+----------------+
169	* \| E0 \| 2 \| 2 \| 4 \|
170	* +------------+---------------+------------------+----------------+
171	* \| ED \| 2 \| 3 \| 5 \|
172	* +------------+---------------+------------------+----------------+
173	* \| F0 \| 3 \| 3 \| 6 \|
174	* +------------+---------------+------------------+----------------+
175	* \| F4 \| 4 \| 4 \| 8 \|
176	* +------------+---------------+------------------+----------------+
177	*/
178
179	/ index1 -> E0, index14 -> ED /
180	const __m128i df_ee_tbl = _mm_setr_epi8(`0`, `2`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `3`, `0`);
181
182	/ index1 -> F0, index5 -> F4 /
183	const __m128i ef_fe_tbl = _mm_setr_epi8(`0`, `3`, `0`, `0`, `0`, `4`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`);
184
185	__m128i prev_input = _mm_set1_epi8(`0`);
186	__m128i prev_first_len = _mm_set1_epi8(`0`);
187	__m128i error = _mm_set1_epi8(`0`);
188
189	auto check_packed = [&](__m128i input) noexcept
190	{
191	/ high_nibbles = input >> 4 /
192	const __m128i high_nibbles = _mm_and_si128(_mm_srli_epi16(input, `4`), _mm_set1_epi8(`0x0F`));
193
194	/ first_len = legal character length minus 1 /
195	/ 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF /
196	/ first_len = first_len_tbl[high_nibbles] /
197	__m128i first_len = _mm_shuffle_epi8(first_len_tbl, high_nibbles);
198
199	/ First Byte: set range index to 8 for bytes within 0xC0 ~ 0xFF /
200	/ range = first_range_tbl[high_nibbles] /
201	__m128i range = _mm_shuffle_epi8(first_range_tbl, high_nibbles);
202
203	/ Second Byte: set range index to first_len /
204	/ 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF /
205	/ range \|= (first_len, prev_first_len) << 1 byte /
206	range = _mm_or_si128(range, _mm_alignr_epi8(first_len, prev_first_len, `15`));
207
208	/ Third Byte: set range index to saturate_sub(first_len, 1) /
209	/ 0 for 00~7F, 0 for C0~DF, 1 for E0~EF, 2 for F0~FF /
210	__m128i tmp1;
211	__m128i tmp2;
212	/ tmp1 = saturate_sub(first_len, 1) /
213	tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(`1`));
214	/ tmp2 = saturate_sub(prev_first_len, 1) /
215	tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(`1`));
216	/ range \|= (tmp1, tmp2) << 2 bytes /
217	range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, `14`));
218
219	/ Fourth Byte: set range index to saturate_sub(first_len, 2) /
220	/ 0 for 00~7F, 0 for C0~DF, 0 for E0~EF, 1 for F0~FF /
221	/ tmp1 = saturate_sub(first_len, 2) /
222	tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(`2`));
223	/ tmp2 = saturate_sub(prev_first_len, 2) /
224	tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(`2`));
225	/ range \|= (tmp1, tmp2) << 3 bytes /
226	range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, `13`));
227
228	/*
229	* Now we have below range indices caluclated
230	* Correct cases:
231	* - 8 for C0~FF
232	* - 3 for 1st byte after F0~FF
233	* - 2 for 1st byte after E0~EF or 2nd byte after F0~FF
234	* - 1 for 1st byte after C0~DF or 2nd byte after E0~EF or
235	* 3rd byte after F0~FF
236	* - 0 for others
237	* Error cases:
238	* 9,10,11 if non ascii First Byte overlaps
239	* E.g., F1 80 C2 90 --> 8 3 10 2, where 10 indicates error
240	*/
241
242	/ Adjust Second Byte range for special First Bytes(E0,ED,F0,F4) /
243	/ Overlaps lead to index 9~15, which are illegal in range table /
244	__m128i shift1, pos, range2;
245	/ shift1 = (input, prev_input) << 1 byte /
246	shift1 = _mm_alignr_epi8(input, prev_input, `15`);
247	pos = _mm_sub_epi8(shift1, _mm_set1_epi8(`0xEF`));
248	/*
249	* shift1: \| EF F0 ... FE \| FF 00 ... ... DE \| DF E0 ... EE \|
250	* pos: \| 0 1 15 \| 16 17 239\| 240 241 255\|
251	* pos-240: \| 0 0 0 \| 0 0 0 \| 0 1 15 \|
252	* pos+112: \| 112 113 127\| >= 128 \| >= 128 \|
253	*/
254	tmp1 = _mm_subs_epu8(pos, _mm_set1_epi8(`0xF0`));
255	range2 = _mm_shuffle_epi8(df_ee_tbl, tmp1);
256	tmp2 = _mm_adds_epu8(pos, _mm_set1_epi8(`112`));
257	range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_tbl, tmp2));
258
259	range = _mm_add_epi8(range, range2);
260
261	/ Load min and max values per calculated range index /
262	__m128i minv = _mm_shuffle_epi8(range_min_tbl, range);
263	__m128i maxv = _mm_shuffle_epi8(range_max_tbl, range);
264
265	/ Check value range /
266	error = _mm_or_si128(error, _mm_cmplt_epi8(input, minv));
267	error = _mm_or_si128(error, _mm_cmpgt_epi8(input, maxv));
268
269	prev_input = input;
270	prev_first_len = first_len;
271
272	data += `16`;
273	len -= `16`;
274	};
275
276	while (len >= `16`)
277	check_packed(_mm_loadu_si128(reinterpret_cast<const __m128i *>(data)));
278
279	/// 0 <= len <= 15 for now. Reading data from data - 1 because of right padding of 15 and left padding
280	/// Then zero some bytes from the unknown memory and check again.
281	alignas(`16`) char buf[`32`];
282	_mm_store_si128(reinterpret_cast<__m128i >(buf), _mm_loadu_si128(reinterpret_cast<const* __m128i *>(data - `1`)));
283	memset(buf + len + `1`, `0`, `16`);
284	check_packed(_mm_loadu_si128(reinterpret_cast<__m128i *>(buf + `1`)));
285
286	return _mm_testz_si128(error, error);
287	}
288	#endif
289
290	static constexpr bool is_fixed_to_constant = false;
291
292	static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray<UInt8> & res)
293	{
294	size_t size = offsets.size();
295	size_t prev_offset = `0`;
296	for (size_t i = `0`; i < size; ++i)
297	{
298	res [i] = isValidUTF8(data.data() + prev_offset, offsets [i] - `1` - prev_offset);
299	prev_offset = offsets [i];
300	}
301	}
302
303	static void vector_fixed_to_constant(const ColumnString::Chars & /data/, size_t /n/, UInt8 & /res/) {}
304
305	static void vector_fixed_to_vector(const ColumnString::Chars & data, size_t n, PaddedPODArray<UInt8> & res)
306	{
307	size_t size = data.size() / n;
308	for (size_t i = `0`; i < size; ++i)
309	res [i] = isValidUTF8(data.data() + i * n, n);
310	}
311
312	[[noreturn]] static void array(const ColumnString::Offsets &, PaddedPODArray<UInt8> &)
313	{
314	throw Exception ("Cannot apply function isValidUTF8 to Array argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
315	}
316	};
317
318	struct NameIsValidUTF8
319	{
320	static constexpr auto name = "isValidUTF8";
321	};
322	using FunctionValidUTF8 = FunctionStringOrArrayToT<ValidUTF8Impl, NameIsValidUTF8, UInt8>;
323
324	void registerFunctionIsValidUTF8(FunctionFactory & factory)
325	{
326	factory.registerFunction<FunctionValidUTF8>();
327	}
328
329	}
330

Browse the source code of ClickHouse/dbms/src/Functions/isValidUTF8.cpp