utf8_lookup2_algorithm.h source code [ClickHouse/contrib/simdjson/src/generic/utf8_lookup2_algorithm.h]

1	//
2	// Detect Unicode errors.
3	//
4	// UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic
5	// encoding that uses the first few bits on each byte to denote a "byte type", and all other bits
6	// are straight up concatenated into the final value. The first byte of a multibyte character is a
7	// "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte
8	// lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just
9	// start with 0, because that's what ASCII looks like. Here's what each size
10	//
11	// - ASCII (7 bits): 0_______
12	// - 2 byte character (11 bits): 110_____ 10______
13	// - 3 byte character (17 bits): 1110____ 10______ 10______
14	// - 4 byte character (23 bits): 11110___ 10______ 10______ 10______
15	// - 5+ byte character (illegal): 11111___ <illegal>
16	//
17	// There are 5 classes of error that can happen in Unicode:
18	//
19	// - TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation).
20	// We detect this by looking for new characters (lead bytes) inside the range of a multibyte
21	// character.
22	//
23	// e.g. 11000000 01100001 (2-byte character where second byte is ASCII)
24	//
25	// - TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation).
26	// We detect this by requiring that the next byte after your multibyte character be a new
27	// character--so a continuation after your character is wrong.
28	//
29	// e.g. 11011111 10111111 10111111 (2-byte character followed by another* continuation byte)*
30	//
31	// - TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large.
32	//
33	// e.g. 11110111 10111111 10111111 10111111 (bigger than 10FFFF).
34	//
35	// - OVERLONG: multibyte characters with a bunch of leading zeroes, where you could have
36	// used fewer bytes to make the same character. Like encoding an ASCII character in 4 bytes is
37	// technically possible, but UTF-8 disallows it so that there is only one way to write an "a".
38	//
39	// e.g. 11000001 10100001 (2-byte encoding of "a", which only requires 1 byte: 01100001)
40	//
41	// - SURROGATE: Unicode U+D800-U+DFFF is a surrogate* character, reserved for use in UCS-2 and*
42	// WTF-8 encodings for characters with > 2 bytes. These are illegal in pure UTF-8.
43	//
44	// e.g. 11101101 10100000 10000000 (U+D800)
45	//
46	// - INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not
47	// support values with more than 23 bits (which a 4-byte character supports).
48	//
49	// e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000)
50	//
51	// Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94:
52	//
53	// Code Points 1st 2s 3s 4s
54	// U+0000..U+007F 00..7F
55	// U+0080..U+07FF C2..DF 80..BF
56	// U+0800..U+0FFF E0 A0..BF 80..BF
57	// U+1000..U+CFFF E1..EC 80..BF 80..BF
58	// U+D000..U+D7FF ED 80..9F 80..BF
59	// U+E000..U+FFFF EE..EF 80..BF 80..BF
60	// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
61	// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
62	// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
63	//
64	using namespace simd;
65
66	namespace utf8_validation {
67
68	//
69	// Find special case UTF-8 errors where the character is technically readable (has the right length)
70	// but the value* is disallowed.*
71	//
72	// This includes overlong encodings, surrogates and values too large for Unicode.
73	//
74	// It turns out the bad character ranges can all be detected by looking at the first 12 bits of the
75	// UTF-8 encoded character (i.e. all of byte 1, and the high 4 bits of byte 2). This algorithm does a
76	// 3 4-bit table lookups, identifying which errors that 4 bits could match, and then &'s them together.
77	// If all 3 lookups detect the same error, it's an error.
78	//
79	really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
80	//
81	// These are the errors we're going to match for bytes 1-2, by looking at the first three
82	// nibbles of the character: <high bits of byte 1>> & <low bits of byte 1> & <high bits of byte 2>
83	//
84	static const int OVERLONG_2 = `0x01`; // 1100000_ 10______ (technically we match 10______ but we could match ________, they both yield errors either way)
85	static const int OVERLONG_3 = `0x02`; // 11100000 100_____ ________
86	static const int OVERLONG_4 = `0x04`; // 11110000 1000____ ________ ________
87	static const int SURROGATE = `0x08`; // 11101101 [101_]____
88	static const int TOO_LARGE = `0x10`; // 11110100 (1001\|101_)____
89	static const int TOO_LARGE_2 = `0x20`; // 1111(1___\|011_\|0101) 10______
90
91	// After processing the rest of byte 1 (the low bits), we're still not done--we have to check
92	// byte 2 to be sure which things are errors and which aren't.
93	// Since high_bits is byte 5, byte 2 is high_bits.prev<3>
94	static const int CARRY = OVERLONG_2 \| TOO_LARGE_2;
95	const simd8<uint8_t> byte_2_high = input.shr<`4`>().lookup_16<uint8_t>(
96	// ASCII: ________ [0___]____
97	CARRY, CARRY, CARRY, CARRY,
98	// ASCII: ________ [0___]____
99	CARRY, CARRY, CARRY, CARRY,
100	// Continuations: ________ [10__]____
101	CARRY \| OVERLONG_3 \| OVERLONG_4, // ________ [1000]____
102	CARRY \| OVERLONG_3 \| TOO_LARGE, // ________ [1001]____
103	CARRY \| TOO_LARGE \| SURROGATE, // ________ [1010]____
104	CARRY \| TOO_LARGE \| SURROGATE, // ________ [1011]____
105	// Multibyte Leads: ________ [11__]____
106	CARRY, CARRY, CARRY, CARRY
107	);
108
109	const simd8<uint8_t> byte_1_high = prev1.shr<`4`>().lookup_16<uint8_t>(
110	// [0___]____ (ASCII)
111	`0`, `0`, `0`, `0`,
112	`0`, `0`, `0`, `0`,
113	// [10__]____ (continuation)
114	`0`, `0`, `0`, `0`,
115	// [11__]____ (2+-byte leads)
116	OVERLONG_2, `0`, // [110_]____ (2-byte lead)
117	OVERLONG_3 \| SURROGATE, // [1110]____ (3-byte lead)
118	OVERLONG_4 \| TOO_LARGE \| TOO_LARGE_2 // [1111]____ (4+-byte lead)
119	);
120
121	const simd8<uint8_t> byte_1_low = (prev1 & `0x0F`).lookup_16<uint8_t>(
122	// ____[00__] ________
123	OVERLONG_2 \| OVERLONG_3 \| OVERLONG_4, // ____[0000] ________
124	OVERLONG_2, // ____[0001] ________
125	`0`, `0`,
126	// ____[01__] ________
127	TOO_LARGE, // ____[0100] ________
128	TOO_LARGE_2,
129	TOO_LARGE_2,
130	TOO_LARGE_2,
131	// ____[10__] ________
132	TOO_LARGE_2, TOO_LARGE_2, TOO_LARGE_2, TOO_LARGE_2,
133	// ____[11__] ________
134	TOO_LARGE_2,
135	TOO_LARGE_2 \| SURROGATE, // ____[1101] ________
136	TOO_LARGE_2, TOO_LARGE_2
137	);
138
139	return byte_1_high & byte_1_low & byte_2_high;
140	}
141
142	//
143	// Validate the length of multibyte characters (that each multibyte character has the right number
144	// of continuation characters, and that all continuation characters are part of a multibyte
145	// character).
146	//
147	// Algorithm
148	// =========
149	//
150	// This algorithm compares expected* continuation characters with actual continuation bytes,*
151	// and emits an error anytime there is a mismatch.
152	//
153	// For example, in the string "𝄞₿֏ab", which has a 4-, 3-, 2- and 1-byte
154	// characters, the file will look like this:
155	//
156	// \| Character \| 𝄞 \| \| \| \| ₿ \| \| \| ֏ \| \| a \| b \|
157	// \|-----------------------\|----\|----\|----\|----\|----\|----\|----\|----\|----\|----\|----\|
158	// \| Character Length \| 4 \| \| \| \| 3 \| \| \| 2 \| \| 1 \| 1 \|
159	// \| Byte \| F0 \| 9D \| 84 \| 9E \| E2 \| 82 \| BF \| D6 \| 8F \| 61 \| 62 \|
160	// \| is_second_byte \| \| X \| \| \| \| X \| \| \| X \| \| \|
161	// \| is_third_byte \| \| \| X \| \| \| \| X \| \| \| \| \|
162	// \| is_fourth_byte \| \| \| \| X \| \| \| \| \| \| \| \|
163	// \| expected_continuation \| \| X \| X \| X \| \| X \| X \| \| X \| \| \|
164	// \| is_continuation \| \| X \| X \| X \| \| X \| X \| \| X \| \| \|
165	//
166	// The errors here are basically (Second Byte OR Third Byte OR Fourth Byte == Continuation):
167	//
168	// - Extra Continuations:* Any continuation that is not a second, third or fourth byte is not*
169	// part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just
170	// floating around extra outside of any character, or that there is an illegal 5-byte character,
171	// or maybe it's at the beginning of the file before any characters have started; but it's an
172	// error in all these cases.
173	// - Missing Continuations:* Any second, third or fourth byte that isn't a continuation is an error, because that means*
174	// we started a new character before we were finished with the current one.
175	//
176	// Getting the Previous Bytes
177	// --------------------------
178	//
179	// Because we want to know if a byte is the second* (or third, or fourth) byte of a multibyte*
180	// character, we need to "shift the bytes" to find that out. This is what they mean:
181	//
182	// - `is_continuation`: if the current byte is a continuation.
183	// - `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character.
184	// - `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character.
185	// - `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character.
186	//
187	// We use shuffles to go n bytes back, selecting part of the current `input` and part of the
188	// `prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller
189	// function, because the 1-byte-back data is used by other checks as well.
190	//
191	// Getting the Continuation Mask
192	// -----------------------------
193	//
194	// Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as
195	// numbers, using signed `<` and `>` operations to check if they are continuations or leads.
196	// In fact, we treat the numbers as signed, partly because it helps us, and partly because
197	// Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones).
198	//
199	// In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads,"
200	// respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them.
201	// Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0.
202	//
203	// When treated as signed numbers, they look like this:
204	//
205	// \| Type \| High Bits \| Binary Range \| Signed \|
206	// \|--------------\|------------\|--------------\|--------\|
207	// \| ASCII \| `0` \| `01111111` \| 127 \|
208	// \| \| \| `00000000` \| 0 \|
209	// \| 4+-Byte Lead \| `1111` \| `11111111` \| -1 \|
210	// \| \| \| `11110000 \| -16 \|
211	// \| 3-Byte Lead \| `1110` \| `11101111` \| -17 \|
212	// \| \| \| `11100000 \| -32 \|
213	// \| 2-Byte Lead \| `110` \| `11011111` \| -33 \|
214	// \| \| \| `11000000 \| -64 \|
215	// \| Continuation \| `10` \| `10111111` \| -65 \|
216	// \| \| \| `10000000 \| -128 \|
217	//
218	// This makes it pretty easy to get the continuation mask! It's just a single comparison:
219	//
220	// ```
221	// is_continuation = input < -64`
222	// ```
223	//
224	// We can do something similar for the others, but it takes two comparisons instead of one: "is
225	// the start of a 4-byte character" is `< -32` and `> -65`, for example. And 2+ bytes is `< 0` and
226	// `> -64`. Surely we can do better, they're right next to each other!
227	//
228	// Getting the is_xxx Masks: Shifting the Range
229	// --------------------------------------------
230	//
231	// Notice why* continuations were a single comparison. The actual range would require two*
232	// comparisons--`< -64` and `> -129`--but all characters are always greater than -128, so we get
233	// that for free. In fact, if we had unsigned* comparisons, 2+, 3+ and 4+ comparisons would be*
234	// just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`.
235	//
236	// Instead, we add 128 to each byte, shifting the range up to make comparison easy. This wraps
237	// ASCII down into the negative, and puts 4+-Byte Lead at the top:
238	//
239	// \| Type \| High Bits \| Binary Range \| Signed \|
240	// \|----------------------\|------------\|--------------\|-------\|
241	// \| 4+-Byte Lead (+ 127) \| `0111` \| `01111111` \| 127 \|
242	// \| \| \| `01110000 \| 112 \|
243	// \|----------------------\|------------\|--------------\|-------\|
244	// \| 3-Byte Lead (+ 127) \| `0110` \| `01101111` \| 111 \|
245	// \| \| \| `01100000 \| 96 \|
246	// \|----------------------\|------------\|--------------\|-------\|
247	// \| 2-Byte Lead (+ 127) \| `010` \| `01011111` \| 95 \|
248	// \| \| \| `01000000 \| 64 \|
249	// \|----------------------\|------------\|--------------\|-------\|
250	// \| Continuation (+ 127) \| `00` \| `00111111` \| 63 \|
251	// \| \| \| `00000000 \| 0 \|
252	// \|----------------------\|------------\|--------------\|-------\|
253	// \| ASCII (+ 127) \| `1` \| `11111111` \| -1 \|
254	// \| \| \| `10000000` \| -128 \|
255	// \|----------------------\|------------\|--------------\|-------\|
256	//
257	// Now* we can use signed `>` on all of them:*
258	//
259	// ```
260	// prev1 = input.prev<1>
261	// prev2 = input.prev<2>
262	// prev3 = input.prev<3>
263	// prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128`
264	// prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128`
265	// prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128`
266	// is_second_byte = prev1_flipped > 63; // 2+-byte lead
267	// is_third_byte = prev2_flipped > 95; // 3+-byte lead
268	// is_fourth_byte = prev3_flipped > 111; // 4+-byte lead
269	// ```
270	//
271	// NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number
272	// of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3
273	// `^`'s at a time on Haswell, but only 2 `+`'s).
274	//
275	// That doesn't look like it saved us any instructions, did it? Well, because we're adding the
276	// same number to all of them, we can save one of those `+ 128` operations by assembling
277	// `prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128
278	// to it. One more instruction saved!
279	//
280	// ```
281	// prev1 = input.prev<1>
282	// prev3 = input.prev<3>
283	// prev1_flipped = prev1 ^ 0x80; // Same as `+ 128`
284	// prev3_flipped = prev3 ^ 0x80; // Same as `+ 128`
285	// prev2_flipped = prev1_flipped.concat<2>(prev3_flipped): // <shuffle: take the first 2 bytes from prev1 and the rest from prev3
286	// ```
287	//
288	// ### Bringing It All Together: Detecting the Errors
289	//
290	// At this point, we have `is_continuation`, `is_first_byte`, `is_second_byte` and `is_third_byte`.
291	// All we have left to do is check if they match!
292	//
293	// ```
294	// return (is_second_byte \| is_third_byte \| is_fourth_byte) ^ is_continuation;
295	// ```
296	//
297	// But wait--there's more. The above statement is only 3 operations, but they cannot be done in*
298	// parallel. You have to do 2 `\|`'s and then 1 `&`. Haswell, at least, has 3 ports that can do*
299	// bitwise operations, and we're only using 1!
300	//
301	// Epilogue: Addition For Booleans
302	// -------------------------------
303	//
304	// There is one big case the above code doesn't explicitly talk about--what if is_second_byte
305	// and is_third_byte are BOTH true? That means there is a 3-byte and 2-byte character right next
306	// to each other (or any combination), and the continuation could be part of either of them!
307	// Our algorithm using `&` and `\|` won't detect that the continuation byte is problematic.
308	//
309	// Never fear, though. If that situation occurs, we'll already have detected that the second
310	// leading byte was an error, because it was supposed to be a part of the preceding multibyte
311	// character, but it wasn't a continuation.
312	//
313	// We could stop here, but it turns out that we can fix it using `+` and `-` instead of `\|` and
314	// `&`, which is both interesting and possibly useful (even though we're not using it here). It
315	// exploits the fact that in SIMD, a true* value is -1, and a false value is 0. So those*
316	// comparisons were giving us numbers!
317	//
318	// Given that, if you do `is_second_byte + is_third_byte + is_fourth_byte`, under normal
319	// circumstances you will either get 0 (0 + 0 + 0) or -1 (-1 + 0 + 0, etc.). Thus,
320	// `(is_second_byte + is_third_byte + is_fourth_byte) - is_continuation` will yield 0 only if
321	// both* or neither are 0 (0-0 or -1 - -1). You'll get 1 or -1 if they are different. Because*
322	// any* nonzero value is treated as an error (not just -1), we're just fine here :)*
323	//
324	// Further, if more than one* multibyte character overlaps,*
325	// `is_second_byte + is_third_byte + is_fourth_byte` will be -2 or -3! Subtracting `is_continuation`
326	// from that* is guaranteed to give you a nonzero value (-1, -2 or -3). So it'll always be*
327	// considered an error.
328	//
329	// One reason you might want to do this is parallelism. ^ and \| are not associative, so
330	// (A \| B \| C) ^ D will always be three operations in a row: either you do A \| B -> \| C -> ^ D, or
331	// you do B \| C -> \| A -> ^ D. But addition and subtraction are* associative: (A + B + C) - D can*
332	// be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and
333	// then adds the result together. Same number of operations, but if the processor can run
334	// independent things in parallel (which most can), it runs faster.
335	//
336	// This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, \| and ^ have
337	// a super nice advantage in that more of them can be run at the same time (they can run on 3
338	// ports, while + and - can run on 2)! This means that we can do A \| B while we're still doing C,
339	// saving us the cycle we would have earned by using +. Even more, using an instruction with a
340	// wider array of ports can help other* code run ahead, too, since these instructions can "get*
341	// out of the way," running on a port other instructions can't.
342	//
343	// Epilogue II: One More Trick
344	// ---------------------------
345	//
346	// There's one more relevant trick up our sleeve, it turns out: it turns out on Intel we can "pay
347	// for" the (prev<1> + 128) instruction, because it can be used to save an instruction in
348	// check_special_cases()--but we'll talk about that there :)
349	//
350	really_inline simd8<uint8_t> check_multibyte_lengths(simd8<uint8_t> input, simd8<uint8_t> prev_input, simd8<uint8_t> prev1) {
351	simd8<uint8_t> prev2 = input.prev<`2`>(prev_input);
352	simd8<uint8_t> prev3 = input.prev<`3`>(prev_input);
353
354	// Cont is 10000000-101111111 (-65...-128)
355	simd8<bool> is_continuation = simd8<int8_t>(input) < int8_t(-`64`);
356	// must_be_continuation is architecture-specific because Intel doesn't have unsigned comparisons
357	return simd8<uint8_t>(must_be_continuation(prev1, prev2, prev3) ^ is_continuation);
358	}
359
360	//
361	// Return nonzero if there are incomplete multibyte characters at the end of the block:
362	// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
363	//
364	really_inline simd8<uint8_t> is_incomplete(simd8<uint8_t> input) {
365	// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
366	// ... 1111____ 111_____ 11______
367	static const uint8_t max_array[`32`] = {
368	`255`, `255`, `255`, `255`, `255`, `255`, `255`, `255`,
369	`255`, `255`, `255`, `255`, `255`, `255`, `255`, `255`,
370	`255`, `255`, `255`, `255`, `255`, `255`, `255`, `255`,
371	`255`, `255`, `255`, `255`, `255`, `0b11110000u`-`1`, `0b11100000u`-`1`, `0b11000000u`-`1`
372	};
373	const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
374	return input.gt_bits(max_value);
375	}
376
377	struct utf8_checker {
378	// If this is nonzero, there has been a UTF-8 error.
379	simd8<uint8_t> error;
380	// The last input we received
381	simd8<uint8_t> prev_input_block;
382	// Whether the last input we received was incomplete (used for ASCII fast path)
383	simd8<uint8_t> prev_incomplete;
384
385	//
386	// Check whether the current bytes are valid UTF-8.
387	//
388	really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
389	// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
390	// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
391	simd8<uint8_t> prev1 = input.prev<`1`>(prev_input);
392	this->error \|= check_special_cases(input, prev1);
393	this->error \|= check_multibyte_lengths(input, prev_input, prev1);
394	}
395
396	// The only problem that can happen at EOF is that a multibyte character is too short.
397	really_inline void check_eof() {
398	// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
399	// possibly finish them.
400	this->error \|= this->prev_incomplete;
401	}
402
403	really_inline void check_next_input(simd8x64<uint8_t> input) {
404	if (likely(is_ascii(input))) {
405	// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
406	// possibly finish them.
407	this->error \|= this->prev_incomplete;
408	} else {
409	this->check_utf8_bytes(input.chunks[`0`], this->prev_input_block);
410	for (int i=`1`; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) {
411	this->check_utf8_bytes(input.chunks[i], input.chunks[i-`1`]);
412	}
413	this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-`1`]);
414	this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-`1`];
415	}
416	}
417
418	really_inline ErrorValues errors() {
419	return this->error.any_bits_set_anywhere() ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
420	}
421
422	}; // struct utf8_checker
423	}
424
425	using utf8_validation::utf8_checker;

Browse the source code of ClickHouse/contrib/simdjson/src/generic/utf8_lookup2_algorithm.h