1//
2// Detect Unicode errors.
3//
4// UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic
5// encoding that uses the first few bits on each byte to denote a "byte type", and all other bits
6// are straight up concatenated into the final value. The first byte of a multibyte character is a
7// "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte
8// lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just
9// start with 0, because that's what ASCII looks like. Here's what each size
10//
11// - ASCII (7 bits): 0_______
12// - 2 byte character (11 bits): 110_____ 10______
13// - 3 byte character (17 bits): 1110____ 10______ 10______
14// - 4 byte character (23 bits): 11110___ 10______ 10______ 10______
15// - 5+ byte character (illegal): 11111___ <illegal>
16//
17// There are 5 classes of error that can happen in Unicode:
18//
19// - TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation).
20// We detect this by looking for new characters (lead bytes) inside the range of a multibyte
21// character.
22//
23// e.g. 11000000 01100001 (2-byte character where second byte is ASCII)
24//
25// - TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation).
26// We detect this by requiring that the next byte after your multibyte character be a new
27// character--so a continuation after your character is wrong.
28//
29// e.g. 11011111 10111111 10111111 (2-byte character followed by *another* continuation byte)
30//
31// - TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large.
32//
33// e.g. 11110111 10111111 10111111 10111111 (bigger than 10FFFF).
34//
35// - OVERLONG: multibyte characters with a bunch of leading zeroes, where you could have
36// used fewer bytes to make the same character. Like encoding an ASCII character in 4 bytes is
37// technically possible, but UTF-8 disallows it so that there is only one way to write an "a".
38//
39// e.g. 11000001 10100001 (2-byte encoding of "a", which only requires 1 byte: 01100001)
40//
41// - SURROGATE: Unicode U+D800-U+DFFF is a *surrogate* character, reserved for use in UCS-2 and
42// WTF-8 encodings for characters with > 2 bytes. These are illegal in pure UTF-8.
43//
44// e.g. 11101101 10100000 10000000 (U+D800)
45//
46// - INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not
47// support values with more than 23 bits (which a 4-byte character supports).
48//
49// e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000)
50//
51// Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94:
52//
53// Code Points 1st 2s 3s 4s
54// U+0000..U+007F 00..7F
55// U+0080..U+07FF C2..DF 80..BF
56// U+0800..U+0FFF E0 A0..BF 80..BF
57// U+1000..U+CFFF E1..EC 80..BF 80..BF
58// U+D000..U+D7FF ED 80..9F 80..BF
59// U+E000..U+FFFF EE..EF 80..BF 80..BF
60// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
61// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
62// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
63//
64using namespace simd;
65
66namespace utf8_validation {
67
68 //
69 // Find special case UTF-8 errors where the character is technically readable (has the right length)
70 // but the *value* is disallowed.
71 //
72 // This includes overlong encodings, surrogates and values too large for Unicode.
73 //
74 // It turns out the bad character ranges can all be detected by looking at the first 12 bits of the
75 // UTF-8 encoded character (i.e. all of byte 1, and the high 4 bits of byte 2). This algorithm does a
76 // 3 4-bit table lookups, identifying which errors that 4 bits could match, and then &'s them together.
77 // If all 3 lookups detect the same error, it's an error.
78 //
79 really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
80 //
81 // These are the errors we're going to match for bytes 1-2, by looking at the first three
82 // nibbles of the character: <high bits of byte 1>> & <low bits of byte 1> & <high bits of byte 2>
83 //
84 static const int OVERLONG_2 = 0x01; // 1100000_ 10______ (technically we match 10______ but we could match ________, they both yield errors either way)
85 static const int OVERLONG_3 = 0x02; // 11100000 100_____ ________
86 static const int OVERLONG_4 = 0x04; // 11110000 1000____ ________ ________
87 static const int SURROGATE = 0x08; // 11101101 [101_]____
88 static const int TOO_LARGE = 0x10; // 11110100 (1001|101_)____
89 static const int TOO_LARGE_2 = 0x20; // 1111(1___|011_|0101) 10______
90
91 // After processing the rest of byte 1 (the low bits), we're still not done--we have to check
92 // byte 2 to be sure which things are errors and which aren't.
93 // Since high_bits is byte 5, byte 2 is high_bits.prev<3>
94 static const int CARRY = OVERLONG_2 | TOO_LARGE_2;
95 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
96 // ASCII: ________ [0___]____
97 CARRY, CARRY, CARRY, CARRY,
98 // ASCII: ________ [0___]____
99 CARRY, CARRY, CARRY, CARRY,
100 // Continuations: ________ [10__]____
101 CARRY | OVERLONG_3 | OVERLONG_4, // ________ [1000]____
102 CARRY | OVERLONG_3 | TOO_LARGE, // ________ [1001]____
103 CARRY | TOO_LARGE | SURROGATE, // ________ [1010]____
104 CARRY | TOO_LARGE | SURROGATE, // ________ [1011]____
105 // Multibyte Leads: ________ [11__]____
106 CARRY, CARRY, CARRY, CARRY
107 );
108
109 const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
110 // [0___]____ (ASCII)
111 0, 0, 0, 0,
112 0, 0, 0, 0,
113 // [10__]____ (continuation)
114 0, 0, 0, 0,
115 // [11__]____ (2+-byte leads)
116 OVERLONG_2, 0, // [110_]____ (2-byte lead)
117 OVERLONG_3 | SURROGATE, // [1110]____ (3-byte lead)
118 OVERLONG_4 | TOO_LARGE | TOO_LARGE_2 // [1111]____ (4+-byte lead)
119 );
120
121 const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
122 // ____[00__] ________
123 OVERLONG_2 | OVERLONG_3 | OVERLONG_4, // ____[0000] ________
124 OVERLONG_2, // ____[0001] ________
125 0, 0,
126 // ____[01__] ________
127 TOO_LARGE, // ____[0100] ________
128 TOO_LARGE_2,
129 TOO_LARGE_2,
130 TOO_LARGE_2,
131 // ____[10__] ________
132 TOO_LARGE_2, TOO_LARGE_2, TOO_LARGE_2, TOO_LARGE_2,
133 // ____[11__] ________
134 TOO_LARGE_2,
135 TOO_LARGE_2 | SURROGATE, // ____[1101] ________
136 TOO_LARGE_2, TOO_LARGE_2
137 );
138
139 return byte_1_high & byte_1_low & byte_2_high;
140 }
141
142 //
143 // Validate the length of multibyte characters (that each multibyte character has the right number
144 // of continuation characters, and that all continuation characters are part of a multibyte
145 // character).
146 //
147 // Algorithm
148 // =========
149 //
150 // This algorithm compares *expected* continuation characters with *actual* continuation bytes,
151 // and emits an error anytime there is a mismatch.
152 //
153 // For example, in the string "𝄞₿֏ab", which has a 4-, 3-, 2- and 1-byte
154 // characters, the file will look like this:
155 //
156 // | Character | 𝄞 | | | | ₿ | | | ֏ | | a | b |
157 // |-----------------------|----|----|----|----|----|----|----|----|----|----|----|
158 // | Character Length | 4 | | | | 3 | | | 2 | | 1 | 1 |
159 // | Byte | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 |
160 // | is_second_byte | | X | | | | X | | | X | | |
161 // | is_third_byte | | | X | | | | X | | | | |
162 // | is_fourth_byte | | | | X | | | | | | | |
163 // | expected_continuation | | X | X | X | | X | X | | X | | |
164 // | is_continuation | | X | X | X | | X | X | | X | | |
165 //
166 // The errors here are basically (Second Byte OR Third Byte OR Fourth Byte == Continuation):
167 //
168 // - **Extra Continuations:** Any continuation that is not a second, third or fourth byte is not
169 // part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just
170 // floating around extra outside of any character, or that there is an illegal 5-byte character,
171 // or maybe it's at the beginning of the file before any characters have started; but it's an
172 // error in all these cases.
173 // - **Missing Continuations:** Any second, third or fourth byte that *isn't* a continuation is an error, because that means
174 // we started a new character before we were finished with the current one.
175 //
176 // Getting the Previous Bytes
177 // --------------------------
178 //
179 // Because we want to know if a byte is the *second* (or third, or fourth) byte of a multibyte
180 // character, we need to "shift the bytes" to find that out. This is what they mean:
181 //
182 // - `is_continuation`: if the current byte is a continuation.
183 // - `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character.
184 // - `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character.
185 // - `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character.
186 //
187 // We use shuffles to go n bytes back, selecting part of the current `input` and part of the
188 // `prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller
189 // function, because the 1-byte-back data is used by other checks as well.
190 //
191 // Getting the Continuation Mask
192 // -----------------------------
193 //
194 // Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as
195 // numbers, using signed `<` and `>` operations to check if they are continuations or leads.
196 // In fact, we treat the numbers as *signed*, partly because it helps us, and partly because
197 // Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones).
198 //
199 // In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads,"
200 // respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them.
201 // Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0.
202 //
203 // When treated as signed numbers, they look like this:
204 //
205 // | Type | High Bits | Binary Range | Signed |
206 // |--------------|------------|--------------|--------|
207 // | ASCII | `0` | `01111111` | 127 |
208 // | | | `00000000` | 0 |
209 // | 4+-Byte Lead | `1111` | `11111111` | -1 |
210 // | | | `11110000 | -16 |
211 // | 3-Byte Lead | `1110` | `11101111` | -17 |
212 // | | | `11100000 | -32 |
213 // | 2-Byte Lead | `110` | `11011111` | -33 |
214 // | | | `11000000 | -64 |
215 // | Continuation | `10` | `10111111` | -65 |
216 // | | | `10000000 | -128 |
217 //
218 // This makes it pretty easy to get the continuation mask! It's just a single comparison:
219 //
220 // ```
221 // is_continuation = input < -64`
222 // ```
223 //
224 // We can do something similar for the others, but it takes two comparisons instead of one: "is
225 // the start of a 4-byte character" is `< -32` and `> -65`, for example. And 2+ bytes is `< 0` and
226 // `> -64`. Surely we can do better, they're right next to each other!
227 //
228 // Getting the is_xxx Masks: Shifting the Range
229 // --------------------------------------------
230 //
231 // Notice *why* continuations were a single comparison. The actual *range* would require two
232 // comparisons--`< -64` and `> -129`--but all characters are always greater than -128, so we get
233 // that for free. In fact, if we had *unsigned* comparisons, 2+, 3+ and 4+ comparisons would be
234 // just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`.
235 //
236 // Instead, we add 128 to each byte, shifting the range up to make comparison easy. This wraps
237 // ASCII down into the negative, and puts 4+-Byte Lead at the top:
238 //
239 // | Type | High Bits | Binary Range | Signed |
240 // |----------------------|------------|--------------|-------|
241 // | 4+-Byte Lead (+ 127) | `0111` | `01111111` | 127 |
242 // | | | `01110000 | 112 |
243 // |----------------------|------------|--------------|-------|
244 // | 3-Byte Lead (+ 127) | `0110` | `01101111` | 111 |
245 // | | | `01100000 | 96 |
246 // |----------------------|------------|--------------|-------|
247 // | 2-Byte Lead (+ 127) | `010` | `01011111` | 95 |
248 // | | | `01000000 | 64 |
249 // |----------------------|------------|--------------|-------|
250 // | Continuation (+ 127) | `00` | `00111111` | 63 |
251 // | | | `00000000 | 0 |
252 // |----------------------|------------|--------------|-------|
253 // | ASCII (+ 127) | `1` | `11111111` | -1 |
254 // | | | `10000000` | -128 |
255 // |----------------------|------------|--------------|-------|
256 //
257 // *Now* we can use signed `>` on all of them:
258 //
259 // ```
260 // prev1 = input.prev<1>
261 // prev2 = input.prev<2>
262 // prev3 = input.prev<3>
263 // prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128`
264 // prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128`
265 // prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128`
266 // is_second_byte = prev1_flipped > 63; // 2+-byte lead
267 // is_third_byte = prev2_flipped > 95; // 3+-byte lead
268 // is_fourth_byte = prev3_flipped > 111; // 4+-byte lead
269 // ```
270 //
271 // NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number
272 // of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3
273 // `^`'s at a time on Haswell, but only 2 `+`'s).
274 //
275 // That doesn't look like it saved us any instructions, did it? Well, because we're adding the
276 // same number to all of them, we can save one of those `+ 128` operations by assembling
277 // `prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128
278 // to it. One more instruction saved!
279 //
280 // ```
281 // prev1 = input.prev<1>
282 // prev3 = input.prev<3>
283 // prev1_flipped = prev1 ^ 0x80; // Same as `+ 128`
284 // prev3_flipped = prev3 ^ 0x80; // Same as `+ 128`
285 // prev2_flipped = prev1_flipped.concat<2>(prev3_flipped): // <shuffle: take the first 2 bytes from prev1 and the rest from prev3
286 // ```
287 //
288 // ### Bringing It All Together: Detecting the Errors
289 //
290 // At this point, we have `is_continuation`, `is_first_byte`, `is_second_byte` and `is_third_byte`.
291 // All we have left to do is check if they match!
292 //
293 // ```
294 // return (is_second_byte | is_third_byte | is_fourth_byte) ^ is_continuation;
295 // ```
296 //
297 // But wait--there's more. The above statement is only 3 operations, but they *cannot be done in
298 // parallel*. You have to do 2 `|`'s and then 1 `&`. Haswell, at least, has 3 ports that can do
299 // bitwise operations, and we're only using 1!
300 //
301 // Epilogue: Addition For Booleans
302 // -------------------------------
303 //
304 // There is one big case the above code doesn't explicitly talk about--what if is_second_byte
305 // and is_third_byte are BOTH true? That means there is a 3-byte and 2-byte character right next
306 // to each other (or any combination), and the continuation could be part of either of them!
307 // Our algorithm using `&` and `|` won't detect that the continuation byte is problematic.
308 //
309 // Never fear, though. If that situation occurs, we'll already have detected that the second
310 // leading byte was an error, because it was supposed to be a part of the preceding multibyte
311 // character, but it *wasn't a continuation*.
312 //
313 // We could stop here, but it turns out that we can fix it using `+` and `-` instead of `|` and
314 // `&`, which is both interesting and possibly useful (even though we're not using it here). It
315 // exploits the fact that in SIMD, a *true* value is -1, and a *false* value is 0. So those
316 // comparisons were giving us numbers!
317 //
318 // Given that, if you do `is_second_byte + is_third_byte + is_fourth_byte`, under normal
319 // circumstances you will either get 0 (0 + 0 + 0) or -1 (-1 + 0 + 0, etc.). Thus,
320 // `(is_second_byte + is_third_byte + is_fourth_byte) - is_continuation` will yield 0 only if
321 // *both* or *neither* are 0 (0-0 or -1 - -1). You'll get 1 or -1 if they are different. Because
322 // *any* nonzero value is treated as an error (not just -1), we're just fine here :)
323 //
324 // Further, if *more than one* multibyte character overlaps,
325 // `is_second_byte + is_third_byte + is_fourth_byte` will be -2 or -3! Subtracting `is_continuation`
326 // from *that* is guaranteed to give you a nonzero value (-1, -2 or -3). So it'll always be
327 // considered an error.
328 //
329 // One reason you might want to do this is parallelism. ^ and | are not associative, so
330 // (A | B | C) ^ D will always be three operations in a row: either you do A | B -> | C -> ^ D, or
331 // you do B | C -> | A -> ^ D. But addition and subtraction *are* associative: (A + B + C) - D can
332 // be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and
333 // then adds the result together. Same number of operations, but if the processor can run
334 // independent things in parallel (which most can), it runs faster.
335 //
336 // This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, | and ^ have
337 // a super nice advantage in that more of them can be run at the same time (they can run on 3
338 // ports, while + and - can run on 2)! This means that we can do A | B while we're still doing C,
339 // saving us the cycle we would have earned by using +. Even more, using an instruction with a
340 // wider array of ports can help *other* code run ahead, too, since these instructions can "get
341 // out of the way," running on a port other instructions can't.
342 //
343 // Epilogue II: One More Trick
344 // ---------------------------
345 //
346 // There's one more relevant trick up our sleeve, it turns out: it turns out on Intel we can "pay
347 // for" the (prev<1> + 128) instruction, because it can be used to save an instruction in
348 // check_special_cases()--but we'll talk about that there :)
349 //
350 really_inline simd8<uint8_t> check_multibyte_lengths(simd8<uint8_t> input, simd8<uint8_t> prev_input, simd8<uint8_t> prev1) {
351 simd8<uint8_t> prev2 = input.prev<2>(prev_input);
352 simd8<uint8_t> prev3 = input.prev<3>(prev_input);
353
354 // Cont is 10000000-101111111 (-65...-128)
355 simd8<bool> is_continuation = simd8<int8_t>(input) < int8_t(-64);
356 // must_be_continuation is architecture-specific because Intel doesn't have unsigned comparisons
357 return simd8<uint8_t>(must_be_continuation(prev1, prev2, prev3) ^ is_continuation);
358 }
359
360 //
361 // Return nonzero if there are incomplete multibyte characters at the end of the block:
362 // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
363 //
364 really_inline simd8<uint8_t> is_incomplete(simd8<uint8_t> input) {
365 // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
366 // ... 1111____ 111_____ 11______
367 static const uint8_t max_array[32] = {
368 255, 255, 255, 255, 255, 255, 255, 255,
369 255, 255, 255, 255, 255, 255, 255, 255,
370 255, 255, 255, 255, 255, 255, 255, 255,
371 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
372 };
373 const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
374 return input.gt_bits(max_value);
375 }
376
377 struct utf8_checker {
378 // If this is nonzero, there has been a UTF-8 error.
379 simd8<uint8_t> error;
380 // The last input we received
381 simd8<uint8_t> prev_input_block;
382 // Whether the last input we received was incomplete (used for ASCII fast path)
383 simd8<uint8_t> prev_incomplete;
384
385 //
386 // Check whether the current bytes are valid UTF-8.
387 //
388 really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
389 // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
390 // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
391 simd8<uint8_t> prev1 = input.prev<1>(prev_input);
392 this->error |= check_special_cases(input, prev1);
393 this->error |= check_multibyte_lengths(input, prev_input, prev1);
394 }
395
396 // The only problem that can happen at EOF is that a multibyte character is too short.
397 really_inline void check_eof() {
398 // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
399 // possibly finish them.
400 this->error |= this->prev_incomplete;
401 }
402
403 really_inline void check_next_input(simd8x64<uint8_t> input) {
404 if (likely(is_ascii(input))) {
405 // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
406 // possibly finish them.
407 this->error |= this->prev_incomplete;
408 } else {
409 this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
410 for (int i=1; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) {
411 this->check_utf8_bytes(input.chunks[i], input.chunks[i-1]);
412 }
413 this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
414 this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
415 }
416 }
417
418 really_inline ErrorValues errors() {
419 return this->error.any_bits_set_anywhere() ? simdjson::UTF8_ERROR : simdjson::SUCCESS;
420 }
421
422 }; // struct utf8_checker
423}
424
425using utf8_validation::utf8_checker;