| 1 | // |
| 2 | // Detect Unicode errors. |
| 3 | // |
| 4 | // UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic |
| 5 | // encoding that uses the first few bits on each byte to denote a "byte type", and all other bits |
| 6 | // are straight up concatenated into the final value. The first byte of a multibyte character is a |
| 7 | // "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte |
| 8 | // lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just |
| 9 | // start with 0, because that's what ASCII looks like. Here's what each size looks like: |
| 10 | // |
| 11 | // - ASCII (7 bits): 0_______ |
| 12 | // - 2 byte character (11 bits): 110_____ 10______ |
| 13 | // - 3 byte character (16 bits): 1110____ 10______ 10______ |
| 14 | // - 4 byte character (21 bits): 11110___ 10______ 10______ 10______ |
| 15 | // - 5+ byte character (illegal): 11111___ <illegal> |
| 16 | // |
| 17 | // There are 6 classes of error that can happen in Unicode: |
| 18 | // |
| 19 | // - TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation). |
| 20 | // We detect this by looking for new characters (lead bytes) inside the range of a multibyte |
| 21 | // character. |
| 22 | // |
| 23 | // e.g. 11000000 01100001 (2-byte character where second byte is ASCII) |
| 24 | // |
| 25 | // - TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation). |
| 26 | // We detect this by requiring that the next byte after your multibyte character be a new |
| 27 | // character--so a continuation after your character is wrong. |
| 28 | // |
| 29 | // e.g. 11011111 10111111 10111111 (2-byte character followed by *another* continuation byte) |
| 30 | // |
| 31 | // - TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large. |
| 32 | // |
| 33 | // e.g. 11110111 10111111 10111111 10111111 (bigger than 10FFFF). |
| 34 | // |
| 35 | // - OVERLONG: multibyte characters with a bunch of leading zeroes, where you could have |
| 36 | // used fewer bytes to make the same character. Like encoding an ASCII character in 4 bytes is |
| 37 | // technically possible, but UTF-8 disallows it so that there is only one way to write an "a". |
| 38 | // |
| 39 | // e.g. 11000001 10100001 (2-byte encoding of "a", which only requires 1 byte: 01100001) |
| 40 | // |
| 41 | // - SURROGATE: Unicode U+D800-U+DFFF are *surrogate* code points, reserved for UTF-16, which uses |
| 42 | // pairs of them to encode code points above U+FFFF (WTF-8 also permits them). These are illegal in pure UTF-8. |
| 43 | // |
| 44 | // e.g. 11101101 10100000 10000000 (U+D800) |
| 45 | // |
| 46 | // - INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not |
| 47 | // support values with more than 21 bits (which is what a 4-byte character can hold). |
| 48 | // |
| 49 | // e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000) |
| 50 | // |
| 51 | // Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94: |
| 52 | // |
| 53 | // Code Points 1st 2s 3s 4s |
| 54 | // U+0000..U+007F 00..7F |
| 55 | // U+0080..U+07FF C2..DF 80..BF |
| 56 | // U+0800..U+0FFF E0 A0..BF 80..BF |
| 57 | // U+1000..U+CFFF E1..EC 80..BF 80..BF |
| 58 | // U+D000..U+D7FF ED 80..9F 80..BF |
| 59 | // U+E000..U+FFFF EE..EF 80..BF 80..BF |
| 60 | // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF |
| 61 | // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF |
| 62 | // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF |
| 63 | // |
| 64 | using namespace simd; |
| 65 | |
| 66 | namespace utf8_validation { |
| 67 | |
| 68 | // |
| 69 | // Find special case UTF-8 errors where the character is technically readable (has the right length) |
| 70 | // but the *value* is disallowed. |
| 71 | // |
| 72 | // This includes overlong encodings, surrogates and values too large for Unicode. |
| 73 | // |
| 74 | // It turns out the bad character ranges can all be detected by looking at the first 12 bits of the |
| 75 | // UTF-8 encoded character (i.e. all of byte 1, and the high 4 bits of byte 2). This algorithm does |
| 76 | // three 4-bit table lookups, identifying which errors each nibble could match, and then &'s them together. |
| 77 | // If all three lookups flag the same error, it's an error. |
| 78 | // |
| 79 | really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) { |
| 80 | // |
| 81 | // These are the errors we're going to match for bytes 1-2, by looking at the first three |
| 82 | // nibbles of the character: <high bits of byte 1>> & <low bits of byte 1> & <high bits of byte 2> |
| 83 | // |
| 84 | static const int OVERLONG_2 = 0x01; // 1100000_ 10______ (technically we match 10______ but we could match ________, they both yield errors either way) |
| 85 | static const int OVERLONG_3 = 0x02; // 11100000 100_____ ________ |
| 86 | static const int OVERLONG_4 = 0x04; // 11110000 1000____ ________ ________ |
| 87 | static const int SURROGATE = 0x08; // 11101101 [101_]____ |
| 88 | static const int TOO_LARGE = 0x10; // 11110100 (1001|101_)____ |
| 89 | static const int TOO_LARGE_2 = 0x20; // 1111(1___|011_|0101) 10______ |
| 90 | |
| 91 | // After processing the rest of byte 1 (the low bits), we're still not done--we have to check |
| 92 | // byte 2 to be sure which things are errors and which aren't. |
| 93 | // Since high_bits is byte 5, byte 2 is high_bits.prev<3> |
| 94 | static const int CARRY = OVERLONG_2 | TOO_LARGE_2; |
| 95 | const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>( |
| 96 | // ASCII: ________ [0___]____ |
| 97 | CARRY, CARRY, CARRY, CARRY, |
| 98 | // ASCII: ________ [0___]____ |
| 99 | CARRY, CARRY, CARRY, CARRY, |
| 100 | // Continuations: ________ [10__]____ |
| 101 | CARRY | OVERLONG_3 | OVERLONG_4, // ________ [1000]____ |
| 102 | CARRY | OVERLONG_3 | TOO_LARGE, // ________ [1001]____ |
| 103 | CARRY | TOO_LARGE | SURROGATE, // ________ [1010]____ |
| 104 | CARRY | TOO_LARGE | SURROGATE, // ________ [1011]____ |
| 105 | // Multibyte Leads: ________ [11__]____ |
| 106 | CARRY, CARRY, CARRY, CARRY |
| 107 | ); |
| 108 | |
| 109 | const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>( |
| 110 | // [0___]____ (ASCII) |
| 111 | 0, 0, 0, 0, |
| 112 | 0, 0, 0, 0, |
| 113 | // [10__]____ (continuation) |
| 114 | 0, 0, 0, 0, |
| 115 | // [11__]____ (2+-byte leads) |
| 116 | OVERLONG_2, 0, // [110_]____ (2-byte lead) |
| 117 | OVERLONG_3 | SURROGATE, // [1110]____ (3-byte lead) |
| 118 | OVERLONG_4 | TOO_LARGE | TOO_LARGE_2 // [1111]____ (4+-byte lead) |
| 119 | ); |
| 120 | |
| 121 | const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>( |
| 122 | // ____[00__] ________ |
| 123 | OVERLONG_2 | OVERLONG_3 | OVERLONG_4, // ____[0000] ________ |
| 124 | OVERLONG_2, // ____[0001] ________ |
| 125 | 0, 0, |
| 126 | // ____[01__] ________ |
| 127 | TOO_LARGE, // ____[0100] ________ |
| 128 | TOO_LARGE_2, |
| 129 | TOO_LARGE_2, |
| 130 | TOO_LARGE_2, |
| 131 | // ____[10__] ________ |
| 132 | TOO_LARGE_2, TOO_LARGE_2, TOO_LARGE_2, TOO_LARGE_2, |
| 133 | // ____[11__] ________ |
| 134 | TOO_LARGE_2, |
| 135 | TOO_LARGE_2 | SURROGATE, // ____[1101] ________ |
| 136 | TOO_LARGE_2, TOO_LARGE_2 |
| 137 | ); |
| 138 | |
| 139 | return byte_1_high & byte_1_low & byte_2_high; |
| 140 | } |
| 141 | |
| 142 | // |
| 143 | // Validate the length of multibyte characters (that each multibyte character has the right number |
| 144 | // of continuation characters, and that all continuation characters are part of a multibyte |
| 145 | // character). |
| 146 | // |
| 147 | // Algorithm |
| 148 | // ========= |
| 149 | // |
| 150 | // This algorithm compares *expected* continuation characters with *actual* continuation bytes, |
| 151 | // and emits an error anytime there is a mismatch. |
| 152 | // |
| 153 | // For example, in the string "𝄞₿֏ab", which has 4-, 3-, 2- and 1-byte |
| 154 | // characters, the file will look like this: |
| 155 | // |
| 156 | // | Character | 𝄞 | | | | ₿ | | | ֏ | | a | b | |
| 157 | // |-----------------------|----|----|----|----|----|----|----|----|----|----|----| |
| 158 | // | Character Length | 4 | | | | 3 | | | 2 | | 1 | 1 | |
| 159 | // | Byte | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 | |
| 160 | // | is_second_byte | | X | | | | X | | | X | | | |
| 161 | // | is_third_byte | | | X | | | | X | | | | | |
| 162 | // | is_fourth_byte | | | | X | | | | | | | | |
| 163 | // | expected_continuation | | X | X | X | | X | X | | X | | | |
| 164 | // | is_continuation | | X | X | X | | X | X | | X | | | |
| 165 | // |
| 166 | // The errors here are basically (Second Byte OR Third Byte OR Fourth Byte != Continuation): |
| 167 | // |
| 168 | // - **Extra Continuations:** Any continuation that is not a second, third or fourth byte is not |
| 169 | // part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just |
| 170 | // floating around extra outside of any character, or that there is an illegal 5-byte character, |
| 171 | // or maybe it's at the beginning of the file before any characters have started; but it's an |
| 172 | // error in all these cases. |
| 173 | // - **Missing Continuations:** Any second, third or fourth byte that *isn't* a continuation is an error, because that means |
| 174 | // we started a new character before we were finished with the current one. |
| 175 | // |
| 176 | // Getting the Previous Bytes |
| 177 | // -------------------------- |
| 178 | // |
| 179 | // Because we want to know if a byte is the *second* (or third, or fourth) byte of a multibyte |
| 180 | // character, we need to "shift the bytes" to find that out. This is what they mean: |
| 181 | // |
| 182 | // - `is_continuation`: if the current byte is a continuation. |
| 183 | // - `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character. |
| 184 | // - `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character. |
| 185 | // - `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character. |
| 186 | // |
| 187 | // We use shuffles to go n bytes back, selecting part of the current `input` and part of the |
| 188 | // `prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller |
| 189 | // function, because the 1-byte-back data is used by other checks as well. |
| 190 | // |
| 191 | // Getting the Continuation Mask |
| 192 | // ----------------------------- |
| 193 | // |
| 194 | // Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as |
| 195 | // numbers, using signed `<` and `>` operations to check if they are continuations or leads. |
| 196 | // In fact, we treat the numbers as *signed*, partly because it helps us, and partly because |
| 197 | // Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones). |
| 198 | // |
| 199 | // In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads," |
| 200 | // respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them. |
| 201 | // Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0. |
| 202 | // |
| 203 | // When treated as signed numbers, they look like this: |
| 204 | // |
| 205 | // | Type | High Bits | Binary Range | Signed | |
| 206 | // |--------------|------------|--------------|--------| |
| 207 | // | ASCII | `0` | `01111111` | 127 | |
| 208 | // | | | `00000000` | 0 | |
| 209 | // | 4+-Byte Lead | `1111` | `11111111` | -1 | |
| 210 | // | | | `11110000` | -16 | |
| 211 | // | 3-Byte Lead | `1110` | `11101111` | -17 | |
| 212 | // | | | `11100000` | -32 | |
| 213 | // | 2-Byte Lead | `110` | `11011111` | -33 | |
| 214 | // | | | `11000000` | -64 | |
| 215 | // | Continuation | `10` | `10111111` | -65 | |
| 216 | // | | | `10000000` | -128 | |
| 217 | // |
| 218 | // This makes it pretty easy to get the continuation mask! It's just a single comparison: |
| 219 | // |
| 220 | // ``` |
| 221 | // is_continuation = input < -64 |
| 222 | // ``` |
| 223 | // |
| 224 | // We can do something similar for the others, but it takes two comparisons instead of one: "is |
| 225 | // a 2-byte lead" is `< -32` and `> -65`, for example. And "is a 2+-byte lead" is `< 0` and |
| 226 | // `> -65`. Surely we can do better, they're right next to each other! |
| 227 | // |
| 228 | // Getting the is_xxx Masks: Shifting the Range |
| 229 | // -------------------------------------------- |
| 230 | // |
| 231 | // Notice *why* continuations were a single comparison. The actual *range* would require two |
| 232 | // comparisons--`< -64` and `> -129`--but all bytes are always greater than -129, so we get |
| 233 | // that for free. In fact, if we had *unsigned* comparisons, 2+, 3+ and 4+ comparisons would be |
| 234 | // just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`. |
| 235 | // |
| 236 | // Instead, we add 128 to each byte, shifting the range up to make comparison easy. This wraps |
| 237 | // ASCII down into the negative, and puts 4+-Byte Lead at the top: |
| 238 | // |
| 239 | // | Type | High Bits | Binary Range | Signed | |
| 240 | // |----------------------|------------|--------------|-------| |
| 241 | // | 4+-Byte Lead (+ 128) | `0111` | `01111111` | 127 | |
| 242 | // | | | `01110000` | 112 | |
| 243 | // |----------------------|------------|--------------|-------| |
| 244 | // | 3-Byte Lead (+ 128) | `0110` | `01101111` | 111 | |
| 245 | // | | | `01100000` | 96 | |
| 246 | // |----------------------|------------|--------------|-------| |
| 247 | // | 2-Byte Lead (+ 128) | `010` | `01011111` | 95 | |
| 248 | // | | | `01000000` | 64 | |
| 249 | // |----------------------|------------|--------------|-------| |
| 250 | // | Continuation (+ 128) | `00` | `00111111` | 63 | |
| 251 | // | | | `00000000` | 0 | |
| 252 | // |----------------------|------------|--------------|-------| |
| 253 | // | ASCII (+ 128) | `1` | `11111111` | -1 | |
| 254 | // | | | `10000000` | -128 | |
| 255 | // |----------------------|------------|--------------|-------| |
| 256 | // |
| 257 | // *Now* we can use signed `>` on all of them: |
| 258 | // |
| 259 | // ``` |
| 260 | // prev1 = input.prev<1> |
| 261 | // prev2 = input.prev<2> |
| 262 | // prev3 = input.prev<3> |
| 263 | // prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128` |
| 264 | // prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128` |
| 265 | // prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128` |
| 266 | // is_second_byte = prev1_flipped > 63; // 2+-byte lead |
| 267 | // is_third_byte = prev2_flipped > 95; // 3+-byte lead |
| 268 | // is_fourth_byte = prev3_flipped > 111; // 4+-byte lead |
| 269 | // ``` |
| 270 | // |
| 271 | // NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number |
| 272 | // of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3 |
| 273 | // `^`'s at a time on Haswell, but only 2 `+`'s). |
| 274 | // |
| 275 | // That doesn't look like it saved us any instructions, did it? Well, because we're adding the |
| 276 | // same number to all of them, we can save one of those `+ 128` operations by assembling |
| 277 | // `prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128 |
| 278 | // to it. One more instruction saved! |
| 279 | // |
| 280 | // ``` |
| 281 | // prev1 = input.prev<1> |
| 282 | // prev3 = input.prev<3> |
| 283 | // prev1_flipped = prev1 ^ 0x80; // Same as `+ 128` |
| 284 | // prev3_flipped = prev3 ^ 0x80; // Same as `+ 128` |
| 285 | // prev2_flipped = prev1_flipped.concat<2>(prev3_flipped); // shuffle: take the first 2 bytes from prev1 and the rest from prev3 |
| 286 | // ``` |
| 287 | // |
| 288 | // ### Bringing It All Together: Detecting the Errors |
| 289 | // |
| 290 | // At this point, we have `is_continuation`, `is_second_byte`, `is_third_byte` and `is_fourth_byte`. |
| 291 | // All we have left to do is check if they match! |
| 292 | // |
| 293 | // ``` |
| 294 | // return (is_second_byte | is_third_byte | is_fourth_byte) ^ is_continuation; |
| 295 | // ``` |
| 296 | // |
| 297 | // But wait--there's more. The above statement is only 3 operations, but they *cannot be done in |
| 298 | // parallel*. You have to do 2 `|`'s and then 1 `^`. Haswell, at least, has 3 ports that can do |
| 299 | // bitwise operations, and we're only using 1! |
| 300 | // |
| 301 | // Epilogue: Addition For Booleans |
| 302 | // ------------------------------- |
| 303 | // |
| 304 | // There is one big case the above code doesn't explicitly talk about--what if is_second_byte |
| 305 | // and is_third_byte are BOTH true? That means there is a 3-byte and 2-byte character right next |
| 306 | // to each other (or any combination), and the continuation could be part of either of them! |
| 307 | // Our algorithm using `^` and `|` won't detect that the continuation byte is problematic. |
| 308 | // |
| 309 | // Never fear, though. If that situation occurs, we'll already have detected that the second |
| 310 | // leading byte was an error, because it was supposed to be a part of the preceding multibyte |
| 311 | // character, but it *wasn't a continuation*. |
| 312 | // |
| 313 | // We could stop here, but it turns out that we can fix it using `+` and `-` instead of `|` and |
| 314 | // `^`, which is both interesting and possibly useful (even though we're not using it here). It |
| 315 | // exploits the fact that in SIMD, a *true* value is -1, and a *false* value is 0. So those |
| 316 | // comparisons were giving us numbers! |
| 317 | // |
| 318 | // Given that, if you do `is_second_byte + is_third_byte + is_fourth_byte`, under normal |
| 319 | // circumstances you will either get 0 (0 + 0 + 0) or -1 (-1 + 0 + 0, etc.). Thus, |
| 320 | // `(is_second_byte + is_third_byte + is_fourth_byte) - is_continuation` will yield 0 only if |
| 321 | // *both* or *neither* are 0 (0-0 or -1 - -1). You'll get 1 or -1 if they are different. Because |
| 322 | // *any* nonzero value is treated as an error (not just -1), we're just fine here :) |
| 323 | // |
| 324 | // Further, if *more than one* multibyte character overlaps, |
| 325 | // `is_second_byte + is_third_byte + is_fourth_byte` will be -2 or -3! Subtracting `is_continuation` |
| 326 | // from *that* is guaranteed to give you a nonzero value (-1, -2 or -3). So it'll always be |
| 327 | // considered an error. |
| 328 | // |
| 329 | // One reason you might want to do this is parallelism. ^ and | cannot be regrouped across each other, so |
| 330 | // (A | B | C) ^ D will always be three operations in a row: either you do A | B -> | C -> ^ D, or |
| 331 | // you do B | C -> | A -> ^ D. But a chain of additions and subtractions *can* be regrouped: (A + B + C) - D can |
| 332 | // be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and |
| 333 | // then add the results together. Same number of operations, but if the processor can run |
| 334 | // independent things in parallel (which most can), it runs faster. |
| 335 | // |
| 336 | // This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, | and ^ have |
| 337 | // a super nice advantage in that more of them can be run at the same time (they can run on 3 |
| 338 | // ports, while + and - can run on 2)! This means that we can do A | B while we're still doing C, |
| 339 | // saving us the cycle we would have earned by using +. Even more, using an instruction with a |
| 340 | // wider array of ports can help *other* code run ahead, too, since these instructions can "get |
| 341 | // out of the way," running on a port other instructions can't. |
| 342 | // |
| 343 | // Epilogue II: One More Trick |
| 344 | // --------------------------- |
| 345 | // |
| 346 | // There's one more relevant trick up our sleeve: it turns out that on Intel we can "pay |
| 347 | // for" the (prev<1> + 128) instruction, because it can be used to save an instruction in |
| 348 | // check_special_cases()--but we'll talk about that there :) |
| 349 | // |
| 350 | really_inline simd8<uint8_t> check_multibyte_lengths(simd8<uint8_t> input, simd8<uint8_t> prev_input, simd8<uint8_t> prev1) { |
| 351 | simd8<uint8_t> prev2 = input.prev<2>(prev_input); |
| 352 | simd8<uint8_t> prev3 = input.prev<3>(prev_input); |
| 353 | |
| 354 | // Cont is 10000000-101111111 (-65...-128) |
| 355 | simd8<bool> is_continuation = simd8<int8_t>(input) < int8_t(-64); |
| 356 | // must_be_continuation is architecture-specific because Intel doesn't have unsigned comparisons |
| 357 | return simd8<uint8_t>(must_be_continuation(prev1, prev2, prev3) ^ is_continuation); |
| 358 | } |
| 359 | |
| 360 | // |
| 361 | // Return nonzero if there are incomplete multibyte characters at the end of the block: |
| 362 | // e.g. if there is a 4-byte character, but it's 3 bytes from the end. |
| 363 | // |
| 364 | really_inline simd8<uint8_t> is_incomplete(simd8<uint8_t> input) { |
| 365 | // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): |
| 366 | // ... 1111____ 111_____ 11______ |
| 367 | static const uint8_t max_array[32] = { |
| 368 | 255, 255, 255, 255, 255, 255, 255, 255, |
| 369 | 255, 255, 255, 255, 255, 255, 255, 255, |
| 370 | 255, 255, 255, 255, 255, 255, 255, 255, |
| 371 | 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 |
| 372 | }; |
| 373 | const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]); |
| 374 | return input.gt_bits(max_value); |
| 375 | } |
| 376 | |
| 377 | struct utf8_checker { |
| 378 | // If this is nonzero, there has been a UTF-8 error. |
| 379 | simd8<uint8_t> error; |
| 380 | // The last input we received |
| 381 | simd8<uint8_t> prev_input_block; |
| 382 | // Whether the last input we received was incomplete (used for ASCII fast path) |
| 383 | simd8<uint8_t> prev_incomplete; |
| 384 | |
| 385 | // |
| 386 | // Check whether the current bytes are valid UTF-8. |
| 387 | // |
| 388 | really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) { |
| 389 | // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes |
| 390 | // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) |
| 391 | simd8<uint8_t> prev1 = input.prev<1>(prev_input); |
| 392 | this->error |= check_special_cases(input, prev1); |
| 393 | this->error |= check_multibyte_lengths(input, prev_input, prev1); |
| 394 | } |
| 395 | |
| 396 | // The only problem that can happen at EOF is that a multibyte character is too short. |
| 397 | really_inline void check_eof() { |
| 398 | // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't |
| 399 | // possibly finish them. |
| 400 | this->error |= this->prev_incomplete; |
| 401 | } |
| 402 | |
| 403 | really_inline void check_next_input(simd8x64<uint8_t> input) { |
| 404 | if (likely(is_ascii(input))) { |
| 405 | // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't |
| 406 | // possibly finish them. |
| 407 | this->error |= this->prev_incomplete; |
| 408 | } else { |
| 409 | this->check_utf8_bytes(input.chunks[0], this->prev_input_block); |
| 410 | for (int i=1; i<simd8x64<uint8_t>::NUM_CHUNKS; i++) { |
| 411 | this->check_utf8_bytes(input.chunks[i], input.chunks[i-1]); |
| 412 | } |
| 413 | this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]); |
| 414 | this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]; |
| 415 | } |
| 416 | } |
| 417 | |
| 418 | really_inline ErrorValues errors() { |
| 419 | return this->error.any_bits_set_anywhere() ? simdjson::UTF8_ERROR : simdjson::SUCCESS; |
| 420 | } |
| 421 | |
| 422 | }; // struct utf8_checker |
| 423 | } |
| 424 | |
| 425 | using utf8_validation::utf8_checker; |