/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Truffle: matches bytes against an arbitrary character class using three
 * shuffles.
 */


#include "ue2common.h"
#include "truffle.h"
#include "util/arch.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"

#if !defined(HAVE_AVX2)

static really_inline
const u8 *lastMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffff)) {
        u32 pos = clz32(~z & 0xffff);
        assert(pos >= 16 && pos < 32);
        return buf + (31 - pos);
    }

    return NULL; // no match
}

static really_inline
const u8 *firstMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffff)) {
        u32 pos = ctz32(~z & 0xffff);
        assert(pos < 16);
        return buf + pos;
    }

    return NULL; // no match
}
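
/*
 * Illustrative sketch (guarded by the hypothetical TRUFFLE_REFERENCE_SKETCH
 * macro; not part of the normal build): the z masks produced by block() below
 * use an inverted convention -- a set bit means "no match" at that byte
 * position, a clear bit means "match". This shows how firstMatch() and
 * lastMatch() recover positions from such a mask.
 */
#ifdef TRUFFLE_REFERENCE_SKETCH
static void truffleMaskConventionSketch(void) {
    const u8 buf[16] = { 0 };
    // Suppose bytes 3 and 9 matched: clear those bits, leave the rest set.
    u32 z = 0xffff & ~((1U << 3) | (1U << 9));
    assert(firstMatch(buf, z) == buf + 3);
    assert(lastMatch(buf, z) == buf + 9);
    // An all-ones mask means no byte matched anywhere.
    assert(firstMatch(buf, 0xffff) == NULL);
}
#endif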

static really_inline
u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) {

    m128 highconst = _mm_set1_epi8(0x80);
    // Repeated power-of-two table: byte i of each 64-bit lane holds 1 << i.
    m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201);

    // Lookup 1: index the highclear table with the low nibble; pshufb returns
    // zero for any byte of v that has its high bit set.
    m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v);
    // Lookup 2: flip the high bit so the highset table is consulted for bytes
    // >= 0x80 and zeroed for bytes < 0x80.
    m128 t1 = xor128(v, highconst);
    m128 shuf2 = pshufb_m128(shuf_mask_lo_highset, t1);
    // Lookup 3: bits 4..6 of each byte select which bit of the chosen table
    // entry must be set.
    m128 t2 = andnot128(highconst, rshift64_m128(v, 4));
    m128 shuf3 = pshufb_m128(shuf_mask_hi, t2);
    // Non-zero where the byte is in the class.
    m128 tmp = and128(or128(shuf1, shuf2), shuf3);
    // z gets a bit set for every byte position that does NOT match.
    m128 tmp2 = eq128(tmp, zeroes128());
    u32 z = movemask128(tmp2);

    return z;
}
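
/*
 * Illustrative scalar model of the classification that block() performs per
 * byte (guarded by the hypothetical TRUFFLE_REFERENCE_SKETCH macro; not part
 * of the normal build). The low nibble of each byte indexes one of the two
 * 16-byte tables -- shuf_mask_lo_highclear for bytes < 0x80,
 * shuf_mask_lo_highset for bytes >= 0x80 -- and bits 4..6 of the byte select
 * which bit of that entry must be set for the byte to belong to the class.
 */
#ifdef TRUFFLE_REFERENCE_SKETCH
static int truffleClassHasByteSketch(const u8 *lo_highclear,
                                     const u8 *lo_highset, u8 c) {
    const u8 *table = (c & 0x80) ? lo_highset : lo_highclear;
    u8 bit = (u8)(1U << ((c >> 4) & 0x7));
    return (table[c & 0xf] & bit) != 0;
}
#endif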

static
const u8 *truffleMini(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 16);

    m128 chars = zeroes128();
    memcpy(&chars, buf, len);

    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    // Force the positions beyond the valid length to "no match" (a set bit in
    // z means no match at that byte).
    u32 mask = (0xffff >> (16 - len)) ^ 0xffff;
    const u8 *rv = firstMatch(buf, z | mask);

    if (rv) {
        return rv;
    } else {
        return buf_end;
    }
}
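
/*
 * Worked example of the tail mask above (guarded by the hypothetical
 * TRUFFLE_REFERENCE_SKETCH macro; not part of the normal build): for a 5-byte
 * tail, the mask has bits 5..15 set, so ORing it into z reports "no match"
 * for the positions beyond buf_end.
 */
#ifdef TRUFFLE_REFERENCE_SKETCH
static void truffleTailMaskSketch(void) {
    uintptr_t len = 5;
    u32 mask = (0xffff >> (16 - len)) ^ 0xffff;
    assert(mask == 0xffe0);
}
#endif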

static really_inline
const u8 *fwdBlock(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                   m128 v, const u8 *buf) {
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return firstMatch(buf, z);
}

static really_inline
const u8 *revBlock(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                   m128 v, const u8 *buf) {
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return lastMatch(buf, z);
}

const u8 *truffleExec(m128 shuf_mask_lo_highclear,
                      m128 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    DEBUG_PRINTF("len %zu\n", buf_end - buf);

    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    if (buf_end - buf < 16) {
        return truffleMini(shuf_mask_lo_highclear, shuf_mask_lo_highset, buf,
                           buf_end);
    }

    size_t min = (size_t)buf % 16; // bytes by which buf is misaligned
    assert(buf_end - buf >= 16);

    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf);
    rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, buf);
    if (rv) {
        return rv;
    }
    // Step to the next 16-byte boundary; the bytes in between were already
    // scanned by the unaligned block above.
    buf += (16 - min);

    const u8 *last_block = buf_end - 16;
    while (buf < last_block) {
        m128 lchars = load128(buf);
        rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, lchars,
                      buf);
        if (rv) {
            return rv;
        }
        buf += 16;
    }

    // Use an unaligned load to mop up the last 16 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 16);
    chars = loadu128(buf_end - 16);
    rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars,
                  buf_end - 16);
    if (rv) {
        return rv;
    }

    return buf_end;
}
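
/*
 * The two low-nibble tables consumed above are built on the compile side,
 * outside this file. The hypothetical helper below (guarded by
 * TRUFFLE_REFERENCE_SKETCH; not part of the normal build) sketches the
 * encoding that block() expects: for every byte c in the class, set bit
 * ((c >> 4) & 0x7) of entry (c & 0xf) in the table selected by the byte's
 * high bit.
 */
#ifdef TRUFFLE_REFERENCE_SKETCH
static void truffleBuildMasksSketch(const u8 *class_bytes, size_t count,
                                    u8 *lo_highclear /* 16 bytes, zeroed */,
                                    u8 *lo_highset /* 16 bytes, zeroed */) {
    for (size_t i = 0; i < count; i++) {
        u8 c = class_bytes[i];
        u8 *table = (c & 0x80) ? lo_highset : lo_highclear;
        table[c & 0xf] |= (u8)(1U << ((c >> 4) & 0x7));
    }
}
#endif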

static
const u8 *truffleRevMini(m128 shuf_mask_lo_highclear,
                         m128 shuf_mask_lo_highset, const u8 *buf,
                         const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 16);

    m128 chars = zeroes128();
    memcpy(&chars, buf, len);

    u32 mask = (0xffff >> (16 - len)) ^ 0xffff;
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    const u8 *rv = lastMatch(buf, z | mask);

    if (rv) {
        return rv;
    }
    return buf - 1;
}

const u8 *rtruffleExec(m128 shuf_mask_lo_highclear,
                       m128 shuf_mask_lo_highset,
                       const u8 *buf, const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    DEBUG_PRINTF("len %zu\n", buf_end - buf);

    if (buf_end - buf < 16) {
        return truffleRevMini(shuf_mask_lo_highclear, shuf_mask_lo_highset, buf,
                              buf_end);
    }

    assert(buf_end - buf >= 16);

    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf_end - 16);
    rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars,
                  buf_end - 16);
    if (rv) {
        return rv;
    }
    // Round buf_end down to a 16-byte boundary so the main loop can use
    // aligned loads.
    buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0xf));

    const u8 *last_block = buf + 16;
    while (buf_end > last_block) {
        buf_end -= 16;
        m128 lchars = load128(buf_end);
        rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, lchars,
                      buf_end);
        if (rv) {
            return rv;
        }
    }

    // Use an unaligned load to mop up the first 16 bytes and get an accurate
    // picture down to buf.
    chars = loadu128(buf);
    rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, buf);
    if (rv) {
        return rv;
    }

    return buf - 1; // no match: reverse scans return one before the start
}

#elif !defined(HAVE_AVX512)

// AVX2

static really_inline
const u8 *lastMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffffffff)) {
        u32 pos = clz32(~z);
        assert(pos < 32);
        return buf + (31 - pos);
    }

    return NULL; // no match
}

static really_inline
const u8 *firstMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffffffff)) {
        u32 pos = ctz32(~z);
        assert(pos < 32);
        return buf + pos;
    }

    return NULL; // no match
}

static really_inline
u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) {

    m256 highconst = _mm256_set1_epi8(0x80);
    m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201);

    // Same three-shuffle scheme as the 128-bit block(); the tables are
    // duplicated across both 128-bit lanes, so the in-lane pshufb is fine.
    m256 shuf1 = pshufb_m256(shuf_mask_lo_highclear, v);
    m256 t1 = xor256(v, highconst);
    m256 shuf2 = pshufb_m256(shuf_mask_lo_highset, t1);
    m256 t2 = andnot256(highconst, rshift64_m256(v, 4));
    m256 shuf3 = pshufb_m256(shuf_mask_hi, t2);
    m256 tmp = and256(or256(shuf1, shuf2), shuf3);
    m256 tmp2 = eq256(tmp, zeroes256());
    u32 z = movemask256(tmp2);

    return z;
}

static
const u8 *truffleMini(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 32);

    m256 chars = zeroes256();
    memcpy(&chars, buf, len);

    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    // Force the positions beyond the valid length to "no match" (a set bit in
    // z means no match at that byte).
    u32 mask = (0xffffffff >> (32 - len)) ^ 0xffffffff;
    const u8 *rv = firstMatch(buf, z | mask);

    if (rv) {
        return rv;
    } else {
        return buf_end;
    }
}

static really_inline
const u8 *fwdBlock(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset,
                   m256 v, const u8 *buf) {
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return firstMatch(buf, z);
}

static really_inline
const u8 *revBlock(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset,
                   m256 v, const u8 *buf) {
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return lastMatch(buf, z);
}

const u8 *truffleExec(m128 shuf_mask_lo_highclear,
                      m128 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    DEBUG_PRINTF("len %zu\n", buf_end - buf);
    const m256 wide_clear = set2x128(shuf_mask_lo_highclear);
    const m256 wide_set = set2x128(shuf_mask_lo_highset);

    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    if (buf_end - buf < 32) {
        return truffleMini(wide_clear, wide_set, buf, buf_end);
    }

    size_t min = (size_t)buf % 32; // bytes by which buf is misaligned
    assert(buf_end - buf >= 32);

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf);
    rv = fwdBlock(wide_clear, wide_set, chars, buf);
    if (rv) {
        return rv;
    }
    // Step to the next 32-byte boundary; the bytes in between were already
    // scanned by the unaligned block above.
    buf += (32 - min);

    const u8 *last_block = buf_end - 32;
    while (buf < last_block) {
        m256 lchars = load256(buf);
        rv = fwdBlock(wide_clear, wide_set, lchars, buf);
        if (rv) {
            return rv;
        }
        buf += 32;
    }

    // Use an unaligned load to mop up the last 32 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 32);
    chars = loadu256(buf_end - 32);
    rv = fwdBlock(wide_clear, wide_set, chars, buf_end - 32);
    if (rv) {
        return rv;
    }
    return buf_end;
}

static
const u8 *truffleRevMini(m256 shuf_mask_lo_highclear,
                         m256 shuf_mask_lo_highset, const u8 *buf,
                         const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 32);

    m256 chars = zeroes256();
    memcpy(&chars, buf, len);

    u32 mask = (0xffffffff >> (32 - len)) ^ 0xffffffff;
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    const u8 *rv = lastMatch(buf, z | mask);

    if (rv) {
        return rv;
    }
    return buf - 1;
}


const u8 *rtruffleExec(m128 shuf_mask_lo_highclear,
                       m128 shuf_mask_lo_highset,
                       const u8 *buf, const u8 *buf_end) {
    const m256 wide_clear = set2x128(shuf_mask_lo_highclear);
    const m256 wide_set = set2x128(shuf_mask_lo_highset);
    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    DEBUG_PRINTF("len %zu\n", buf_end - buf);

    if (buf_end - buf < 32) {
        return truffleRevMini(wide_clear, wide_set, buf, buf_end);
    }

    assert(buf_end - buf >= 32);

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf_end - 32);
    rv = revBlock(wide_clear, wide_set, chars,
                  buf_end - 32);
    if (rv) {
        return rv;
    }
    // Round buf_end down to a 32-byte boundary so the main loop can use
    // aligned loads.
    buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0x1f));

    const u8 *last_block = buf + 32;
    while (buf_end > last_block) {
        buf_end -= 32;
        m256 lchars = load256(buf_end);
        rv = revBlock(wide_clear, wide_set, lchars, buf_end);
        if (rv) {
            return rv;
        }
    }

    // Use an unaligned load to mop up the first 32 bytes and get an accurate
    // picture down to buf.
    chars = loadu256(buf);
    rv = revBlock(wide_clear, wide_set, chars, buf);
    if (rv) {
        return rv;
    }
    return buf - 1;
}

#else // AVX512

static really_inline
const u8 *lastMatch(const u8 *buf, u64a z) {
    if (unlikely(z != ~0ULL)) {
        u64a pos = clz64(~z);
        assert(pos < 64);
        return buf + (63 - pos);
    }

    return NULL; // no match
}

static really_inline
const u8 *firstMatch(const u8 *buf, u64a z) {
    if (unlikely(z != ~0ULL)) {
        u64a pos = ctz64(~z);
        assert(pos < 64);
        DEBUG_PRINTF("pos %llu\n", pos);
        return buf + pos;
    }

    return NULL; // no match
}

static really_inline
u64a block(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, m512 v) {
    m512 highconst = set64x8(0x80);
    m512 shuf_mask_hi = set8x64(0x8040201008040201);

    // Same three-shuffle scheme as the narrower variants; eq512mask yields
    // the 64-bit "no match" mask directly, so no movemask step is needed.
    m512 shuf1 = pshufb_m512(shuf_mask_lo_highclear, v);
    m512 t1 = xor512(v, highconst);
    m512 shuf2 = pshufb_m512(shuf_mask_lo_highset, t1);
    m512 t2 = andnot512(highconst, rshift64_m512(v, 4));
    m512 shuf3 = pshufb_m512(shuf_mask_hi, t2);
    m512 tmp = and512(or512(shuf1, shuf2), shuf3);
    u64a z = eq512mask(tmp, zeroes512());

    return z;
}

static really_inline
const u8 *truffleMini(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len <= 64);

    // Enable only the first len lanes; the masked load zeroes the rest and
    // ORing ~mask into z forces them to report "no match".
    __mmask64 mask = (~0ULL) >> (64 - len);

    m512 chars = loadu_maskz_m512(mask, buf);

    u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);

    const u8 *rv = firstMatch(buf, z | ~mask);

    return rv;
}

static really_inline
const u8 *fwdBlock(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset,
                   m512 v, const u8 *buf) {
    u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return firstMatch(buf, z);
}

static really_inline
const u8 *revBlock(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset,
                   m512 v, const u8 *buf) {
    u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return lastMatch(buf, z);
}

const u8 *truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    DEBUG_PRINTF("len %zu\n", buf_end - buf);
    const m512 wide_clear = set4x128(shuf_mask_lo_highclear);
    const m512 wide_set = set4x128(shuf_mask_lo_highset);

    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    if (buf_end - buf <= 64) {
        rv = truffleMini(wide_clear, wide_set, buf, buf_end);
        return rv ? rv : buf_end;
    }

    assert(buf_end - buf >= 64);
    if ((uintptr_t)buf % 64) {
        // Preconditioning: most of the time our buffer won't be aligned.
        rv = truffleMini(wide_clear, wide_set, buf, ROUNDUP_PTR(buf, 64));
        if (rv) {
            return rv;
        }
        buf = ROUNDUP_PTR(buf, 64);
    }
    const u8 *last_block = buf_end - 64;
    while (buf < last_block) {
        m512 lchars = load512(buf);
        rv = fwdBlock(wide_clear, wide_set, lchars, buf);
        if (rv) {
            return rv;
        }
        buf += 64;
    }

    // Use an unaligned load to mop up the last 64 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 64);
    m512 chars = loadu512(buf_end - 64);
    rv = fwdBlock(wide_clear, wide_set, chars, buf_end - 64);
    if (rv) {
        return rv;
    }
    return buf_end;
}

static really_inline
const u8 *truffleRevMini(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset,
                         const u8 *buf, const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 64);

    __mmask64 mask = (~0ULL) >> (64 - len);
    m512 chars = loadu_maskz_m512(mask, buf);
    u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    DEBUG_PRINTF("mask 0x%016llx z 0x%016llx\n", mask, z);
    const u8 *rv = lastMatch(buf, z | ~mask);

    if (rv) {
        return rv;
    }
    return buf - 1;
}

const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                       const u8 *buf, const u8 *buf_end) {
    const m512 wide_clear = set4x128(shuf_mask_lo_highclear);
    const m512 wide_set = set4x128(shuf_mask_lo_highset);
    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    DEBUG_PRINTF("len %zu\n", buf_end - buf);

    if (buf_end - buf < 64) {
        return truffleRevMini(wide_clear, wide_set, buf, buf_end);
    }

    assert(buf_end - buf >= 64);

    // Preconditioning: most of the time our buffer won't be aligned.
    m512 chars = loadu512(buf_end - 64);
    rv = revBlock(wide_clear, wide_set, chars, buf_end - 64);
    if (rv) {
        return rv;
    }
    buf_end = (const u8 *)ROUNDDOWN_N((uintptr_t)buf_end, 64);

    const u8 *last_block = buf + 64;
    while (buf_end > last_block) {
        buf_end -= 64;
        m512 lchars = load512(buf_end);
        rv = revBlock(wide_clear, wide_set, lchars, buf_end);
        if (rv) {
            return rv;
        }
    }

    // Use an unaligned load to mop up the first 64 bytes and get an accurate
    // picture down to buf.
    chars = loadu512(buf);
    rv = revBlock(wide_clear, wide_set, chars, buf);
    if (rv) {
        return rv;
    }
    return buf - 1;
}

#endif
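
/*
 * Usage sketch (guarded by the hypothetical TRUFFLE_REFERENCE_SKETCH macro;
 * not part of the normal build): the two 16-byte tables are loaded into m128
 * values and handed to truffleExec(), which returns a pointer to the first
 * byte of the class or buf_end if the class does not occur in the buffer.
 */
#ifdef TRUFFLE_REFERENCE_SKETCH
static const u8 *truffleUsageSketch(const u8 *lo_highclear /* 16 bytes */,
                                    const u8 *lo_highset /* 16 bytes */,
                                    const u8 *data, size_t len) {
    assert(len > 0); // truffleExec requires a non-empty buffer
    m128 mask_clear = loadu128(lo_highclear);
    m128 mask_set = loadu128(lo_highset);
    const u8 *hit = truffleExec(mask_clear, mask_set, data, data + len);
    return hit == data + len ? NULL : hit; // buf_end return means "no match"
}
#endif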