/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Matches a byte in a charclass using three shuffles
 */
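
/*
 * How the three shuffles decode class membership (informal sketch, reading
 * the two m128 masks as plain 16-byte tables for illustration):
 *
 * shuf_mask_lo_highclear covers bytes 0x00-0x7f and shuf_mask_lo_highset
 * covers bytes 0x80-0xff. Each table is indexed by a byte's low nibble, and
 * each entry is a bitmask over bits 4-6 of the byte, so per byte c the test
 * is roughly:
 *
 *     u8 entry = (c & 0x80) ? shuf_mask_lo_highset[c & 0xf]
 *                           : shuf_mask_lo_highclear[c & 0xf];
 *     int in_class = entry & (1 << ((c >> 4) & 0x7));
 *
 * block() evaluates this for every byte of a vector at once: two PSHUFBs
 * select 'entry' (PSHUFB zeroes any lane whose top index bit is set, which
 * is what makes the high-bit split work), a third PSHUFB against
 * 0x8040201008040201 produces the '1 << ((c >> 4) & 7)' bit, and the final
 * AND plus compare-with-zero yield a mask with a set bit for every byte
 * that is NOT in the class. The masks themselves are built at pattern
 * compile time elsewhere.
 */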

#include "ue2common.h"
#include "truffle.h"
#include "util/arch.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"

#if !defined(HAVE_AVX2)

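/*
 * In the mask produced by block(), a set bit means "this byte is not in the
 * class". firstMatch()/lastMatch() look for the first/last clear bit among
 * the low 16 bits and return a pointer to the corresponding byte, or NULL
 * if every position is a non-match.
 */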
static really_inline
const u8 *lastMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffff)) {
        u32 pos = clz32(~z & 0xffff);
        assert(pos >= 16 && pos < 32);
        return buf + (31 - pos);
    }

    return NULL; // no match
}

static really_inline
const u8 *firstMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffff)) {
        u32 pos = ctz32(~z & 0xffff);
        assert(pos < 16);
        return buf + pos;
    }

    return NULL; // no match
}

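/*
 * Classify the 16 bytes in v: returns a movemask with bit i set if byte i
 * is NOT in the character class, so a clear bit marks a matching byte.
 */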
static really_inline
u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) {

    m128 highconst = _mm_set1_epi8(0x80);
    m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201);

    // and now do the real work
    m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v);
    m128 t1 = xor128(v, highconst);
    m128 shuf2 = pshufb_m128(shuf_mask_lo_highset, t1);
    m128 t2 = andnot128(highconst, rshift64_m128(v, 4));
    m128 shuf3 = pshufb_m128(shuf_mask_hi, t2);
    m128 tmp = and128(or128(shuf1, shuf2), shuf3);
    m128 tmp2 = eq128(tmp, zeroes128());
    u32 z = movemask128(tmp2);

    return z;
}

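/*
 * Short-buffer case (fewer than 16 bytes): copy the bytes into a zeroed
 * vector, classify them, and force the positions beyond buf_end to
 * non-match. Returns the first matching byte, or buf_end if there is none.
 */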
static
const u8 *truffleMini(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 16);

    m128 chars = zeroes128();
    memcpy(&chars, buf, len);

    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    // mask off the bytes that lie beyond the end of the buffer so they
    // cannot be reported as matches
    u32 mask = (0xffff >> (16 - len)) ^ 0xffff;
    const u8 *rv = firstMatch(buf, z | mask);

    if (rv) {
        return rv;
    } else {
        return buf_end;
    }
}

static really_inline
const u8 *fwdBlock(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                   m128 v, const u8 *buf) {
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return firstMatch(buf, z);
}

static really_inline
const u8 *revBlock(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                   m128 v, const u8 *buf) {
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return lastMatch(buf, z);
}

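/*
 * Forward scan: returns a pointer to the first byte of [buf, buf_end) that
 * is in the class, or buf_end if no byte matches.
 */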
const u8 *truffleExec(m128 shuf_mask_lo_highclear,
                      m128 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    DEBUG_PRINTF("len %zu\n", buf_end - buf);

    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    if (buf_end - buf < 16) {
        return truffleMini(shuf_mask_lo_highclear, shuf_mask_lo_highset, buf,
                           buf_end);
    }

    size_t min = (size_t)buf % 16;
    assert(buf_end - buf >= 16);

    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf);
    rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, buf);
    if (rv) {
        return rv;
    }
    buf += (16 - min);

    const u8 *last_block = buf_end - 16;
    while (buf < last_block) {
        m128 lchars = load128(buf);
        rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, lchars,
                      buf);
        if (rv) {
            return rv;
        }
        buf += 16;
    }

    // Use an unaligned load to mop up the last 16 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 16);
    chars = loadu128(buf_end - 16);
    rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars,
                  buf_end - 16);
    if (rv) {
        return rv;
    }

    return buf_end;
}

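/*
 * Reverse equivalent of truffleMini: returns the last matching byte in the
 * short buffer, or buf - 1 if there is no match.
 */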
static
const u8 *truffleRevMini(m128 shuf_mask_lo_highclear,
                         m128 shuf_mask_lo_highset, const u8 *buf,
                         const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 16);

    m128 chars = zeroes128();
    memcpy(&chars, buf, len);

    u32 mask = (0xffff >> (16 - len)) ^ 0xffff;
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    const u8 *rv = lastMatch(buf, z | mask);

    if (rv) {
        return rv;
    }
    return buf - 1;
}

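/*
 * Reverse scan: returns a pointer to the last byte of [buf, buf_end) that
 * is in the class, or buf - 1 if no byte matches.
 */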
const u8 *rtruffleExec(m128 shuf_mask_lo_highclear,
                       m128 shuf_mask_lo_highset,
                       const u8 *buf, const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    DEBUG_PRINTF("len %zu\n", buf_end - buf);

    if (buf_end - buf < 16) {
        return truffleRevMini(shuf_mask_lo_highclear, shuf_mask_lo_highset,
                              buf, buf_end);
    }

    assert(buf_end - buf >= 16);

    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf_end - 16);
    rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars,
                  buf_end - 16);
    if (rv) {
        return rv;
    }
    buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0xf));

    const u8 *last_block = buf + 16;
    while (buf_end > last_block) {
        buf_end -= 16;
        m128 lchars = load128(buf_end);
        rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, lchars,
                      buf_end);
        if (rv) {
            return rv;
        }
    }

    // Use an unaligned load to mop up the first 16 bytes of the buffer and
    // get an accurate picture down to buf.
    chars = loadu128(buf);
    rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, buf);
    if (rv) {
        return rv;
    }

    return buf - 1;
}

#elif !defined(HAVE_AVX512)

// AVX2
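// Same scheme as the 128-bit implementation above, but classifying 32 bytes
// per block; the movemask is a full 32 bits, so firstMatch()/lastMatch()
// need no extra masking of z.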

static really_inline
const u8 *lastMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffffffff)) {
        u32 pos = clz32(~z);
        assert(pos < 32);
        return buf + (31 - pos);
    }

    return NULL; // no match
}

static really_inline
const u8 *firstMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffffffff)) {
        u32 pos = ctz32(~z);
        assert(pos < 32);
        return buf + pos;
    }

    return NULL; // no match
}

static really_inline
u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) {

    m256 highconst = _mm256_set1_epi8(0x80);
    m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201);

    // and now do the real work
    m256 shuf1 = pshufb_m256(shuf_mask_lo_highclear, v);
    m256 t1 = xor256(v, highconst);
    m256 shuf2 = pshufb_m256(shuf_mask_lo_highset, t1);
    m256 t2 = andnot256(highconst, rshift64_m256(v, 4));
    m256 shuf3 = pshufb_m256(shuf_mask_hi, t2);
    m256 tmp = and256(or256(shuf1, shuf2), shuf3);
    m256 tmp2 = eq256(tmp, zeroes256());
    u32 z = movemask256(tmp2);

    return z;
}

static
const u8 *truffleMini(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 32);

    m256 chars = zeroes256();
    memcpy(&chars, buf, len);

    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    // mask off the bytes that lie beyond the end of the buffer so they
    // cannot be reported as matches
    u32 mask = (0xffffffff >> (32 - len)) ^ 0xffffffff;
    const u8 *rv = firstMatch(buf, z | mask);

    if (rv) {
        return rv;
    } else {
        return buf_end;
    }
}

static really_inline
const u8 *fwdBlock(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset,
                   m256 v, const u8 *buf) {
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return firstMatch(buf, z);
}

static really_inline
const u8 *revBlock(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset,
                   m256 v, const u8 *buf) {
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return lastMatch(buf, z);
}

const u8 *truffleExec(m128 shuf_mask_lo_highclear,
                      m128 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    DEBUG_PRINTF("len %zu\n", buf_end - buf);
    const m256 wide_clear = set2x128(shuf_mask_lo_highclear);
    const m256 wide_set = set2x128(shuf_mask_lo_highset);

    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    if (buf_end - buf < 32) {
        return truffleMini(wide_clear, wide_set, buf, buf_end);
    }

    size_t min = (size_t)buf % 32;
    assert(buf_end - buf >= 32);

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf);
    rv = fwdBlock(wide_clear, wide_set, chars, buf);
    if (rv) {
        return rv;
    }
    buf += (32 - min);

    const u8 *last_block = buf_end - 32;
    while (buf < last_block) {
        m256 lchars = load256(buf);
        rv = fwdBlock(wide_clear, wide_set, lchars, buf);
        if (rv) {
            return rv;
        }
        buf += 32;
    }

    // Use an unaligned load to mop up the last 32 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 32);
    chars = loadu256(buf_end - 32);
    rv = fwdBlock(wide_clear, wide_set, chars, buf_end - 32);
    if (rv) {
        return rv;
    }
    return buf_end;
}

static
const u8 *truffleRevMini(m256 shuf_mask_lo_highclear,
                         m256 shuf_mask_lo_highset, const u8 *buf,
                         const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 32);

    m256 chars = zeroes256();
    memcpy(&chars, buf, len);

    u32 mask = (0xffffffff >> (32 - len)) ^ 0xffffffff;
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    const u8 *rv = lastMatch(buf, z | mask);

    if (rv) {
        return rv;
    }
    return buf - 1;
}

const u8 *rtruffleExec(m128 shuf_mask_lo_highclear,
                       m128 shuf_mask_lo_highset,
                       const u8 *buf, const u8 *buf_end) {
    const m256 wide_clear = set2x128(shuf_mask_lo_highclear);
    const m256 wide_set = set2x128(shuf_mask_lo_highset);
    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    DEBUG_PRINTF("len %zu\n", buf_end - buf);

    if (buf_end - buf < 32) {
        return truffleRevMini(wide_clear, wide_set, buf, buf_end);
    }

    assert(buf_end - buf >= 32);

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf_end - 32);
    rv = revBlock(wide_clear, wide_set, chars, buf_end - 32);
    if (rv) {
        return rv;
    }
    buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0x1f));

    const u8 *last_block = buf + 32;
    while (buf_end > last_block) {
        buf_end -= 32;
        m256 lchars = load256(buf_end);
        rv = revBlock(wide_clear, wide_set, lchars, buf_end);
        if (rv) {
            return rv;
        }
    }

    // Use an unaligned load to mop up the first 32 bytes of the buffer and
    // get an accurate picture down to buf.
    chars = loadu256(buf);
    rv = revBlock(wide_clear, wide_set, chars, buf);
    if (rv) {
        return rv;
    }
    return buf - 1;
}

#else // AVX512

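// AVX-512: 64 bytes per block. eq512mask() produces the 64-bit non-match
// mask directly (no movemask step), and masked loads replace the memcpy
// used by the narrower truffleMini variants.
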
static really_inline
const u8 *lastMatch(const u8 *buf, u64a z) {
    if (unlikely(z != ~0ULL)) {
        u64a pos = clz64(~z);
        assert(pos < 64);
        return buf + (63 - pos);
    }

    return NULL; // no match
}

static really_inline
const u8 *firstMatch(const u8 *buf, u64a z) {
    if (unlikely(z != ~0ULL)) {
        u64a pos = ctz64(~z);
        assert(pos < 64);
        DEBUG_PRINTF("pos %llu\n", pos);
        return buf + pos;
    }

    return NULL; // no match
}

static really_inline
u64a block(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, m512 v) {
    m512 highconst = set64x8(0x80);
    m512 shuf_mask_hi = set8x64(0x8040201008040201);

    // and now do the real work
    m512 shuf1 = pshufb_m512(shuf_mask_lo_highclear, v);
    m512 t1 = xor512(v, highconst);
    m512 shuf2 = pshufb_m512(shuf_mask_lo_highset, t1);
    m512 t2 = andnot512(highconst, rshift64_m512(v, 4));
    m512 shuf3 = pshufb_m512(shuf_mask_hi, t2);
    m512 tmp = and512(or512(shuf1, shuf2), shuf3);
    u64a z = eq512mask(tmp, zeroes512());

    return z;
}

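/*
 * Short/misaligned case: loadu_maskz_m512() loads only the valid bytes
 * (zeroing the rest), and OR-ing ~mask into z forces the positions beyond
 * buf_end to non-match. Returns the first match or NULL; unlike the
 * narrower variants, the caller supplies the buf_end fallback.
 */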
static really_inline
const u8 *truffleMini(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len <= 64);

    __mmask64 mask = (~0ULL) >> (64 - len);

    m512 chars = loadu_maskz_m512(mask, buf);

    u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);

    const u8 *rv = firstMatch(buf, z | ~mask);

    return rv;
}

static really_inline
const u8 *fwdBlock(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset,
                   m512 v, const u8 *buf) {
    u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return firstMatch(buf, z);
}

static really_inline
const u8 *revBlock(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset,
                   m512 v, const u8 *buf) {
    u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return lastMatch(buf, z);
}

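/*
 * Forward scan, AVX-512 flavour: a misaligned head is handled with a masked
 * truffleMini over the bytes up to the next 64-byte boundary rather than an
 * overlapping unaligned load.
 */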
const u8 *truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    DEBUG_PRINTF("len %zu\n", buf_end - buf);
    const m512 wide_clear = set4x128(shuf_mask_lo_highclear);
    const m512 wide_set = set4x128(shuf_mask_lo_highset);

    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    if (buf_end - buf <= 64) {
        rv = truffleMini(wide_clear, wide_set, buf, buf_end);
        return rv ? rv : buf_end;
    }

    assert(buf_end - buf >= 64);
    if ((uintptr_t)buf % 64) {
        // Preconditioning: most of the time our buffer won't be aligned.
        rv = truffleMini(wide_clear, wide_set, buf, ROUNDUP_PTR(buf, 64));
        if (rv) {
            return rv;
        }
        buf = ROUNDUP_PTR(buf, 64);
    }
    const u8 *last_block = buf_end - 64;
    while (buf < last_block) {
        m512 lchars = load512(buf);
        rv = fwdBlock(wide_clear, wide_set, lchars, buf);
        if (rv) {
            return rv;
        }
        buf += 64;
    }

    // Use an unaligned load to mop up the last 64 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 64);
    m512 chars = loadu512(buf_end - 64);
    rv = fwdBlock(wide_clear, wide_set, chars, buf_end - 64);
    if (rv) {
        return rv;
    }
    return buf_end;
}

static really_inline
const u8 *truffleRevMini(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset,
                         const u8 *buf, const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 64);

    __mmask64 mask = (~0ULL) >> (64 - len);
    m512 chars = loadu_maskz_m512(mask, buf);
    u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    DEBUG_PRINTF("mask 0x%016llx z 0x%016llx\n", mask, z);
    const u8 *rv = lastMatch(buf, z | ~mask);

    if (rv) {
        return rv;
    }
    return buf - 1;
}

const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                       const u8 *buf, const u8 *buf_end) {
    const m512 wide_clear = set4x128(shuf_mask_lo_highclear);
    const m512 wide_set = set4x128(shuf_mask_lo_highset);
    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    DEBUG_PRINTF("len %zu\n", buf_end - buf);

    if (buf_end - buf < 64) {
        return truffleRevMini(wide_clear, wide_set, buf, buf_end);
    }

    assert(buf_end - buf >= 64);

    // Preconditioning: most of the time our buffer won't be aligned.
    m512 chars = loadu512(buf_end - 64);
    rv = revBlock(wide_clear, wide_set, chars, buf_end - 64);
    if (rv) {
        return rv;
    }
    buf_end = (const u8 *)ROUNDDOWN_N((uintptr_t)buf_end, 64);

    const u8 *last_block = buf + 64;
    while (buf_end > last_block) {
        buf_end -= 64;
        m512 lchars = load512(buf_end);
        rv = revBlock(wide_clear, wide_set, lchars, buf_end);
        if (rv) {
            return rv;
        }
    }

    // Use an unaligned load to mop up the first 64 bytes of the buffer and
    // get an accurate picture down to buf.
    chars = loadu512(buf);
    rv = revBlock(wide_clear, wide_set, chars, buf);
    if (rv) {
        return rv;
    }
    return buf - 1;
}

#endif