/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Matches a byte in a charclass using three shuffles
 */

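/*
 * In outline (see block() below): the tables are laid out so that, for a
 * byte c with the high bit clear, bit (c >> 4) of
 * shuf_mask_lo_highclear[c & 0xf] is set iff c is in the class;
 * shuf_mask_lo_highset plays the same role for bytes with the high bit set.
 * A block of input is classified with three PSHUFBs: two select the table
 * entry for each byte's low nibble (one per half of the byte range, the
 * unused one shuffling in zeroes), and a third maps the byte's high nibble
 * to the single-bit mask 1 << (high_nibble & 0x7). ANDing the ORed table
 * entries with that mask leaves a nonzero byte exactly where the input byte
 * is in the class.
 *
 * For example, 'a' (0x61) reads shuf_mask_lo_highclear[0x1] and tests its
 * bit 6 (0x40).
 */
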
#include "ue2common.h"
#include "truffle.h"
#include "util/arch.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"

#if !defined(HAVE_AVX2)

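/*
 * Note the inverted sense of block()'s result: a set bit in z means "no
 * match at this byte", so the helpers below look for zero bits in z.
 */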
static really_inline
const u8 *lastMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffff)) {
        u32 pos = clz32(~z & 0xffff);
        assert(pos >= 16 && pos < 32);
        return buf + (31 - pos);
    }

    return NULL; // no match
}

static really_inline
const u8 *firstMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffff)) {
        u32 pos = ctz32(~z & 0xffff);
        assert(pos < 16);
        return buf + pos;
    }

    return NULL; // no match
}

static really_inline
u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) {

    m128 highconst = _mm_set1_epi8(0x80);
    m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201);

    // and now do the real work
    m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v);
    m128 t1 = xor128(v, highconst);
    m128 shuf2 = pshufb_m128(shuf_mask_lo_highset, t1);
    m128 t2 = andnot128(highconst, rshift64_m128(v, 4));
    m128 shuf3 = pshufb_m128(shuf_mask_hi, t2);
    m128 tmp = and128(or128(shuf1, shuf2), shuf3);
    m128 tmp2 = eq128(tmp, zeroes128());
    u32 z = movemask128(tmp2);

    return z;
}

static
const u8 *truffleMini(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 16);

    m128 chars = zeroes128();
    memcpy(&chars, buf, len);

    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    // mask out positions beyond the end of the data: a set bit in z means
    // "no match", so these bytes can never report one
    u32 mask = (0xffff >> (16 - len)) ^ 0xffff;
    const u8 *rv = firstMatch(buf, z | mask);

    if (rv) {
        return rv;
    } else {
        return buf_end;
    }
}

static really_inline
const u8 *fwdBlock(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                   m128 v, const u8 *buf) {
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return firstMatch(buf, z);
}

static really_inline
const u8 *revBlock(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                   m128 v, const u8 *buf) {
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return lastMatch(buf, z);
}

const u8 *truffleExec(m128 shuf_mask_lo_highclear,
                      m128 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    DEBUG_PRINTF("len %zu\n", buf_end - buf);

    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    if (buf_end - buf < 16) {
        return truffleMini(shuf_mask_lo_highclear, shuf_mask_lo_highset, buf,
                           buf_end);
    }

    size_t min = (size_t)buf % 16;
    assert(buf_end - buf >= 16);

    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf);
    rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, buf);
    if (rv) {
        return rv;
    }
    buf += (16 - min);

    const u8 *last_block = buf_end - 16;
    while (buf < last_block) {
        m128 lchars = load128(buf);
        rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, lchars,
                      buf);
        if (rv) {
            return rv;
        }
        buf += 16;
    }

    // Use an unaligned load to mop up the last 16 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 16);
    chars = loadu128(buf_end - 16);
    rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars,
                  buf_end - 16);
    if (rv) {
        return rv;
    }

    return buf_end;
}

static
const u8 *truffleRevMini(m128 shuf_mask_lo_highclear,
                         m128 shuf_mask_lo_highset, const u8 *buf,
                         const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 16);

    m128 chars = zeroes128();
    memcpy(&chars, buf, len);

    u32 mask = (0xffff >> (16 - len)) ^ 0xffff;
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    const u8 *rv = lastMatch(buf, z | mask);

    if (rv) {
        return rv;
    }
    return buf - 1;
}

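/*
 * The reverse scan returns a pointer to the last byte in [buf, buf_end)
 * that is in the class, or buf - 1 if there is no such byte.
 */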
const u8 *rtruffleExec(m128 shuf_mask_lo_highclear,
                       m128 shuf_mask_lo_highset,
                       const u8 *buf, const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    DEBUG_PRINTF("len %zu\n", buf_end - buf);

    if (buf_end - buf < 16) {
        return truffleRevMini(shuf_mask_lo_highclear, shuf_mask_lo_highset,
                              buf, buf_end);
    }

    assert(buf_end - buf >= 16);

    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf_end - 16);
    rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars,
                  buf_end - 16);
    if (rv) {
        return rv;
    }
    buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0xf));

    const u8 *last_block = buf + 16;
    while (buf_end > last_block) {
        buf_end -= 16;
        m128 lchars = load128(buf_end);
        rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, lchars,
                      buf_end);
        if (rv) {
            return rv;
        }
    }

    // Use an unaligned load to mop up the first 16 bytes and get an accurate
    // picture back to buf.
    chars = loadu128(buf);
    rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, buf);
    if (rv) {
        return rv;
    }

    return buf - 1;
}

#elif !defined(HAVE_AVX512)

// AVX2

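// The 32-byte blocks below follow the same three-shuffle scheme as the SSE
// code above; truffleExec/rtruffleExec widen the 128-bit nibble tables to
// 256 bits with set2x128() so that both lanes use identical lookup tables.
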
static really_inline
const u8 *lastMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffffffff)) {
        u32 pos = clz32(~z);
        assert(pos < 32);
        return buf + (31 - pos);
    }

    return NULL; // no match
}

static really_inline
const u8 *firstMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffffffff)) {
        u32 pos = ctz32(~z);
        assert(pos < 32);
        return buf + pos;
    }

    return NULL; // no match
}

static really_inline
u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) {

    m256 highconst = _mm256_set1_epi8(0x80);
    m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201);

    // and now do the real work
    m256 shuf1 = pshufb_m256(shuf_mask_lo_highclear, v);
    m256 t1 = xor256(v, highconst);
    m256 shuf2 = pshufb_m256(shuf_mask_lo_highset, t1);
    m256 t2 = andnot256(highconst, rshift64_m256(v, 4));
    m256 shuf3 = pshufb_m256(shuf_mask_hi, t2);
    m256 tmp = and256(or256(shuf1, shuf2), shuf3);
    m256 tmp2 = eq256(tmp, zeroes256());
    u32 z = movemask256(tmp2);

    return z;
}

static
const u8 *truffleMini(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 32);

    m256 chars = zeroes256();
    memcpy(&chars, buf, len);

    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    // mask out positions beyond the end of the data: a set bit in z means
    // "no match", so these bytes can never report one
    u32 mask = (0xffffffff >> (32 - len)) ^ 0xffffffff;
    const u8 *rv = firstMatch(buf, z | mask);

    if (rv) {
        return rv;
    } else {
        return buf_end;
    }
}

static really_inline
const u8 *fwdBlock(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset,
                   m256 v, const u8 *buf) {
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return firstMatch(buf, z);
}

static really_inline
const u8 *revBlock(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset,
                   m256 v, const u8 *buf) {
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return lastMatch(buf, z);
}

const u8 *truffleExec(m128 shuf_mask_lo_highclear,
                      m128 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    DEBUG_PRINTF("len %zu\n", buf_end - buf);
    const m256 wide_clear = set2x128(shuf_mask_lo_highclear);
    const m256 wide_set = set2x128(shuf_mask_lo_highset);

    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    if (buf_end - buf < 32) {
        return truffleMini(wide_clear, wide_set, buf, buf_end);
    }

    size_t min = (size_t)buf % 32;
    assert(buf_end - buf >= 32);

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf);
    rv = fwdBlock(wide_clear, wide_set, chars, buf);
    if (rv) {
        return rv;
    }
    buf += (32 - min);

    const u8 *last_block = buf_end - 32;
    while (buf < last_block) {
        m256 lchars = load256(buf);
        rv = fwdBlock(wide_clear, wide_set, lchars, buf);
        if (rv) {
            return rv;
        }
        buf += 32;
    }

    // Use an unaligned load to mop up the last 32 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 32);
    chars = loadu256(buf_end - 32);
    rv = fwdBlock(wide_clear, wide_set, chars, buf_end - 32);
    if (rv) {
        return rv;
    }
    return buf_end;
}

static
const u8 *truffleRevMini(m256 shuf_mask_lo_highclear,
                         m256 shuf_mask_lo_highset, const u8 *buf,
                         const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 32);

    m256 chars = zeroes256();
    memcpy(&chars, buf, len);

    u32 mask = (0xffffffff >> (32 - len)) ^ 0xffffffff;
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    const u8 *rv = lastMatch(buf, z | mask);

    if (rv) {
        return rv;
    }
    return buf - 1;
}

const u8 *rtruffleExec(m128 shuf_mask_lo_highclear,
                       m128 shuf_mask_lo_highset,
                       const u8 *buf, const u8 *buf_end) {
    const m256 wide_clear = set2x128(shuf_mask_lo_highclear);
    const m256 wide_set = set2x128(shuf_mask_lo_highset);
    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    DEBUG_PRINTF("len %zu\n", buf_end - buf);

    if (buf_end - buf < 32) {
        return truffleRevMini(wide_clear, wide_set, buf, buf_end);
    }

    assert(buf_end - buf >= 32);

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf_end - 32);
    rv = revBlock(wide_clear, wide_set, chars, buf_end - 32);
    if (rv) {
        return rv;
    }
    buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0x1f));

    const u8 *last_block = buf + 32;
    while (buf_end > last_block) {
        buf_end -= 32;
        m256 lchars = load256(buf_end);
        rv = revBlock(wide_clear, wide_set, lchars, buf_end);
        if (rv) {
            return rv;
        }
    }

    // Use an unaligned load to mop up the first 32 bytes and get an accurate
    // picture back to buf.
    chars = loadu256(buf);
    rv = revBlock(wide_clear, wide_set, chars, buf);
    if (rv) {
        return rv;
    }
    return buf - 1;
}

#else // AVX512

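// The AVX512 variant processes 64 bytes per block. eq512mask() yields the
// "no match" bitmap directly as a 64-bit mask, and the partial (mini) cases
// use masked loads instead of copying into a zeroed vector.
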
static really_inline
const u8 *lastMatch(const u8 *buf, u64a z) {
    if (unlikely(z != ~0ULL)) {
        u64a pos = clz64(~z);
        assert(pos < 64);
        return buf + (63 - pos);
    }

    return NULL; // no match
}

static really_inline
const u8 *firstMatch(const u8 *buf, u64a z) {
    if (unlikely(z != ~0ULL)) {
        u64a pos = ctz64(~z);
        assert(pos < 64);
        DEBUG_PRINTF("pos %llu\n", pos);
        return buf + pos;
    }

    return NULL; // no match
}

static really_inline
u64a block(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, m512 v) {
    m512 highconst = set64x8(0x80);
    m512 shuf_mask_hi = set8x64(0x8040201008040201);

    // and now do the real work
    m512 shuf1 = pshufb_m512(shuf_mask_lo_highclear, v);
    m512 t1 = xor512(v, highconst);
    m512 shuf2 = pshufb_m512(shuf_mask_lo_highset, t1);
    m512 t2 = andnot512(highconst, rshift64_m512(v, 4));
    m512 shuf3 = pshufb_m512(shuf_mask_hi, t2);
    m512 tmp = and512(or512(shuf1, shuf2), shuf3);
    u64a z = eq512mask(tmp, zeroes512());

    return z;
}

static really_inline
const u8 *truffleMini(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len <= 64);

    __mmask64 mask = (~0ULL) >> (64 - len);

    m512 chars = loadu_maskz_m512(mask, buf);

    u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);

    const u8 *rv = firstMatch(buf, z | ~mask);

    return rv;
}

static really_inline
const u8 *fwdBlock(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset,
                   m512 v, const u8 *buf) {
    u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return firstMatch(buf, z);
}

static really_inline
const u8 *revBlock(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset,
                   m512 v, const u8 *buf) {
    u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return lastMatch(buf, z);
}

const u8 *truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                      const u8 *buf, const u8 *buf_end) {
    DEBUG_PRINTF("len %zu\n", buf_end - buf);
    const m512 wide_clear = set4x128(shuf_mask_lo_highclear);
    const m512 wide_set = set4x128(shuf_mask_lo_highset);

    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    if (buf_end - buf <= 64) {
        rv = truffleMini(wide_clear, wide_set, buf, buf_end);
        return rv ? rv : buf_end;
    }

    assert(buf_end - buf >= 64);
    if ((uintptr_t)buf % 64) {
        // Preconditioning: most of the time our buffer won't be aligned.
        rv = truffleMini(wide_clear, wide_set, buf, ROUNDUP_PTR(buf, 64));
        if (rv) {
            return rv;
        }
        buf = ROUNDUP_PTR(buf, 64);
    }
    const u8 *last_block = buf_end - 64;
    while (buf < last_block) {
        m512 lchars = load512(buf);
        rv = fwdBlock(wide_clear, wide_set, lchars, buf);
        if (rv) {
            return rv;
        }
        buf += 64;
    }

    // Use an unaligned load to mop up the last 64 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 64);
    m512 chars = loadu512(buf_end - 64);
    rv = fwdBlock(wide_clear, wide_set, chars, buf_end - 64);
    if (rv) {
        return rv;
    }
    return buf_end;
}

static really_inline
const u8 *truffleRevMini(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset,
                         const u8 *buf, const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 64);

    __mmask64 mask = (~0ULL) >> (64 - len);
    m512 chars = loadu_maskz_m512(mask, buf);
    u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    DEBUG_PRINTF("mask 0x%016llx z 0x%016llx\n", mask, z);
    const u8 *rv = lastMatch(buf, z | ~mask);

    if (rv) {
        return rv;
    }
    return buf - 1;
}

const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                       const u8 *buf, const u8 *buf_end) {
    const m512 wide_clear = set4x128(shuf_mask_lo_highclear);
    const m512 wide_set = set4x128(shuf_mask_lo_highset);
    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;

    DEBUG_PRINTF("len %zu\n", buf_end - buf);

    if (buf_end - buf < 64) {
        return truffleRevMini(wide_clear, wide_set, buf, buf_end);
    }

    assert(buf_end - buf >= 64);

    // Preconditioning: most of the time our buffer won't be aligned.
    m512 chars = loadu512(buf_end - 64);
    rv = revBlock(wide_clear, wide_set, chars, buf_end - 64);
    if (rv) {
        return rv;
    }
    buf_end = (const u8 *)ROUNDDOWN_N((uintptr_t)buf_end, 64);

    const u8 *last_block = buf + 64;
    while (buf_end > last_block) {
        buf_end -= 64;
        m512 lchars = load512(buf_end);
        rv = revBlock(wide_clear, wide_set, lchars, buf_end);
        if (rv) {
            return rv;
        }
    }

    // Use an unaligned load to mop up the first 64 bytes and get an accurate
    // picture back to buf.
    chars = loadu512(buf);
    rv = revBlock(wide_clear, wide_set, chars, buf);
    if (rv) {
        return rv;
    }
    return buf - 1;
}

#endif