/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Shufti: character class acceleration.
 *
 * Utilises the SSSE3 pshufb shuffle instruction.
 */
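
/*
 * A note on the lookup scheme used throughout this file (an editorial
 * sketch of the invariant, mirroring shuftiFwdSlow below): a byte c is in
 * the character class iff the two nibble tables agree, i.e.
 *
 *     (lo[c & 0xf] & hi[c >> 4]) != 0
 *
 * pshufb evaluates sixteen (or more) such table lookups in parallel, one
 * per byte lane.
 */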

#include "shufti.h"
#include "ue2common.h"
#include "util/arch.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/unaligned.h"

#ifdef DEBUG
#include <ctype.h>

#define DUMP_MSK(_t)                                \
static UNUSED                                       \
void dumpMsk##_t(m##_t msk) {                       \
    u8 *mskAsU8 = (u8 *)&msk;                       \
    for (unsigned i = 0; i < sizeof(msk); i++) {    \
        u8 c = mskAsU8[i];                          \
        for (int j = 0; j < 8; j++) {               \
            if ((c >> (7 - j)) & 0x1)               \
                printf("1");                        \
            else                                    \
                printf("0");                        \
        }                                           \
        printf(" ");                                \
    }                                               \
}                                                   \
static UNUSED                                       \
void dumpMsk##_t##AsChars(m##_t msk) {              \
    u8 *mskAsU8 = (u8 *)&msk;                       \
    for (unsigned i = 0; i < sizeof(msk); i++) {    \
        u8 c = mskAsU8[i];                          \
        if (isprint(c))                             \
            printf("%c", c);                        \
        else                                        \
            printf(".");                            \
    }                                               \
}

#endif

/** \brief Naive byte-by-byte implementation. */
static really_inline
const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf,
                        const u8 *buf_end) {
    assert(buf < buf_end);

    for (; buf < buf_end; ++buf) {
        u8 c = *buf;
        if (lo[c & 0xf] & hi[c >> 4]) {
            break;
        }
    }
    return buf;
}

/** \brief Naive byte-by-byte implementation. */
static really_inline
const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf,
                        const u8 *buf_end) {
    assert(buf < buf_end);

    for (buf_end--; buf_end >= buf; buf_end--) {
        u8 c = *buf_end;
        if (lo[c & 0xf] & hi[c >> 4]) {
            break;
        }
    }
    return buf_end;
}
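
/*
 * Worked example of the table encoding (illustrative only; the real masks
 * are built by the compiler, outside this file). To accelerate the class
 * [0-9] (bytes 0x30..0x39), pick a class bit, say bit 0, and set it in
 * hi[3] and in lo[0]..lo[9]. Then '5' (0x35) gives lo[5] & hi[3] == 1, a
 * match, while ':' (0x3a) gives lo[0xa] == 0 and '%' (0x25) gives
 * hi[2] == 0, so neither matches.
 */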

#if !defined(HAVE_AVX2)
/* Normal SSSE3 shufti */

#ifdef DEBUG
DUMP_MSK(128)
#endif

#define GET_LO_4(chars) and128(chars, low4bits)
#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4)
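
/*
 * Note on GET_HI_4: clearing the low nibble with andnot before the 64-bit
 * shift guarantees every lane of the pshufb index stays in 0..15. Bits
 * shifted in from the neighbouring byte are already zero, and no lane can
 * end up with bit 7 set, which would make pshufb zero that lane.
 */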

static really_inline
u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits,
          const m128 compare) {
    m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars));
    m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars));
    m128 t = and128(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk128(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk128(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk128(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk128(t); printf("\n");
#endif
    return movemask128(eq128(t, compare));
}

static really_inline
const u8 *firstMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffff)) {
        u32 pos = ctz32(~z & 0xffff);
        assert(pos < 16);
        return buf + pos;
    } else {
        return NULL; // no match
    }
}
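
/*
 * Note the inverted convention: block() is called with zeroes as the
 * comparand, so a set bit in z means "this byte did NOT match the class".
 * firstMatch() hence scans for the first clear bit, and an all-ones mask
 * means no match at all.
 */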

static really_inline
const u8 *fwdBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf,
                   const m128 low4bits, const m128 zeroes) {
    u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes);

    return firstMatch(buf, z);
}

const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                     const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);

    // Slow path for small cases.
    if (buf_end - buf < 16) {
        return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                             buf, buf_end);
    }

    const m128 zeroes = zeroes128();
    const m128 low4bits = _mm_set1_epi8(0xf);
    const u8 *rv;

    size_t min = (size_t)buf % 16;
    assert(buf_end - buf >= 16);

    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf);
    rv = fwdBlock(mask_lo, mask_hi, chars, buf, low4bits, zeroes);
    if (rv) {
        return rv;
    }
    buf += (16 - min);

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.

    const u8 *last_block = buf_end - 16;
    while (buf < last_block) {
        m128 lchars = load128(buf);
        rv = fwdBlock(mask_lo, mask_hi, lchars, buf, low4bits, zeroes);
        if (rv) {
            return rv;
        }
        buf += 16;
    }

    // Use an unaligned load to mop up the last 16 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 16);
    chars = loadu128(buf_end - 16);
    rv = fwdBlock(mask_lo, mask_hi, chars, buf_end - 16, low4bits, zeroes);
    if (rv) {
        return rv;
    }

    return buf_end;
}
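
/*
 * Usage sketch (hypothetical caller; build_class_masks is an illustrative
 * placeholder, as mask construction happens at compile time elsewhere):
 *
 *     m128 lo, hi;
 *     build_class_masks(&lo, &hi);  // hypothetical: fill the nibble tables
 *     const u8 *hit = shuftiExec(lo, hi, data, data + len);
 *     if (hit != data + len) {
 *         // *hit is the first byte of the buffer that is in the class
 *     }
 *
 * As with the slow path, a return value of buf_end means "no match".
 */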

static really_inline
const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) {
#ifdef DEBUG
    DEBUG_PRINTF("confirming match in:"); dumpMsk128(t); printf("\n");
#endif

    u32 z = movemask128(eq128(t, compare));
    if (unlikely(z != 0xffff)) {
        u32 pos = clz32(~z & 0xffff);
        DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
        assert(pos >= 16 && pos < 32);
        return buf + (31 - pos);
    } else {
        return NULL; // no match
    }
}
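
/*
 * Only 16 of the 32 bits in z carry data, so ~z & 0xffff confines the
 * match bits to the low half: clz32 then lands in [16, 32), which is what
 * the assert above checks, and buf + (31 - pos) converts the leading-zero
 * count into the offset of the last matching byte.
 */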

static really_inline
const u8 *revBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf,
                   const m128 low4bits, const m128 zeroes) {
    m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars));
    m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars));
    m128 t = and128(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk128(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk128(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk128(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk128(t); printf("\n");
#endif

    return lastMatch(buf, t, zeroes);
}

const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                      const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);

    // Slow path for small cases.
    if (buf_end - buf < 16) {
        return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                             buf, buf_end);
    }

    const m128 zeroes = zeroes128();
    const m128 low4bits = _mm_set1_epi8(0xf);
    const u8 *rv;

    assert(buf_end - buf >= 16);

    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf_end - 16);
    rv = revBlock(mask_lo, mask_hi, chars, buf_end - 16, low4bits, zeroes);
    if (rv) {
        return rv;
    }
    buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0xf));

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.

    const u8 *last_block = buf + 16;
    while (buf_end > last_block) {
        buf_end -= 16;
        m128 lchars = load128(buf_end);
        rv = revBlock(mask_lo, mask_hi, lchars, buf_end, low4bits, zeroes);
        if (rv) {
            return rv;
        }
    }

    // Use an unaligned load to mop up the last 16 bytes and get an accurate
    // picture to buf.
    chars = loadu128(buf);
    rv = revBlock(mask_lo, mask_hi, chars, buf, low4bits, zeroes);
    if (rv) {
        return rv;
    }

    return buf - 1;
}

static really_inline
const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi,
                    m128 chars, const u8 *buf, const m128 low4bits,
                    const m128 ones) {
    m128 chars_lo = GET_LO_4(chars);
    m128 chars_hi = GET_HI_4(chars);
    m128 c_lo = pshufb_m128(mask1_lo, chars_lo);
    m128 c_hi = pshufb_m128(mask1_hi, chars_hi);
    m128 t = or128(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk128(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk128(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk128(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk128(t); printf("\n");
#endif

    m128 c2_lo = pshufb_m128(mask2_lo, chars_lo);
    m128 c2_hi = pshufb_m128(mask2_hi, chars_hi);
    m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1));

#ifdef DEBUG
    DEBUG_PRINTF(" c2_lo: "); dumpMsk128(c2_lo); printf("\n");
    DEBUG_PRINTF(" c2_hi: "); dumpMsk128(c2_hi); printf("\n");
    DEBUG_PRINTF("    t2: "); dumpMsk128(t2); printf("\n");
#endif

    u32 z = movemask128(eq128(t2, ones));
    DEBUG_PRINTF("    z: 0x%08x\n", z);
    return firstMatch(buf, z);
}
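
/*
 * The byte shift is what pairs the two characters: the second-character
 * lookup is pulled back one lane, so a pair "ab" is decided at the
 * position of 'a'. The sense is inverted relative to single shufti: the
 * double masks are built so that set bits mark misses, hence the OR
 * accumulation and the comparison against ones rather than zeroes.
 */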

const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
                           m128 mask2_lo, m128 mask2_hi,
                           const u8 *buf, const u8 *buf_end) {
    /* we should always have at least 16 bytes */
    assert(buf_end - buf >= 16);

    const m128 ones = ones128();
    const m128 low4bits = _mm_set1_epi8(0xf);
    const u8 *rv;

    size_t min = (size_t)buf % 16;

    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf);
    rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi,
                   chars, buf, low4bits, ones);
    if (rv) {
        return rv;
    }
    buf += (16 - min);

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.

    const u8 *last_block = buf_end - 16;
    while (buf < last_block) {
        m128 lchars = load128(buf);
        rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi,
                       lchars, buf, low4bits, ones);
        if (rv) {
            return rv;
        }
        buf += 16;
    }

    // Use an unaligned load to mop up the last 16 bytes and get an accurate
    // picture to buf_end.
    chars = loadu128(buf_end - 16);
    rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi,
                   chars, buf_end - 16, low4bits, ones);
    if (rv) {
        return rv;
    }

    return buf_end;
}

#elif !defined(HAVE_AVX512)
// AVX2 - 256 wide shuftis

#ifdef DEBUG
DUMP_MSK(256)
#endif

#define GET_LO_4(chars) and256(chars, low4bits)
#define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4)

static really_inline
u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits,
          const m256 compare) {
    m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars));
    m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars));
    m256 t = and256(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk256(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk256(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk256(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk256(t); printf("\n");
#endif

    return movemask256(eq256(t, compare));
}

static really_inline
const u8 *firstMatch(const u8 *buf, u32 z) {
    DEBUG_PRINTF("z 0x%08x\n", z);
    if (unlikely(z != 0xffffffff)) {
        u32 pos = ctz32(~z);
        assert(pos < 32);
        DEBUG_PRINTF("match @ pos %u\n", pos);
        return buf + pos;
    } else {
        return NULL; // no match
    }
}

static really_inline
const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf,
                        const m256 low4bits) {
    // do the hi and lo shuffles in the one avx register
    m256 c = combine2x128(rshift64_m128(chars, 4), chars);
    c = and256(c, low4bits);
    m256 c_shuf = pshufb_m256(mask, c);
    m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf));
    // only the low 16 bits of z are valid; mark the rest as "no match"
    u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128()));

    return firstMatch(buf, z);
}
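
/*
 * The short path saves shuffles by stacking both nibble lookups into one
 * AVX register: the high lane of c holds the high nibbles (looked up in
 * mask_hi), the low lane holds the low nibbles (looked up in mask_lo), so
 * a single pshufb_m256 does both table lookups before the two halves are
 * ANDed back together.
 */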

static really_inline
const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, const u8 *buf,
                         const u8 *buf_end, const m256 low4bits) {
    // run shufti over two overlapping 16-byte unaligned reads
    const m256 mask = combine2x128(mask_hi, mask_lo);
    m128 chars = loadu128(buf);
    const u8 *rv = fwdBlockShort(mask, chars, buf, low4bits);
    if (rv) {
        return rv;
    }

    chars = loadu128(buf_end - 16);
    rv = fwdBlockShort(mask, chars, buf_end - 16, low4bits);
    if (rv) {
        return rv;
    }
    return buf_end;
}

static really_inline
const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf,
                   const m256 low4bits, const m256 zeroes) {
    u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes);

    return firstMatch(buf, z);
}

/* takes 128 bit masks, but operates on 256 bits of data */
const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                     const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);
    DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf);

    // Slow path for small cases.
    if (buf_end - buf < 16) {
        return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                             buf, buf_end);
    }

    const m256 low4bits = set32x8(0xf);

    if (buf_end - buf <= 32) {
        return shuftiFwdShort(mask_lo, mask_hi, buf, buf_end, low4bits);
    }

    const m256 zeroes = zeroes256();
    const m256 wide_mask_lo = set2x128(mask_lo);
    const m256 wide_mask_hi = set2x128(mask_hi);
    const u8 *rv;

    size_t min = (size_t)buf % 32;
    assert(buf_end - buf >= 32);

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf);
    rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes);
    if (rv) {
        return rv;
    }
    buf += (32 - min);

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.

    const u8 *last_block = buf_end - 32;
    while (buf < last_block) {
        m256 lchars = load256(buf);
        rv = fwdBlock(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits,
                      zeroes);
        if (rv) {
            return rv;
        }
        buf += 32;
    }

    // Use an unaligned load to mop up the last 32 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 32);
    chars = loadu256(buf_end - 32);
    rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits,
                  zeroes);
    if (rv) {
        return rv;
    }

    return buf_end;
}

static really_inline
const u8 *lastMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffffffff)) {
        u32 pos = clz32(~z);
        DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
        return buf + (31 - pos);
    } else {
        return NULL; // no match
    }
}

static really_inline
const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf,
                   const m256 low4bits, const m256 zeroes) {
    m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars));
    m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars));
    m256 t = and256(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk256(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk256(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk256(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk256(t); printf("\n");
#endif

    u32 z = movemask256(eq256(t, zeroes));
    return lastMatch(buf, z);
}

static really_inline
const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf,
                        const m256 low4bits) {
    // do the hi and lo shuffles in the one avx register
    m256 c = combine2x128(rshift64_m128(chars, 4), chars);
    c = and256(c, low4bits);
    m256 c_shuf = pshufb_m256(mask, c);
    m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf));
    // only the low 16 bits of z are valid; mark the rest as "no match"
    u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128()));

    return lastMatch(buf, z);
}

static really_inline
const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, const u8 *buf,
                         const u8 *buf_end, const m256 low4bits) {
    // run shufti over two overlapping 16-byte unaligned reads
    const m256 mask = combine2x128(mask_hi, mask_lo);

    m128 chars = loadu128(buf_end - 16);
    const u8 *rv = revBlockShort(mask, chars, buf_end - 16, low4bits);
    if (rv) {
        return rv;
    }

    chars = loadu128(buf);
    rv = revBlockShort(mask, chars, buf, low4bits);
    if (rv) {
        return rv;
    }
    return buf - 1;
}

/* takes 128 bit masks, but operates on 256 bits of data */
const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                      const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);

    // Slow path for small cases.
    if (buf_end - buf < 16) {
        return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                             buf, buf_end);
    }

    const m256 low4bits = set32x8(0xf);

    if (buf_end - buf <= 32) {
        return shuftiRevShort(mask_lo, mask_hi, buf, buf_end, low4bits);
    }

    const m256 zeroes = zeroes256();
    const m256 wide_mask_lo = set2x128(mask_lo);
    const m256 wide_mask_hi = set2x128(mask_hi);
    const u8 *rv;

    assert(buf_end - buf >= 32);

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf_end - 32);
    rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits,
                  zeroes);
    if (rv) {
        return rv;
    }
    buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0x1f));

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.
    const u8 *last_block = buf + 32;
    while (buf_end > last_block) {
        buf_end -= 32;
        m256 lchars = load256(buf_end);
        rv = revBlock(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits,
                      zeroes);
        if (rv) {
            return rv;
        }
    }

    // Use an unaligned load to mop up the last 32 bytes and get an accurate
    // picture to buf.
    chars = loadu256(buf);
    rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes);
    if (rv) {
        return rv;
    }

    return buf - 1;
}

static really_inline
const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi,
                    m256 chars, const u8 *buf, const m256 low4bits,
                    const m256 ones) {
    DEBUG_PRINTF("buf %p\n", buf);
    m256 chars_lo = GET_LO_4(chars);
    m256 chars_hi = GET_HI_4(chars);
    m256 c_lo = pshufb_m256(mask1_lo, chars_lo);
    m256 c_hi = pshufb_m256(mask1_hi, chars_hi);
    m256 t = or256(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk256(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk256(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk256(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk256(t); printf("\n");
#endif

    m256 c2_lo = pshufb_m256(mask2_lo, chars_lo);
    m256 c2_hi = pshufb_m256(mask2_hi, chars_hi);
    m256 t2 = or256(t, rshift128_m256(or256(c2_lo, c2_hi), 1));

#ifdef DEBUG
    DEBUG_PRINTF(" c2_lo: "); dumpMsk256(c2_lo); printf("\n");
    DEBUG_PRINTF(" c2_hi: "); dumpMsk256(c2_hi); printf("\n");
    DEBUG_PRINTF("    t2: "); dumpMsk256(t2); printf("\n");
#endif
    u32 z = movemask256(eq256(t2, ones));

    return firstMatch(buf, z);
}

static really_inline
const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf,
                         const m256 low4bits) {
    // do the hi and lo shuffles in the one avx register
    m256 c = combine2x128(rshift64_m128(chars, 4), chars);
    c = and256(c, low4bits);
    m256 c_shuf1 = pshufb_m256(mask1, c);
    m256 c_shuf2 = rshift128_m256(pshufb_m256(mask2, c), 1);
    m256 t0 = or256(c_shuf1, c_shuf2);
    m128 t = or128(movdq_hi(t0), cast256to128(t0));
    // only the low 16 bits of z are valid; mark the rest as "no match"
    u32 z = 0xffff0000U | movemask128(eq128(t, ones128()));

    return firstMatch(buf, z);
}

static really_inline
const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo,
                            m128 mask2_hi, const u8 *buf, const u8 *buf_end) {
    DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);
    const m256 low4bits = set32x8(0xf);
    // run shufti over two overlapping 16-byte unaligned reads
    const m256 mask1 = combine2x128(mask1_hi, mask1_lo);
    const m256 mask2 = combine2x128(mask2_hi, mask2_lo);
    m128 chars = loadu128(buf);
    const u8 *rv = fwdBlockShort2(mask1, mask2, chars, buf, low4bits);
    if (rv) {
        return rv;
    }

    chars = loadu128(buf_end - 16);
    rv = fwdBlockShort2(mask1, mask2, chars, buf_end - 16, low4bits);
    if (rv) {
        return rv;
    }
    return buf_end;
}

/* takes 128 bit masks, but operates on 256 bits of data */
const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
                           m128 mask2_lo, m128 mask2_hi,
                           const u8 *buf, const u8 *buf_end) {
    /* we should always have at least 16 bytes */
    assert(buf_end - buf >= 16);
    DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);

    if (buf_end - buf < 32) {
        return shuftiDoubleShort(mask1_lo, mask1_hi, mask2_lo, mask2_hi, buf,
                                 buf_end);
    }

    const m256 ones = ones256();
    const m256 low4bits = set32x8(0xf);
    const m256 wide_mask1_lo = set2x128(mask1_lo);
    const m256 wide_mask1_hi = set2x128(mask1_hi);
    const m256 wide_mask2_lo = set2x128(mask2_lo);
    const m256 wide_mask2_hi = set2x128(mask2_hi);
    const u8 *rv;

    size_t min = (size_t)buf % 32;

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf);
    rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi,
                   chars, buf, low4bits, ones);
    if (rv) {
        return rv;
    }
    buf += (32 - min);

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.
    const u8 *last_block = buf_end - 32;
    while (buf < last_block) {
        m256 lchars = load256(buf);
        rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo,
                       wide_mask2_hi, lchars, buf, low4bits, ones);
        if (rv) {
            return rv;
        }
        buf += 32;
    }

    // Use an unaligned load to mop up the last 32 bytes and get an accurate
    // picture to buf_end.
    chars = loadu256(buf_end - 32);
    rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi,
                   chars, buf_end - 32, low4bits, ones);
    if (rv) {
        return rv;
    }

    return buf_end;
}

#else // defined(HAVE_AVX512)

#ifdef DEBUG
DUMP_MSK(512)
#endif

static really_inline
u64a block(m512 mask_lo, m512 mask_hi, m512 chars, const m512 low4bits,
           const m512 compare) {
    m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits));
    m512 c_hi = pshufb_m512(mask_hi,
                            rshift64_m512(andnot512(low4bits, chars), 4));
    m512 t = and512(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk512(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk512(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk512(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk512(t); printf("\n");
#endif

    return eq512mask(t, compare);
}

static really_inline
const u8 *firstMatch64(const u8 *buf, u64a z) {
    DEBUG_PRINTF("z 0x%016llx\n", z);
    if (unlikely(z != ~0ULL)) {
        u32 pos = ctz64(~z);
        DEBUG_PRINTF("match @ pos %u\n", pos);
        assert(pos < 64);
        return buf + pos;
    } else {
        return NULL; // no match
    }
}

static really_inline
const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf,
                      const m512 low4bits, const m512 zeroes) {
    u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes);

    return firstMatch64(buf, z);
}

static really_inline
const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf,
                         const u8 *buf_end, const m512 low4bits,
                         const m512 zeroes) {
    DEBUG_PRINTF("short shufti %p len %zu\n", buf, buf_end - buf);
    uintptr_t len = buf_end - buf;
    assert(len <= 64);

    // load mask
    u64a k = (~0ULL) >> (64 - len);
    DEBUG_PRINTF("load mask 0x%016llx\n", k);

    m512 chars = loadu_maskz_m512(k, buf);

    u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes);

    // reuse the load mask to indicate valid bytes
    return firstMatch64(buf, z | ~k);
}
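
/*
 * The AVX-512 short path replaces the overlapping-read tricks of the
 * narrower implementations with a masked load: k holds one bit per valid
 * byte, loadu_maskz_m512 zeroes the rest, and OR-ing ~k into z marks the
 * lanes outside the buffer as non-matching before the scan.
 */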

/* takes 128 bit masks, but operates on 512 bits of data */
const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                     const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);
    DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf);
    DEBUG_PRINTF("b %.64s\n", buf);

    const m512 low4bits = set64x8(0xf);
    const m512 zeroes = zeroes512();
    const m512 wide_mask_lo = set4x128(mask_lo);
    const m512 wide_mask_hi = set4x128(mask_hi);
    const u8 *rv;

    // small cases.
    if (buf_end - buf <= 64) {
        rv = shortShufti512(wide_mask_lo, wide_mask_hi, buf, buf_end, low4bits,
                            zeroes);
        return rv ? rv : buf_end;
    }

    assert(buf_end - buf >= 64);

    // Preconditioning: most of the time our buffer won't be aligned.
    if ((uintptr_t)buf % 64) {
        rv = shortShufti512(wide_mask_lo, wide_mask_hi, buf,
                            ROUNDUP_PTR(buf, 64), low4bits, zeroes);
        if (rv) {
            return rv;
        }
        buf = ROUNDUP_PTR(buf, 64);
    }

    const u8 *last_block = ROUNDDOWN_PTR(buf_end, 64);
    while (buf < last_block) {
        m512 lchars = load512(buf);
        rv = fwdBlock512(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits,
                         zeroes);
        if (rv) {
            return rv;
        }
        buf += 64;
    }

    if (buf == buf_end) {
        goto done;
    }

    // Use an unaligned load to mop up the last 64 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 64);
    m512 chars = loadu512(buf_end - 64);
    rv = fwdBlock512(wide_mask_lo, wide_mask_hi, chars, buf_end - 64, low4bits,
                     zeroes);
    if (rv) {
        return rv;
    }
done:
    return buf_end;
}

static really_inline
const u8 *lastMatch64(const u8 *buf, u64a z) {
    DEBUG_PRINTF("z 0x%016llx\n", z);
    if (unlikely(z != ~0ULL)) {
        u32 pos = clz64(~z);
        DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
        return buf + (63 - pos);
    } else {
        return NULL; // no match
    }
}

static really_inline
const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf,
                          const u8 *buf_end, const m512 low4bits,
                          const m512 zeroes) {
    DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf);
    uintptr_t len = buf_end - buf;
    assert(len <= 64);

    // load mask
    u64a k = (~0ULL) >> (64 - len);
    DEBUG_PRINTF("load mask 0x%016llx\n", k);

    m512 chars = loadu_maskz_m512(k, buf);

    u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes);

    // reuse the load mask to indicate valid bytes
    return lastMatch64(buf, z | ~k);
}

static really_inline
const u8 *revBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf,
                      const m512 low4bits, const m512 zeroes) {
    m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits));
    m512 c_hi = pshufb_m512(mask_hi,
                            rshift64_m512(andnot512(low4bits, chars), 4));
    m512 t = and512(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk512(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk512(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk512(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk512(t); printf("\n");
#endif

    u64a z = eq512mask(t, zeroes);
    return lastMatch64(buf, z);
}

/* takes 128 bit masks, but operates on 512 bits of data */
const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                      const u8 *buf_end) {
    DEBUG_PRINTF("buf %p buf_end %p\n", buf, buf_end);
    assert(buf && buf_end);
    assert(buf < buf_end);

    const m512 low4bits = set64x8(0xf);
    const m512 zeroes = zeroes512();
    const m512 wide_mask_lo = set4x128(mask_lo);
    const m512 wide_mask_hi = set4x128(mask_hi);
    const u8 *rv;

    if (buf_end - buf < 64) {
        rv = rshortShufti512(wide_mask_lo, wide_mask_hi, buf, buf_end, low4bits,
                             zeroes);
        return rv ? rv : buf - 1;
    }

    if (ROUNDDOWN_PTR(buf_end, 64) != buf_end) {
        // peel off unaligned portion
        assert(buf_end - buf >= 64);
        DEBUG_PRINTF("start\n");
        rv = rshortShufti512(wide_mask_lo, wide_mask_hi,
                             ROUNDDOWN_PTR(buf_end, 64), buf_end, low4bits,
                             zeroes);
        if (rv) {
            return rv;
        }
        buf_end = ROUNDDOWN_PTR(buf_end, 64);
    }

    const u8 *last_block = ROUNDUP_PTR(buf, 64);
    while (buf_end > last_block) {
        buf_end -= 64;
        m512 lchars = load512(buf_end);
        rv = revBlock512(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits,
                         zeroes);
        if (rv) {
            return rv;
        }
    }
    if (buf_end == buf) {
        goto done;
    }
    // Use an unaligned load to mop up the last 64 bytes and get an accurate
    // picture to buf.
    m512 chars = loadu512(buf);
    rv = revBlock512(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes);
    if (rv) {
        return rv;
    }
done:
    return buf - 1;
}

static really_inline
const u8 *fwdBlock2(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 mask2_hi,
                    m512 chars, const u8 *buf, const m512 low4bits,
                    const m512 ones, __mmask64 k) {
    DEBUG_PRINTF("buf %p %.64s\n", buf, buf);
    m512 chars_lo = and512(chars, low4bits);
    m512 chars_hi = rshift64_m512(andnot512(low4bits, chars), 4);
    m512 c_lo = maskz_pshufb_m512(k, mask1_lo, chars_lo);
    m512 c_hi = maskz_pshufb_m512(k, mask1_hi, chars_hi);
    m512 t = or512(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk512(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk512(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk512(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk512(t); printf("\n");
#endif

    m512 c2_lo = maskz_pshufb_m512(k, mask2_lo, chars_lo);
    m512 c2_hi = maskz_pshufb_m512(k, mask2_hi, chars_hi);
    m512 t2 = or512(t, rshift128_m512(or512(c2_lo, c2_hi), 1));

#ifdef DEBUG
    DEBUG_PRINTF(" c2_lo: "); dumpMsk512(c2_lo); printf("\n");
    DEBUG_PRINTF(" c2_hi: "); dumpMsk512(c2_hi); printf("\n");
    DEBUG_PRINTF("    t2: "); dumpMsk512(t2); printf("\n");
#endif
    u64a z = eq512mask(t2, ones);

    return firstMatch64(buf, z | ~k);
}

static really_inline
const u8 *shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo,
                               m512 mask2_hi, const u8 *buf, const u8 *buf_end,
                               const m512 low4bits, const m512 ones) {
    DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf);
    uintptr_t len = buf_end - buf;
    assert(len <= 64);

    u64a k = (~0ULL) >> (64 - len);
    DEBUG_PRINTF("load mask 0x%016llx\n", k);

    m512 chars = loadu_mask_m512(ones, k, buf);

    const u8 *rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, buf,
                             low4bits, ones, k);

    return rv;
}

/* takes 128 bit masks, but operates on 512 bits of data */
const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
                           m128 mask2_lo, m128 mask2_hi,
                           const u8 *buf, const u8 *buf_end) {
    /* we should always have at least 16 bytes */
    assert(buf_end - buf >= 16);
    DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);

    const m512 ones = ones512();
    const m512 low4bits = set64x8(0xf);
    const m512 wide_mask1_lo = set4x128(mask1_lo);
    const m512 wide_mask1_hi = set4x128(mask1_hi);
    const m512 wide_mask2_lo = set4x128(mask2_lo);
    const m512 wide_mask2_hi = set4x128(mask2_hi);
    const u8 *rv;

    if (buf_end - buf <= 64) {
        rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo,
                                  wide_mask2_hi, buf, buf_end, low4bits, ones);
        DEBUG_PRINTF("rv %p\n", rv);
        return rv ? rv : buf_end;
    }

    // Preconditioning: most of the time our buffer won't be aligned.
    if ((uintptr_t)buf % 64) {
        rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo,
                                  wide_mask2_hi, buf, ROUNDUP_PTR(buf, 64),
                                  low4bits, ones);
        if (rv) {
            return rv;
        }

        buf = ROUNDUP_PTR(buf, 64);
    }

    const u8 *last_block = buf_end - 64;
    while (buf < last_block) {
        m512 lchars = load512(buf);
        rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo,
                       wide_mask2_hi, lchars, buf, low4bits, ones, ~0);
        if (rv) {
            return rv;
        }
        buf += 64;
    }

    // Use an unaligned load to mop up the last 64 bytes and get an accurate
    // picture to buf_end.
    m512 chars = loadu512(buf_end - 64);
    rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi,
                   chars, buf_end - 64, low4bits, ones, ~0);
    if (rv) {
        return rv;
    }

    return buf_end;
}
#endif