/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Shufti: character class acceleration.
 *
 * Utilises the SSSE3 pshufb shuffle instruction.
 */
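
/*
 * A shufti character class is described by two 16-byte tables, mask_lo and
 * mask_hi, indexed by the low and high nibble of each input byte. As the
 * scalar reference implementations (shuftiFwdSlow/shuftiRevSlow) below show,
 * a byte c belongs to the class iff (lo[c & 0xf] & hi[c >> 4]) is non-zero.
 * The SIMD paths evaluate the same test 16, 32 or 64 bytes at a time by using
 * pshufb as a parallel, nibble-indexed table lookup. The "double" variants
 * combine two such class tests on adjacent bytes.
 */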

#include "shufti.h"
#include "ue2common.h"
#include "util/arch.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/unaligned.h"

#ifdef DEBUG
#include <ctype.h>

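/* DUMP_MSK(N) defines the debugging helpers dumpMskN() (per-bit view) and
 * dumpMskNAsChars() (printable-character view) for an mN vector. */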
#define DUMP_MSK(_t)                                \
static UNUSED                                       \
void dumpMsk##_t(m##_t msk) {                       \
    u8 * mskAsU8 = (u8 *)&msk;                      \
    for (unsigned i = 0; i < sizeof(msk); i++) {    \
        u8 c = mskAsU8[i];                          \
        for (int j = 0; j < 8; j++) {               \
            if ((c >> (7-j)) & 0x1)                 \
                printf("1");                        \
            else                                    \
                printf("0");                        \
        }                                           \
        printf(" ");                                \
    }                                               \
}                                                   \
static UNUSED                                       \
void dumpMsk##_t##AsChars(m##_t msk) {              \
    u8 * mskAsU8 = (u8 *)&msk;                      \
    for (unsigned i = 0; i < sizeof(msk); i++) {    \
        u8 c = mskAsU8[i];                          \
        if (isprint(c))                             \
            printf("%c",c);                         \
        else                                        \
            printf(".");                            \
    }                                               \
}

#endif

/** \brief Naive byte-by-byte implementation (forward scan). */
static really_inline
const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf,
                        const u8 *buf_end) {
    assert(buf < buf_end);

    for (; buf < buf_end; ++buf) {
        u8 c = *buf;
        if (lo[c & 0xf] & hi[c >> 4]) {
            break;
        }
    }
    return buf;
}

/** \brief Naive byte-by-byte implementation (reverse scan). */
static really_inline
const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf,
                        const u8 *buf_end) {
    assert(buf < buf_end);

    for (buf_end--; buf_end >= buf; buf_end--) {
        u8 c = *buf_end;
        if (lo[c & 0xf] & hi[c >> 4]) {
            break;
        }
    }
    return buf_end;
}

#if !defined(HAVE_AVX2)
/* Normal SSSE3 shufti */

#ifdef DEBUG
DUMP_MSK(128)
#endif

#define GET_LO_4(chars) and128(chars, low4bits)
#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4)
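
/* GET_HI_4 clears each byte's low nibble before the 64-bit shift so that no
 * bits from the neighbouring byte are shifted in: every byte ends up holding
 * its own high nibble in its low four bits, ready for use as a pshufb index. */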

static really_inline
u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits,
          const m128 compare) {
    m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars));
    m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars));
    m128 t = and128(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n");
    DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n");
    DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n");
    DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n");
    DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n");
#endif
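    /* a set bit in the result marks a byte that is NOT in the class (t == 0);
     * an all-ones mask therefore means no match in this block */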
    return movemask128(eq128(t, compare));
}

static really_inline
const u8 *firstMatch(const u8 *buf, u32 z) {
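    /* z follows the convention from block(): bit i is set when byte i did NOT
     * match, so the first clear bit gives the offset of the first match */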
    if (unlikely(z != 0xffff)) {
        u32 pos = ctz32(~z & 0xffff);
        assert(pos < 16);
        return buf + pos;
    } else {
        return NULL; // no match
    }
}

static really_inline
const u8 *fwdBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf,
                   const m128 low4bits, const m128 zeroes) {
    u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes);

    return firstMatch(buf, z);
}

const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                     const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);

    // Slow path for small cases.
    if (buf_end - buf < 16) {
        return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                             buf, buf_end);
    }

    const m128 zeroes = zeroes128();
    const m128 low4bits = _mm_set1_epi8(0xf);
    const u8 *rv;

    size_t min = (size_t)buf % 16;
    assert(buf_end - buf >= 16);

    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf);
    rv = fwdBlock(mask_lo, mask_hi, chars, buf, low4bits, zeroes);
    if (rv) {
        return rv;
    }
    buf += (16 - min);

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.

    const u8 *last_block = buf_end - 16;
    while (buf < last_block) {
        m128 lchars = load128(buf);
        rv = fwdBlock(mask_lo, mask_hi, lchars, buf, low4bits, zeroes);
        if (rv) {
            return rv;
        }
        buf += 16;
    }

    // Use an unaligned load to mop up the last 16 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 16);
    chars = loadu128(buf_end - 16);
    rv = fwdBlock(mask_lo, mask_hi, chars, buf_end - 16, low4bits, zeroes);
    if (rv) {
        return rv;
    }

    return buf_end;
}

static really_inline
const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) {
#ifdef DEBUG
    DEBUG_PRINTF("confirming match in:"); dumpMsk128(t); printf("\n");
#endif

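    /* bit i of z is set when byte i is NOT in the class; clz32 over the low
     * 16 bits of ~z locates the highest matching byte, at offset 31 - pos */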
    u32 z = movemask128(eq128(t, compare));
    if (unlikely(z != 0xffff)) {
        u32 pos = clz32(~z & 0xffff);
        DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
        assert(pos >= 16 && pos < 32);
        return buf + (31 - pos);
    } else {
        return NULL; // no match
    }
}


static really_inline
const u8 *revBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf,
                   const m128 low4bits, const m128 zeroes) {
    m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars));
    m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars));
    m128 t = and128(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n");
    DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n");
    DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n");
    DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n");
    DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n");
#endif

    return lastMatch(buf, t, zeroes);
}

const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                      const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);

    // Slow path for small cases.
    if (buf_end - buf < 16) {
        return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                             buf, buf_end);
    }

    const m128 zeroes = zeroes128();
    const m128 low4bits = _mm_set1_epi8(0xf);
    const u8 *rv;

    assert(buf_end - buf >= 16);

    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf_end - 16);
    rv = revBlock(mask_lo, mask_hi, chars, buf_end - 16, low4bits, zeroes);
    if (rv) {
        return rv;
    }
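    // Round buf_end down to a 16-byte boundary; the trailing unaligned bytes
    // have just been checked by the block above.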
    buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0xf));

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.

    const u8 *last_block = buf + 16;
    while (buf_end > last_block) {
        buf_end -= 16;
        m128 lchars = load128(buf_end);
        rv = revBlock(mask_lo, mask_hi, lchars, buf_end, low4bits, zeroes);
        if (rv) {
            return rv;
        }
    }

    // Use an unaligned load to mop up the last 16 bytes and get an accurate
    // picture to buf.
    chars = loadu128(buf);
    rv = revBlock(mask_lo, mask_hi, chars, buf, low4bits, zeroes);
    if (rv) {
        return rv;
    }

    return buf - 1;
}

static really_inline
const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi,
                    m128 chars, const u8 *buf, const m128 low4bits,
                    const m128 ones) {
    m128 chars_lo = GET_LO_4(chars);
    m128 chars_hi = GET_HI_4(chars);
    m128 c_lo = pshufb_m128(mask1_lo, chars_lo);
    m128 c_hi = pshufb_m128(mask1_hi, chars_hi);
    m128 t = or128(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n");
    DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n");
    DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n");
    DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n");
    DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n");
#endif

    m128 c2_lo = pshufb_m128(mask2_lo, chars_lo);
    m128 c2_hi = pshufb_m128(mask2_hi, chars_hi);
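    // Shift the second-class result down by one byte so that position i
    // combines the first-class lookup for byte i with the second-class lookup
    // for byte i + 1. (The compare against 'ones' below implies the double
    // masks use an inverted encoding: a position is reported only when the
    // combined value is not all-ones.)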
    m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1));

#ifdef DEBUG
    DEBUG_PRINTF(" c2_lo: "); dumpMsk128(c2_lo); printf("\n");
    DEBUG_PRINTF(" c2_hi: "); dumpMsk128(c2_hi); printf("\n");
    DEBUG_PRINTF(" t2: "); dumpMsk128(t2); printf("\n");
#endif

    u32 z = movemask128(eq128(t2, ones));
    DEBUG_PRINTF(" z: 0x%08x\n", z);
    return firstMatch(buf, z);
}

const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
                           m128 mask2_lo, m128 mask2_hi,
                           const u8 *buf, const u8 *buf_end) {
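    /* There is no slow-path fallback here: as the AVX2 and AVX-512 versions
     * below assert, callers are expected to supply at least 16 bytes, so the
     * unaligned head and tail loads are always in bounds. */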
    const m128 ones = ones128();
    const m128 low4bits = _mm_set1_epi8(0xf);
    const u8 *rv;

    size_t min = (size_t)buf % 16;

    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf);
    rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi,
                   chars, buf, low4bits, ones);
    if (rv) {
        return rv;
    }
    buf += (16 - min);

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.

    const u8 *last_block = buf_end - 16;
    while (buf < last_block) {
        m128 lchars = load128(buf);
        rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi,
                       lchars, buf, low4bits, ones);
        if (rv) {
            return rv;
        }
        buf += 16;
    }

    // Use an unaligned load to mop up the last 16 bytes and get an accurate
    // picture to buf_end.
    chars = loadu128(buf_end - 16);
    rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi,
                   chars, buf_end - 16, low4bits, ones);
    if (rv) {
        return rv;
    }

    return buf_end;
}

#elif !defined(HAVE_AVX512)
// AVX2 - 256 wide shuftis

#ifdef DEBUG
DUMP_MSK(256)
#endif

#define GET_LO_4(chars) and256(chars, low4bits)
#define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4)

static really_inline
u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits,
          const m256 compare) {
    m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars));
    m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars));
    m256 t = and256(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n");
    DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n");
    DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n");
    DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n");
    DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n");
#endif

    return movemask256(eq256(t, compare));
}

static really_inline
const u8 *firstMatch(const u8 *buf, u32 z) {
    DEBUG_PRINTF("z 0x%08x\n", z);
    if (unlikely(z != 0xffffffff)) {
        u32 pos = ctz32(~z);
        assert(pos < 32);
        DEBUG_PRINTF("match @ pos %u\n", pos);
        return buf + pos;
    } else {
        return NULL; // no match
    }
}

static really_inline
const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf,
                        const m256 low4bits) {
    // do the hi and lo shuffles in a single AVX register
    m256 c = combine2x128(rshift64_m128(chars, 4), chars);
    c = and256(c, low4bits);
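    // the high lane now holds each byte's high nibble and the low lane its low
    // nibble, so one pshufb_m256 against combine2x128(mask_hi, mask_lo) does
    // both table lookups at once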
    m256 c_shuf = pshufb_m256(mask, c);
    m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf));
    // only 16 bytes were examined: mark the upper 16 bits of z as non-matching
    u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128()));

    return firstMatch(buf, z);
}

static really_inline
const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, const u8 *buf,
                         const u8 *buf_end, const m256 low4bits) {
    // run shufti over two overlapping 16-byte unaligned reads
    const m256 mask = combine2x128(mask_hi, mask_lo);
    m128 chars = loadu128(buf);
    const u8 *rv = fwdBlockShort(mask, chars, buf, low4bits);
    if (rv) {
        return rv;
    }

    chars = loadu128(buf_end - 16);
    rv = fwdBlockShort(mask, chars, buf_end - 16, low4bits);
    if (rv) {
        return rv;
    }
    return buf_end;
}

static really_inline
const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf,
                   const m256 low4bits, const m256 zeroes) {
    u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes);

    return firstMatch(buf, z);
}

/* takes 128 bit masks, but operates on 256 bits of data */
const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                     const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);
    DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf);

    // Slow path for small cases.
    if (buf_end - buf < 16) {
        return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                             buf, buf_end);
    }

    const m256 low4bits = set32x8(0xf);

    if (buf_end - buf <= 32) {
        return shuftiFwdShort(mask_lo, mask_hi, buf, buf_end, low4bits);
    }

    const m256 zeroes = zeroes256();
    const m256 wide_mask_lo = set2x128(mask_lo);
    const m256 wide_mask_hi = set2x128(mask_hi);
    const u8 *rv;

    size_t min = (size_t)buf % 32;
    assert(buf_end - buf >= 32);

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf);
    rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes);
    if (rv) {
        return rv;
    }
    buf += (32 - min);

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.

    const u8 *last_block = buf_end - 32;
    while (buf < last_block) {
        m256 lchars = load256(buf);
        rv = fwdBlock(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits, zeroes);
        if (rv) {
            return rv;
        }
        buf += 32;
    }

    // Use an unaligned load to mop up the last 32 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 32);
    chars = loadu256(buf_end - 32);
    rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, zeroes);
    if (rv) {
        return rv;
    }

    return buf_end;
}

static really_inline
const u8 *lastMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffffffff)) {
        u32 pos = clz32(~z);
        DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
        return buf + (31 - pos);
    } else {
        return NULL; // no match
    }
}

static really_inline
const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf,
                   const m256 low4bits, const m256 zeroes) {
    m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars));
    m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars));
    m256 t = and256(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n");
    DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n");
    DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n");
    DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n");
    DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n");
#endif

    u32 z = movemask256(eq256(t, zeroes));
    return lastMatch(buf, z);
}

static really_inline
const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf,
                        const m256 low4bits) {
    // do the hi and lo shuffles in a single AVX register
    m256 c = combine2x128(rshift64_m128(chars, 4), chars);
    c = and256(c, low4bits);
    m256 c_shuf = pshufb_m256(mask, c);
    m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf));
    // only 16 bytes were examined: mark the upper 16 bits of z as non-matching
    u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128()));

    return lastMatch(buf, z);
}

static really_inline
const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, const u8 *buf,
                         const u8 *buf_end, const m256 low4bits) {
    // run shufti over two overlapping 16-byte unaligned reads
    const m256 mask = combine2x128(mask_hi, mask_lo);

    m128 chars = loadu128(buf_end - 16);
    const u8 *rv = revBlockShort(mask, chars, buf_end - 16, low4bits);
    if (rv) {
        return rv;
    }

    chars = loadu128(buf);
    rv = revBlockShort(mask, chars, buf, low4bits);
    if (rv) {
        return rv;
    }
    return buf - 1;
}


/* takes 128 bit masks, but operates on 256 bits of data */
const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                      const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);

    // Slow path for small cases.
    if (buf_end - buf < 16) {
        return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                             buf, buf_end);
    }

    const m256 low4bits = set32x8(0xf);

    if (buf_end - buf <= 32) {
        return shuftiRevShort(mask_lo, mask_hi, buf, buf_end, low4bits);
    }

    const m256 zeroes = zeroes256();
    const m256 wide_mask_lo = set2x128(mask_lo);
    const m256 wide_mask_hi = set2x128(mask_hi);
    const u8 *rv;

    assert(buf_end - buf >= 32);

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf_end - 32);
    rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, zeroes);
    if (rv) {
        return rv;
    }
    buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0x1f));

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.
    const u8 *last_block = buf + 32;
    while (buf_end > last_block) {
        buf_end -= 32;
        m256 lchars = load256(buf_end);
        rv = revBlock(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits, zeroes);
        if (rv) {
            return rv;
        }
    }

    // Use an unaligned load to mop up the last 32 bytes and get an accurate
    // picture to buf.
    chars = loadu256(buf);
    rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes);
    if (rv) {
        return rv;
    }

    return buf - 1;
}

static really_inline
const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi,
                    m256 chars, const u8 *buf, const m256 low4bits,
                    const m256 ones) {
    DEBUG_PRINTF("buf %p\n", buf);
    m256 chars_lo = GET_LO_4(chars);
    m256 chars_hi = GET_HI_4(chars);
    m256 c_lo = pshufb_m256(mask1_lo, chars_lo);
    m256 c_hi = pshufb_m256(mask1_hi, chars_hi);
    m256 t = or256(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n");
    DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n");
    DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n");
    DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n");
    DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n");
#endif

    m256 c2_lo = pshufb_m256(mask2_lo, chars_lo);
    m256 c2_hi = pshufb_m256(mask2_hi, chars_hi);
    m256 t2 = or256(t, rshift128_m256(or256(c2_lo, c2_hi), 1));

#ifdef DEBUG
    DEBUG_PRINTF(" c2_lo: "); dumpMsk256(c2_lo); printf("\n");
    DEBUG_PRINTF(" c2_hi: "); dumpMsk256(c2_hi); printf("\n");
    DEBUG_PRINTF(" t2: "); dumpMsk256(t2); printf("\n");
#endif
    u32 z = movemask256(eq256(t2, ones));

    return firstMatch(buf, z);
}

static really_inline
const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf,
                         const m256 low4bits) {
    // do the hi and lo shuffles in a single AVX register
    m256 c = combine2x128(rshift64_m128(chars, 4), chars);
    c = and256(c, low4bits);
    m256 c_shuf1 = pshufb_m256(mask1, c);
    m256 c_shuf2 = rshift128_m256(pshufb_m256(mask2, c), 1);
    m256 t0 = or256(c_shuf1, c_shuf2);
    m128 t = or128(movdq_hi(t0), cast256to128(t0));
    // only 16 bytes were examined: mark the upper 16 bits of z as non-matching
    u32 z = 0xffff0000U | movemask128(eq128(t, ones128()));

    return firstMatch(buf, z);
}

static really_inline
const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo,
                            m128 mask2_hi, const u8 *buf, const u8 *buf_end) {
    DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);
    const m256 low4bits = set32x8(0xf);
    // run shufti over two overlapping 16-byte unaligned reads
    const m256 mask1 = combine2x128(mask1_hi, mask1_lo);
    const m256 mask2 = combine2x128(mask2_hi, mask2_lo);
    m128 chars = loadu128(buf);
    const u8 *rv = fwdBlockShort2(mask1, mask2, chars, buf, low4bits);
    if (rv) {
        return rv;
    }

    chars = loadu128(buf_end - 16);
    rv = fwdBlockShort2(mask1, mask2, chars, buf_end - 16, low4bits);
    if (rv) {
        return rv;
    }
    return buf_end;
}

/* takes 128 bit masks, but operates on 256 bits of data */
const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
                           m128 mask2_lo, m128 mask2_hi,
                           const u8 *buf, const u8 *buf_end) {
    /* we should always have at least 16 bytes */
    assert(buf_end - buf >= 16);
    DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);

    if (buf_end - buf < 32) {
        return shuftiDoubleShort(mask1_lo, mask1_hi, mask2_lo, mask2_hi, buf,
                                 buf_end);
    }

    const m256 ones = ones256();
    const m256 low4bits = set32x8(0xf);
    const m256 wide_mask1_lo = set2x128(mask1_lo);
    const m256 wide_mask1_hi = set2x128(mask1_hi);
    const m256 wide_mask2_lo = set2x128(mask2_lo);
    const m256 wide_mask2_hi = set2x128(mask2_hi);
    const u8 *rv;

    size_t min = (size_t)buf % 32;

    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf);
    rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi,
                   chars, buf, low4bits, ones);
    if (rv) {
        return rv;
    }
    buf += (32 - min);

    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.
    const u8 *last_block = buf_end - 32;
    while (buf < last_block) {
        m256 lchars = load256(buf);
        rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi,
                       lchars, buf, low4bits, ones);
        if (rv) {
            return rv;
        }
        buf += 32;
    }

    // Use an unaligned load to mop up the last 32 bytes and get an accurate
    // picture to buf_end.
    chars = loadu256(buf_end - 32);
    rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi,
                   chars, buf_end - 32, low4bits, ones);
    if (rv) {
        return rv;
    }

    return buf_end;
}

#else // defined(HAVE_AVX512)

#ifdef DEBUG
DUMP_MSK(512)
#endif

static really_inline
u64a block(m512 mask_lo, m512 mask_hi, m512 chars, const m512 low4bits,
           const m512 compare) {
    m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits));
    m512 c_hi = pshufb_m512(mask_hi,
                            rshift64_m512(andnot512(low4bits, chars), 4));
    m512 t = and512(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n");
    DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n");
    DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n");
    DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n");
    DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n");
#endif

    return eq512mask(t, compare);
}
static really_inline
const u8 *firstMatch64(const u8 *buf, u64a z) {
    DEBUG_PRINTF("z 0x%016llx\n", z);
    if (unlikely(z != ~0ULL)) {
        u32 pos = ctz64(~z);
        DEBUG_PRINTF("match @ pos %u\n", pos);
        assert(pos < 64);
        return buf + pos;
    } else {
        return NULL; // no match
    }
}

static really_inline
const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf,
                      const m512 low4bits, const m512 zeroes) {
    u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes);

    return firstMatch64(buf, z);
}

static really_inline
const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf,
                         const u8 *buf_end, const m512 low4bits,
                         const m512 zeroes) {
    DEBUG_PRINTF("short shufti %p len %zu\n", buf, buf_end - buf);
    uintptr_t len = buf_end - buf;
    assert(len <= 64);

    // load mask
    u64a k = (~0ULL) >> (64 - len);
    DEBUG_PRINTF("load mask 0x%016llx\n", k);

    m512 chars = loadu_maskz_m512(k, buf);

    u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes);

    // reuse the load mask to indicate valid bytes
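    // (a set bit in z means "no match", so OR-ing in ~k forces every byte
    // outside the valid range to be treated as a non-match)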
    return firstMatch64(buf, z | ~k);
}

/* takes 128 bit masks, but operates on 512 bits of data */
const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                     const u8 *buf_end) {
    assert(buf && buf_end);
    assert(buf < buf_end);
    DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf);
    DEBUG_PRINTF("b %s\n", buf);

    const m512 low4bits = set64x8(0xf);
    const m512 zeroes = zeroes512();
    const m512 wide_mask_lo = set4x128(mask_lo);
    const m512 wide_mask_hi = set4x128(mask_hi);
    const u8 *rv;

    // small cases.
    if (buf_end - buf <= 64) {
        rv = shortShufti512(wide_mask_lo, wide_mask_hi, buf, buf_end, low4bits,
                            zeroes);
        return rv ? rv : buf_end;
    }

    assert(buf_end - buf >= 64);

    // Preconditioning: most of the time our buffer won't be aligned.
    if ((uintptr_t)buf % 64) {
        rv = shortShufti512(wide_mask_lo, wide_mask_hi, buf,
                            ROUNDUP_PTR(buf, 64), low4bits, zeroes);
        if (rv) {
            return rv;
        }
        buf = ROUNDUP_PTR(buf, 64);
    }

    const u8 *last_block = ROUNDDOWN_PTR(buf_end, 64);
    while (buf < last_block) {
        m512 lchars = load512(buf);
        rv = fwdBlock512(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits,
                         zeroes);
        if (rv) {
            return rv;
        }
        buf += 64;
    }

    if (buf == buf_end) {
        goto done;
    }

    // Use an unaligned load to mop up the last 64 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 64);
    m512 chars = loadu512(buf_end - 64);
    rv = fwdBlock512(wide_mask_lo, wide_mask_hi, chars, buf_end - 64, low4bits,
                     zeroes);
    if (rv) {
        return rv;
    }
done:
    return buf_end;
}

static really_inline
const u8 *lastMatch64(const u8 *buf, u64a z) {
    DEBUG_PRINTF("z 0x%016llx\n", z);
    if (unlikely(z != ~0ULL)) {
        u32 pos = clz64(~z);
        DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
        return buf + (63 - pos);
    } else {
        return NULL; // no match
    }
}

static really_inline
const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf,
                          const u8 *buf_end, const m512 low4bits,
                          const m512 zeroes) {
    DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf);
    uintptr_t len = buf_end - buf;
    assert(len <= 64);

    // load mask
    u64a k = (~0ULL) >> (64 - len);
    DEBUG_PRINTF("load mask 0x%016llx\n", k);

    m512 chars = loadu_maskz_m512(k, buf);

    u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes);

    // reuse the load mask to indicate valid bytes
    return lastMatch64(buf, z | ~k);
}

static really_inline
const u8 *revBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf,
                      const m512 low4bits, const m512 zeroes) {
    m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits));
    m512 c_hi = pshufb_m512(mask_hi,
                            rshift64_m512(andnot512(low4bits, chars), 4));
    m512 t = and512(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n");
    DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n");
    DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n");
    DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n");
    DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n");
#endif

    u64a z = eq512mask(t, zeroes);
    return lastMatch64(buf, z);
}

/* takes 128 bit masks, but operates on 512 bits of data */
const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                      const u8 *buf_end) {
    DEBUG_PRINTF("buf %p buf_end %p\n", buf, buf_end);
    assert(buf && buf_end);
    assert(buf < buf_end);

    const m512 low4bits = set64x8(0xf);
    const m512 zeroes = zeroes512();
    const m512 wide_mask_lo = set4x128(mask_lo);
    const m512 wide_mask_hi = set4x128(mask_hi);
    const u8 *rv;

    if (buf_end - buf < 64) {
        rv = rshortShufti512(wide_mask_lo, wide_mask_hi, buf, buf_end, low4bits,
                             zeroes);
        return rv ? rv : buf - 1;
    }

    if (ROUNDDOWN_PTR(buf_end, 64) != buf_end) {
        // peel off unaligned portion
        assert(buf_end - buf >= 64);
        DEBUG_PRINTF("start\n");
        rv = rshortShufti512(wide_mask_lo, wide_mask_hi,
                             ROUNDDOWN_PTR(buf_end, 64), buf_end, low4bits,
                             zeroes);
        if (rv) {
            return rv;
        }
        buf_end = ROUNDDOWN_PTR(buf_end, 64);
    }

    const u8 *last_block = ROUNDUP_PTR(buf, 64);
    while (buf_end > last_block) {
        buf_end -= 64;
        m512 lchars = load512(buf_end);
        rv = revBlock512(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits,
                         zeroes);
        if (rv) {
            return rv;
        }
    }
    if (buf_end == buf) {
        goto done;
    }
    // Use an unaligned load to mop up the last 64 bytes and get an accurate
    // picture to buf.
    m512 chars = loadu512(buf);
    rv = revBlock512(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes);
    if (rv) {
        return rv;
    }
done:
    return buf - 1;
}

static really_inline
const u8 *fwdBlock2(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 mask2_hi,
                    m512 chars, const u8 *buf, const m512 low4bits,
                    const m512 ones, __mmask64 k) {
    DEBUG_PRINTF("buf %p %.64s\n", buf, buf);
    m512 chars_lo = and512(chars, low4bits);
    m512 chars_hi = rshift64_m512(andnot512(low4bits, chars), 4);
    m512 c_lo = maskz_pshufb_m512(k, mask1_lo, chars_lo);
    m512 c_hi = maskz_pshufb_m512(k, mask1_hi, chars_hi);
    m512 t = or512(c_lo, c_hi);

#ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n");
    DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n");
    DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n");
    DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n");
    DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n");
#endif

    m512 c2_lo = maskz_pshufb_m512(k, mask2_lo, chars_lo);
    m512 c2_hi = maskz_pshufb_m512(k, mask2_hi, chars_hi);
    m512 t2 = or512(t, rshift128_m512(or512(c2_lo, c2_hi), 1));

#ifdef DEBUG
    DEBUG_PRINTF(" c2_lo: "); dumpMsk512(c2_lo); printf("\n");
    DEBUG_PRINTF(" c2_hi: "); dumpMsk512(c2_hi); printf("\n");
    DEBUG_PRINTF(" t2: "); dumpMsk512(t2); printf("\n");
#endif
    u64a z = eq512mask(t2, ones);

    return firstMatch64(buf, z | ~k);
}

static really_inline
const u8 *shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo,
                               m512 mask2_hi, const u8 *buf, const u8 *buf_end,
                               const m512 low4bits, const m512 ones) {
    DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf);
    uintptr_t len = buf_end - buf;
    assert(len <= 64);

    u64a k = (~0ULL) >> (64 - len);
    DEBUG_PRINTF("load mask 0x%016llx\n", k);

    m512 chars = loadu_mask_m512(ones, k, buf);

    const u8 *rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, buf,
                             low4bits, ones, k);

    return rv;
}

/* takes 128 bit masks, but operates on 512 bits of data */
const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
                           m128 mask2_lo, m128 mask2_hi,
                           const u8 *buf, const u8 *buf_end) {
    /* we should always have at least 16 bytes */
    assert(buf_end - buf >= 16);
    DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);

    const m512 ones = ones512();
    const m512 low4bits = set64x8(0xf);
    const m512 wide_mask1_lo = set4x128(mask1_lo);
    const m512 wide_mask1_hi = set4x128(mask1_hi);
    const m512 wide_mask2_lo = set4x128(mask2_lo);
    const m512 wide_mask2_hi = set4x128(mask2_hi);
    const u8 *rv;

    if (buf_end - buf <= 64) {
        rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo,
                                  wide_mask2_hi, buf, buf_end, low4bits, ones);
        DEBUG_PRINTF("rv %p\n", rv);
        return rv ? rv : buf_end;
    }

    // Preconditioning: most of the time our buffer won't be aligned.
    if ((uintptr_t)buf % 64) {
        rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo,
                                  wide_mask2_hi, buf, ROUNDUP_PTR(buf, 64),
                                  low4bits, ones);
        if (rv) {
            return rv;
        }

        buf = ROUNDUP_PTR(buf, 64);
    }

    const u8 *last_block = buf_end - 64;
    while (buf < last_block) {
        m512 lchars = load512(buf);
        rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo,
                       wide_mask2_hi, lchars, buf, low4bits, ones, ~0);
        if (rv) {
            return rv;
        }
        buf += 64;
    }

    // Use an unaligned load to mop up the last 64 bytes and get an accurate
    // picture to buf_end.
    m512 chars = loadu512(buf_end - 64);
    rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi,
                   chars, buf_end - 64, low4bits, ones, ~0);
    if (rv) {
        return rv;
    }

    return buf_end;
}
#endif
