1 | /* |
2 | * Copyright (c) 2015-2017, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /** \file |
30 | * \brief SIMD types and primitive operations. |
31 | */ |
32 | |
33 | #ifndef SIMD_UTILS |
34 | #define SIMD_UTILS |
35 | |
36 | #if !defined(_WIN32) && !defined(__SSSE3__) |
37 | #error SSSE3 instructions must be enabled |
38 | #endif |
39 | |
40 | #include "config.h" |
41 | #include "ue2common.h" |
42 | #include "simd_types.h" |
43 | #include "unaligned.h" |
44 | #include "util/arch.h" |
45 | #include "util/intrinsics.h" |
46 | |
47 | #include <string.h> // for memcpy |
48 | |
49 | // Define a common assume_aligned using an appropriate compiler built-in, if |
50 | // it's available. Note that we need to handle C or C++ compilation. |
51 | #ifdef __cplusplus |
52 | # ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED |
53 | # define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) |
54 | # endif |
55 | #else |
56 | # ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED |
57 | # define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) |
58 | # endif |
59 | #endif |
60 | |
61 | // Fallback to identity case. |
62 | #ifndef assume_aligned |
63 | #define assume_aligned(x, y) (x) |
64 | #endif |
65 | |
66 | #ifdef __cplusplus |
67 | extern "C" { |
68 | #endif |
69 | extern const char vbs_mask_data[]; |
70 | #ifdef __cplusplus |
71 | } |
72 | #endif |
73 | |
74 | static really_inline m128 ones128(void) { |
75 | #if defined(__GNUC__) || defined(__INTEL_COMPILER) |
76 | /* gcc gets this right */ |
77 | return _mm_set1_epi8(0xFF); |
78 | #else |
79 | /* trick from Intel's optimization guide to generate all-ones. |
80 | * ICC converts this to the single cmpeq instruction */ |
81 | return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); |
82 | #endif |
83 | } |
84 | |
85 | static really_inline m128 zeroes128(void) { |
86 | return _mm_setzero_si128(); |
87 | } |
88 | |
/** \brief Bitwise not for m128. */
90 | static really_inline m128 not128(m128 a) { |
91 | return _mm_xor_si128(a, ones128()); |
92 | } |
93 | |
/** \brief Return non-zero if a and b differ, otherwise 0. */
95 | static really_inline int diff128(m128 a, m128 b) { |
96 | return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff); |
97 | } |
98 | |
99 | static really_inline int isnonzero128(m128 a) { |
100 | return !!diff128(a, zeroes128()); |
101 | } |
102 | |
103 | /** |
104 | * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit |
105 | * mask indicating which 32-bit words contain differences. |
106 | */ |
107 | static really_inline u32 diffrich128(m128 a, m128 b) { |
108 | a = _mm_cmpeq_epi32(a, b); |
109 | return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf; |
110 | } |
111 | |
112 | /** |
113 | * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and |
114 | * returns a 4-bit mask indicating which 64-bit words contain differences. |
115 | */ |
116 | static really_inline u32 diffrich64_128(m128 a, m128 b) { |
117 | #if defined(HAVE_SSE41) |
118 | a = _mm_cmpeq_epi64(a, b); |
119 | return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; |
120 | #else |
121 | u32 d = diffrich128(a, b); |
122 | return (d | (d >> 1)) & 0x5; |
123 | #endif |
124 | } |
125 | |
126 | static really_really_inline |
127 | m128 lshift64_m128(m128 a, unsigned b) { |
128 | #if defined(HAVE__BUILTIN_CONSTANT_P) |
129 | if (__builtin_constant_p(b)) { |
130 | return _mm_slli_epi64(a, b); |
131 | } |
132 | #endif |
133 | m128 x = _mm_cvtsi32_si128(b); |
134 | return _mm_sll_epi64(a, x); |
135 | } |
136 | |
137 | #define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) |
138 | #define eq128(a, b) _mm_cmpeq_epi8((a), (b)) |
139 | #define movemask128(a) ((u32)_mm_movemask_epi8((a))) |
140 | |
141 | static really_inline m128 set16x8(u8 c) { |
142 | return _mm_set1_epi8(c); |
143 | } |
144 | |
145 | static really_inline m128 set4x32(u32 c) { |
146 | return _mm_set1_epi32(c); |
147 | } |
148 | |
149 | static really_inline u32 movd(const m128 in) { |
150 | return _mm_cvtsi128_si32(in); |
151 | } |
152 | |
153 | static really_inline u64a movq(const m128 in) { |
154 | #if defined(ARCH_X86_64) |
155 | return _mm_cvtsi128_si64(in); |
156 | #else // 32-bit - this is horrific |
157 | u32 lo = movd(in); |
158 | u32 hi = movd(_mm_srli_epi64(in, 32)); |
159 | return (u64a)hi << 32 | lo; |
160 | #endif |
161 | } |
162 | |
163 | /* another form of movq */ |
164 | static really_inline |
165 | m128 load_m128_from_u64a(const u64a *p) { |
166 | return _mm_set_epi64x(0LL, *p); |
167 | } |
168 | |
169 | #define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) |
170 | #define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) |
171 | |
172 | #if defined(HAVE_SSE41) |
#define extract32from128(a, imm) _mm_extract_epi32(a, imm)
#define extract64from128(a, imm) _mm_extract_epi64(a, imm)
175 | #else |
176 | #define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2)) |
177 | #define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3)) |
178 | #endif |
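
/*
 * Note: the pre-SSE4.1 fallbacks first shift the requested element down to
 * the low lane; imm << 2 and imm << 3 convert a 32-bit or 64-bit element
 * index into the byte count expected by _mm_srli_si128.
 */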
179 | |
180 | #if !defined(HAVE_AVX2) |
181 | // TODO: this entire file needs restructuring - this carveout is awful |
#define extractlow64from256(a) movq(a.lo)
#define extractlow32from256(a) movd(a.lo)
184 | #if defined(HAVE_SSE41) |
#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4)
#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2)
187 | #else |
188 | #define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4)) |
189 | #define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8)) |
190 | #endif |
191 | |
192 | #endif // !AVX2 |
193 | |
194 | static really_inline m128 and128(m128 a, m128 b) { |
195 | return _mm_and_si128(a,b); |
196 | } |
197 | |
198 | static really_inline m128 xor128(m128 a, m128 b) { |
199 | return _mm_xor_si128(a,b); |
200 | } |
201 | |
202 | static really_inline m128 or128(m128 a, m128 b) { |
203 | return _mm_or_si128(a,b); |
204 | } |
205 | |
206 | static really_inline m128 andnot128(m128 a, m128 b) { |
207 | return _mm_andnot_si128(a, b); |
208 | } |
209 | |
210 | // aligned load |
211 | static really_inline m128 load128(const void *ptr) { |
212 | assert(ISALIGNED_N(ptr, alignof(m128))); |
213 | ptr = assume_aligned(ptr, 16); |
214 | return _mm_load_si128((const m128 *)ptr); |
215 | } |
216 | |
217 | // aligned store |
218 | static really_inline void store128(void *ptr, m128 a) { |
219 | assert(ISALIGNED_N(ptr, alignof(m128))); |
220 | ptr = assume_aligned(ptr, 16); |
221 | *(m128 *)ptr = a; |
222 | } |
223 | |
224 | // unaligned load |
225 | static really_inline m128 loadu128(const void *ptr) { |
226 | return _mm_loadu_si128((const m128 *)ptr); |
227 | } |
228 | |
229 | // unaligned store |
230 | static really_inline void storeu128(void *ptr, m128 a) { |
    _mm_storeu_si128((m128 *)ptr, a);
232 | } |
233 | |
234 | // packed unaligned store of first N bytes |
235 | static really_inline |
236 | void storebytes128(void *ptr, m128 a, unsigned int n) { |
237 | assert(n <= sizeof(a)); |
238 | memcpy(ptr, &a, n); |
239 | } |
240 | |
241 | // packed unaligned load of first N bytes, pad with zero |
242 | static really_inline |
243 | m128 loadbytes128(const void *ptr, unsigned int n) { |
244 | m128 a = zeroes128(); |
245 | assert(n <= sizeof(a)); |
246 | memcpy(&a, ptr, n); |
247 | return a; |
248 | } |
249 | |
250 | #ifdef __cplusplus |
251 | extern "C" { |
252 | #endif |
253 | extern const u8 simd_onebit_masks[]; |
254 | #ifdef __cplusplus |
255 | } |
256 | #endif |
257 | |
258 | static really_inline |
259 | m128 mask1bit128(unsigned int n) { |
260 | assert(n < sizeof(m128) * 8); |
261 | u32 mask_idx = ((n % 8) * 64) + 95; |
262 | mask_idx -= n / 8; |
263 | return loadu128(&simd_onebit_masks[mask_idx]); |
264 | } |
265 | |
266 | // switches on bit N in the given vector. |
267 | static really_inline |
268 | void setbit128(m128 *ptr, unsigned int n) { |
269 | *ptr = or128(mask1bit128(n), *ptr); |
270 | } |
271 | |
272 | // switches off bit N in the given vector. |
273 | static really_inline |
274 | void clearbit128(m128 *ptr, unsigned int n) { |
275 | *ptr = andnot128(mask1bit128(n), *ptr); |
276 | } |
277 | |
278 | // tests bit N in the given vector. |
279 | static really_inline |
280 | char testbit128(m128 val, unsigned int n) { |
281 | const m128 mask = mask1bit128(n); |
282 | #if defined(HAVE_SSE41) |
283 | return !_mm_testz_si128(mask, val); |
284 | #else |
285 | return isnonzero128(and128(mask, val)); |
286 | #endif |
287 | } |
288 | |
289 | // offset must be an immediate |
290 | #define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) |
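
/*
 * For illustration, palignr(r, l, 3) concatenates l (low half) and r (high
 * half) into a 32-byte value and extracts bytes 3..18, i.e. the result
 * holds the top 13 bytes of l followed by the low 3 bytes of r.
 */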
291 | |
292 | static really_inline |
293 | m128 pshufb_m128(m128 a, m128 b) { |
294 | m128 result; |
295 | result = _mm_shuffle_epi8(a, b); |
296 | return result; |
297 | } |
298 | |
299 | static really_inline |
300 | m256 pshufb_m256(m256 a, m256 b) { |
301 | #if defined(HAVE_AVX2) |
302 | return _mm256_shuffle_epi8(a, b); |
303 | #else |
304 | m256 rv; |
305 | rv.lo = pshufb_m128(a.lo, b.lo); |
306 | rv.hi = pshufb_m128(a.hi, b.hi); |
307 | return rv; |
308 | #endif |
309 | } |
310 | |
311 | #if defined(HAVE_AVX512) |
312 | static really_inline |
313 | m512 pshufb_m512(m512 a, m512 b) { |
314 | return _mm512_shuffle_epi8(a, b); |
315 | } |
316 | |
317 | static really_inline |
318 | m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { |
319 | return _mm512_maskz_shuffle_epi8(k, a, b); |
320 | } |
321 | #endif |
322 | |
323 | static really_inline |
324 | m128 variable_byte_shift_m128(m128 in, s32 amount) { |
325 | assert(amount >= -16 && amount <= 16); |
326 | m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); |
327 | return pshufb_m128(in, shift_mask); |
328 | } |
329 | |
330 | static really_inline |
331 | m128 max_u8_m128(m128 a, m128 b) { |
332 | return _mm_max_epu8(a, b); |
333 | } |
334 | |
335 | static really_inline |
336 | m128 min_u8_m128(m128 a, m128 b) { |
337 | return _mm_min_epu8(a, b); |
338 | } |
339 | |
340 | static really_inline |
341 | m128 sadd_u8_m128(m128 a, m128 b) { |
342 | return _mm_adds_epu8(a, b); |
343 | } |
344 | |
345 | static really_inline |
346 | m128 sub_u8_m128(m128 a, m128 b) { |
347 | return _mm_sub_epi8(a, b); |
348 | } |
349 | |
350 | static really_inline |
351 | m128 set64x2(u64a hi, u64a lo) { |
352 | return _mm_set_epi64x(hi, lo); |
353 | } |
354 | |
355 | /**** |
356 | **** 256-bit Primitives |
357 | ****/ |
358 | |
359 | #if defined(HAVE_AVX2) |
360 | |
361 | static really_really_inline |
362 | m256 lshift64_m256(m256 a, unsigned b) { |
363 | #if defined(HAVE__BUILTIN_CONSTANT_P) |
364 | if (__builtin_constant_p(b)) { |
365 | return _mm256_slli_epi64(a, b); |
366 | } |
367 | #endif |
368 | m128 x = _mm_cvtsi32_si128(b); |
369 | return _mm256_sll_epi64(a, x); |
370 | } |
371 | |
372 | #define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) |
373 | |
374 | static really_inline |
375 | m256 set32x8(u32 in) { |
376 | return _mm256_set1_epi8(in); |
377 | } |
378 | |
379 | #define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) |
380 | #define movemask256(a) ((u32)_mm256_movemask_epi8((a))) |
381 | |
382 | static really_inline |
383 | m256 set2x128(m128 a) { |
384 | return _mm256_broadcastsi128_si256(a); |
385 | } |
386 | |
387 | #else |
388 | |
389 | static really_really_inline |
390 | m256 lshift64_m256(m256 a, int b) { |
391 | m256 rv = a; |
392 | rv.lo = lshift64_m128(rv.lo, b); |
393 | rv.hi = lshift64_m128(rv.hi, b); |
394 | return rv; |
395 | } |
396 | |
397 | static really_inline |
398 | m256 rshift64_m256(m256 a, int b) { |
399 | m256 rv = a; |
400 | rv.lo = rshift64_m128(rv.lo, b); |
401 | rv.hi = rshift64_m128(rv.hi, b); |
402 | return rv; |
403 | } |
404 | static really_inline |
405 | m256 set32x8(u32 in) { |
406 | m256 rv; |
407 | rv.lo = set16x8((u8) in); |
408 | rv.hi = rv.lo; |
409 | return rv; |
410 | } |
411 | |
412 | static really_inline |
413 | m256 eq256(m256 a, m256 b) { |
414 | m256 rv; |
415 | rv.lo = eq128(a.lo, b.lo); |
416 | rv.hi = eq128(a.hi, b.hi); |
417 | return rv; |
418 | } |
419 | |
420 | static really_inline |
421 | u32 movemask256(m256 a) { |
422 | u32 lo_mask = movemask128(a.lo); |
423 | u32 hi_mask = movemask128(a.hi); |
424 | return lo_mask | (hi_mask << 16); |
425 | } |
426 | |
427 | static really_inline |
428 | m256 set2x128(m128 a) { |
429 | m256 rv = {a, a}; |
430 | return rv; |
431 | } |
432 | #endif |
433 | |
434 | static really_inline m256 zeroes256(void) { |
435 | #if defined(HAVE_AVX2) |
436 | return _mm256_setzero_si256(); |
437 | #else |
438 | m256 rv = {zeroes128(), zeroes128()}; |
439 | return rv; |
440 | #endif |
441 | } |
442 | |
443 | static really_inline m256 ones256(void) { |
444 | #if defined(HAVE_AVX2) |
445 | m256 rv = _mm256_set1_epi8(0xFF); |
446 | #else |
447 | m256 rv = {ones128(), ones128()}; |
448 | #endif |
449 | return rv; |
450 | } |
451 | |
452 | #if defined(HAVE_AVX2) |
453 | static really_inline m256 and256(m256 a, m256 b) { |
454 | return _mm256_and_si256(a, b); |
455 | } |
456 | #else |
457 | static really_inline m256 and256(m256 a, m256 b) { |
458 | m256 rv; |
459 | rv.lo = and128(a.lo, b.lo); |
460 | rv.hi = and128(a.hi, b.hi); |
461 | return rv; |
462 | } |
463 | #endif |
464 | |
465 | #if defined(HAVE_AVX2) |
466 | static really_inline m256 or256(m256 a, m256 b) { |
467 | return _mm256_or_si256(a, b); |
468 | } |
469 | #else |
470 | static really_inline m256 or256(m256 a, m256 b) { |
471 | m256 rv; |
472 | rv.lo = or128(a.lo, b.lo); |
473 | rv.hi = or128(a.hi, b.hi); |
474 | return rv; |
475 | } |
476 | #endif |
477 | |
478 | #if defined(HAVE_AVX2) |
479 | static really_inline m256 xor256(m256 a, m256 b) { |
480 | return _mm256_xor_si256(a, b); |
481 | } |
482 | #else |
483 | static really_inline m256 xor256(m256 a, m256 b) { |
484 | m256 rv; |
485 | rv.lo = xor128(a.lo, b.lo); |
486 | rv.hi = xor128(a.hi, b.hi); |
487 | return rv; |
488 | } |
489 | #endif |
490 | |
491 | #if defined(HAVE_AVX2) |
492 | static really_inline m256 not256(m256 a) { |
493 | return _mm256_xor_si256(a, ones256()); |
494 | } |
495 | #else |
496 | static really_inline m256 not256(m256 a) { |
497 | m256 rv; |
498 | rv.lo = not128(a.lo); |
499 | rv.hi = not128(a.hi); |
500 | return rv; |
501 | } |
502 | #endif |
503 | |
504 | #if defined(HAVE_AVX2) |
505 | static really_inline m256 andnot256(m256 a, m256 b) { |
506 | return _mm256_andnot_si256(a, b); |
507 | } |
508 | #else |
509 | static really_inline m256 andnot256(m256 a, m256 b) { |
510 | m256 rv; |
511 | rv.lo = andnot128(a.lo, b.lo); |
512 | rv.hi = andnot128(a.hi, b.hi); |
513 | return rv; |
514 | } |
515 | #endif |
516 | |
517 | static really_inline int diff256(m256 a, m256 b) { |
518 | #if defined(HAVE_AVX2) |
519 | return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); |
520 | #else |
521 | return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); |
522 | #endif |
523 | } |
524 | |
525 | static really_inline int isnonzero256(m256 a) { |
526 | #if defined(HAVE_AVX2) |
527 | return !!diff256(a, zeroes256()); |
528 | #else |
529 | return isnonzero128(or128(a.lo, a.hi)); |
530 | #endif |
531 | } |
532 | |
533 | /** |
534 | * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit |
535 | * mask indicating which 32-bit words contain differences. |
536 | */ |
537 | static really_inline u32 diffrich256(m256 a, m256 b) { |
538 | #if defined(HAVE_AVX2) |
539 | a = _mm256_cmpeq_epi32(a, b); |
540 | return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; |
541 | #else |
542 | m128 z = zeroes128(); |
543 | a.lo = _mm_cmpeq_epi32(a.lo, b.lo); |
544 | a.hi = _mm_cmpeq_epi32(a.hi, b.hi); |
545 | m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); |
546 | return ~(_mm_movemask_epi8(packed)) & 0xff; |
547 | #endif |
548 | } |
549 | |
550 | /** |
551 | * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and |
552 | * returns an 8-bit mask indicating which 64-bit words contain differences. |
553 | */ |
554 | static really_inline u32 diffrich64_256(m256 a, m256 b) { |
555 | u32 d = diffrich256(a, b); |
556 | return (d | (d >> 1)) & 0x55555555; |
557 | } |
558 | |
559 | // aligned load |
560 | static really_inline m256 load256(const void *ptr) { |
561 | assert(ISALIGNED_N(ptr, alignof(m256))); |
562 | #if defined(HAVE_AVX2) |
563 | return _mm256_load_si256((const m256 *)ptr); |
564 | #else |
565 | m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; |
566 | return rv; |
567 | #endif |
568 | } |
569 | |
// aligned load of a 128-bit value into both the low and high halves of a 256-bit value
571 | static really_inline m256 load2x128(const void *ptr) { |
572 | #if defined(HAVE_AVX2) |
573 | return set2x128(load128(ptr)); |
574 | #else |
575 | assert(ISALIGNED_N(ptr, alignof(m128))); |
576 | m256 rv; |
577 | rv.hi = rv.lo = load128(ptr); |
578 | return rv; |
579 | #endif |
580 | } |
581 | |
582 | static really_inline m256 loadu2x128(const void *ptr) { |
583 | return set2x128(loadu128(ptr)); |
584 | } |
585 | |
586 | // aligned store |
587 | static really_inline void store256(void *ptr, m256 a) { |
588 | assert(ISALIGNED_N(ptr, alignof(m256))); |
589 | #if defined(HAVE_AVX2) |
590 | _mm256_store_si256((m256 *)ptr, a); |
591 | #else |
592 | ptr = assume_aligned(ptr, 16); |
593 | *(m256 *)ptr = a; |
594 | #endif |
595 | } |
596 | |
597 | // unaligned load |
598 | static really_inline m256 loadu256(const void *ptr) { |
599 | #if defined(HAVE_AVX2) |
600 | return _mm256_loadu_si256((const m256 *)ptr); |
601 | #else |
602 | m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; |
603 | return rv; |
604 | #endif |
605 | } |
606 | |
607 | // unaligned store |
608 | static really_inline void storeu256(void *ptr, m256 a) { |
609 | #if defined(HAVE_AVX2) |
610 | _mm256_storeu_si256((m256 *)ptr, a); |
611 | #else |
612 | storeu128(ptr, a.lo); |
613 | storeu128((char *)ptr + 16, a.hi); |
614 | #endif |
615 | } |
616 | |
617 | // packed unaligned store of first N bytes |
618 | static really_inline |
619 | void storebytes256(void *ptr, m256 a, unsigned int n) { |
620 | assert(n <= sizeof(a)); |
621 | memcpy(ptr, &a, n); |
622 | } |
623 | |
624 | // packed unaligned load of first N bytes, pad with zero |
625 | static really_inline |
626 | m256 loadbytes256(const void *ptr, unsigned int n) { |
627 | m256 a = zeroes256(); |
628 | assert(n <= sizeof(a)); |
629 | memcpy(&a, ptr, n); |
630 | return a; |
631 | } |
632 | |
633 | static really_inline |
634 | m256 mask1bit256(unsigned int n) { |
635 | assert(n < sizeof(m256) * 8); |
636 | u32 mask_idx = ((n % 8) * 64) + 95; |
637 | mask_idx -= n / 8; |
638 | return loadu256(&simd_onebit_masks[mask_idx]); |
639 | } |
640 | |
641 | static really_inline |
642 | m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { |
643 | #if defined(HAVE_AVX2) |
644 | return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); |
645 | #else |
646 | m256 rv; |
647 | rv.hi = set64x2(hi_1, hi_0); |
648 | rv.lo = set64x2(lo_1, lo_0); |
649 | return rv; |
650 | #endif |
651 | } |
652 | |
653 | #if !defined(HAVE_AVX2) |
654 | // switches on bit N in the given vector. |
655 | static really_inline |
656 | void setbit256(m256 *ptr, unsigned int n) { |
657 | assert(n < sizeof(*ptr) * 8); |
658 | m128 *sub; |
659 | if (n < 128) { |
660 | sub = &ptr->lo; |
661 | } else { |
662 | sub = &ptr->hi; |
663 | n -= 128; |
664 | } |
665 | setbit128(sub, n); |
666 | } |
667 | |
668 | // switches off bit N in the given vector. |
669 | static really_inline |
670 | void clearbit256(m256 *ptr, unsigned int n) { |
671 | assert(n < sizeof(*ptr) * 8); |
672 | m128 *sub; |
673 | if (n < 128) { |
674 | sub = &ptr->lo; |
675 | } else { |
676 | sub = &ptr->hi; |
677 | n -= 128; |
678 | } |
679 | clearbit128(sub, n); |
680 | } |
681 | |
682 | // tests bit N in the given vector. |
683 | static really_inline |
684 | char testbit256(m256 val, unsigned int n) { |
685 | assert(n < sizeof(val) * 8); |
686 | m128 sub; |
687 | if (n < 128) { |
688 | sub = val.lo; |
689 | } else { |
690 | sub = val.hi; |
691 | n -= 128; |
692 | } |
693 | return testbit128(sub, n); |
694 | } |
695 | |
696 | static really_really_inline |
697 | m128 movdq_hi(m256 x) { |
698 | return x.hi; |
699 | } |
700 | |
701 | static really_really_inline |
702 | m128 movdq_lo(m256 x) { |
703 | return x.lo; |
704 | } |
705 | |
706 | static really_inline |
707 | m256 combine2x128(m128 hi, m128 lo) { |
708 | m256 rv = {lo, hi}; |
709 | return rv; |
710 | } |
711 | |
712 | #else // AVX2 |
713 | |
714 | // switches on bit N in the given vector. |
715 | static really_inline |
716 | void setbit256(m256 *ptr, unsigned int n) { |
717 | *ptr = or256(mask1bit256(n), *ptr); |
718 | } |
719 | |
720 | static really_inline |
721 | void clearbit256(m256 *ptr, unsigned int n) { |
722 | *ptr = andnot256(mask1bit256(n), *ptr); |
723 | } |
724 | |
725 | // tests bit N in the given vector. |
726 | static really_inline |
727 | char testbit256(m256 val, unsigned int n) { |
728 | const m256 mask = mask1bit256(n); |
729 | return !_mm256_testz_si256(mask, val); |
730 | } |
731 | |
732 | static really_really_inline |
733 | m128 movdq_hi(m256 x) { |
734 | return _mm256_extracti128_si256(x, 1); |
735 | } |
736 | |
737 | static really_really_inline |
738 | m128 movdq_lo(m256 x) { |
739 | return _mm256_extracti128_si256(x, 0); |
740 | } |
741 | |
742 | #define cast256to128(a) _mm256_castsi256_si128(a) |
743 | #define cast128to256(a) _mm256_castsi128_si256(a) |
744 | #define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) |
745 | #define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) |
746 | #define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) |
747 | #define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) |
748 | #define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) |
749 | #define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) |
750 | #define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) |
751 | #define extractlow32from256(a) movd(cast256to128(a)) |
752 | #define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) |
753 | #define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) |
754 | #define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) |
755 | |
756 | static really_inline |
757 | m256 combine2x128(m128 hi, m128 lo) { |
758 | #if defined(_mm256_set_m128i) |
759 | return _mm256_set_m128i(hi, lo); |
760 | #else |
761 | return insert128to256(cast128to256(lo), hi, 1); |
762 | #endif |
763 | } |
764 | #endif //AVX2 |
765 | |
766 | #if defined(HAVE_AVX512) |
767 | #define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) |
768 | #define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) |
769 | #define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) |
770 | #define set2x256(a) _mm512_broadcast_i64x4(a) |
771 | #define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) |
772 | #define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) |
773 | #endif |
774 | |
775 | /**** |
776 | **** 384-bit Primitives |
777 | ****/ |
778 | |
779 | static really_inline m384 and384(m384 a, m384 b) { |
780 | m384 rv; |
781 | rv.lo = and128(a.lo, b.lo); |
782 | rv.mid = and128(a.mid, b.mid); |
783 | rv.hi = and128(a.hi, b.hi); |
784 | return rv; |
785 | } |
786 | |
787 | static really_inline m384 or384(m384 a, m384 b) { |
788 | m384 rv; |
789 | rv.lo = or128(a.lo, b.lo); |
790 | rv.mid = or128(a.mid, b.mid); |
791 | rv.hi = or128(a.hi, b.hi); |
792 | return rv; |
793 | } |
794 | |
795 | static really_inline m384 xor384(m384 a, m384 b) { |
796 | m384 rv; |
797 | rv.lo = xor128(a.lo, b.lo); |
798 | rv.mid = xor128(a.mid, b.mid); |
799 | rv.hi = xor128(a.hi, b.hi); |
800 | return rv; |
801 | } |
802 | static really_inline m384 not384(m384 a) { |
803 | m384 rv; |
804 | rv.lo = not128(a.lo); |
805 | rv.mid = not128(a.mid); |
806 | rv.hi = not128(a.hi); |
807 | return rv; |
808 | } |
809 | static really_inline m384 andnot384(m384 a, m384 b) { |
810 | m384 rv; |
811 | rv.lo = andnot128(a.lo, b.lo); |
812 | rv.mid = andnot128(a.mid, b.mid); |
813 | rv.hi = andnot128(a.hi, b.hi); |
814 | return rv; |
815 | } |
816 | |
817 | static really_really_inline |
818 | m384 lshift64_m384(m384 a, unsigned b) { |
819 | m384 rv; |
820 | rv.lo = lshift64_m128(a.lo, b); |
821 | rv.mid = lshift64_m128(a.mid, b); |
822 | rv.hi = lshift64_m128(a.hi, b); |
823 | return rv; |
824 | } |
825 | |
826 | static really_inline m384 zeroes384(void) { |
827 | m384 rv = {zeroes128(), zeroes128(), zeroes128()}; |
828 | return rv; |
829 | } |
830 | |
831 | static really_inline m384 ones384(void) { |
832 | m384 rv = {ones128(), ones128(), ones128()}; |
833 | return rv; |
834 | } |
835 | |
836 | static really_inline int diff384(m384 a, m384 b) { |
837 | return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi); |
838 | } |
839 | |
840 | static really_inline int isnonzero384(m384 a) { |
841 | return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); |
842 | } |
843 | |
844 | /** |
845 | * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit |
846 | * mask indicating which 32-bit words contain differences. |
847 | */ |
848 | static really_inline u32 diffrich384(m384 a, m384 b) { |
849 | m128 z = zeroes128(); |
850 | a.lo = _mm_cmpeq_epi32(a.lo, b.lo); |
851 | a.mid = _mm_cmpeq_epi32(a.mid, b.mid); |
852 | a.hi = _mm_cmpeq_epi32(a.hi, b.hi); |
853 | m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid), |
854 | _mm_packs_epi32(a.hi, z)); |
855 | return ~(_mm_movemask_epi8(packed)) & 0xfff; |
856 | } |
857 | |
858 | /** |
859 | * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and |
860 | * returns a 12-bit mask indicating which 64-bit words contain differences. |
861 | */ |
862 | static really_inline u32 diffrich64_384(m384 a, m384 b) { |
863 | u32 d = diffrich384(a, b); |
864 | return (d | (d >> 1)) & 0x55555555; |
865 | } |
866 | |
867 | // aligned load |
868 | static really_inline m384 load384(const void *ptr) { |
869 | assert(ISALIGNED_16(ptr)); |
870 | m384 rv = { load128(ptr), load128((const char *)ptr + 16), |
871 | load128((const char *)ptr + 32) }; |
872 | return rv; |
873 | } |
874 | |
875 | // aligned store |
876 | static really_inline void store384(void *ptr, m384 a) { |
877 | assert(ISALIGNED_16(ptr)); |
878 | ptr = assume_aligned(ptr, 16); |
879 | *(m384 *)ptr = a; |
880 | } |
881 | |
882 | // unaligned load |
883 | static really_inline m384 loadu384(const void *ptr) { |
884 | m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), |
885 | loadu128((const char *)ptr + 32)}; |
886 | return rv; |
887 | } |
888 | |
889 | // packed unaligned store of first N bytes |
890 | static really_inline |
891 | void storebytes384(void *ptr, m384 a, unsigned int n) { |
892 | assert(n <= sizeof(a)); |
893 | memcpy(ptr, &a, n); |
894 | } |
895 | |
896 | // packed unaligned load of first N bytes, pad with zero |
897 | static really_inline |
898 | m384 loadbytes384(const void *ptr, unsigned int n) { |
899 | m384 a = zeroes384(); |
900 | assert(n <= sizeof(a)); |
901 | memcpy(&a, ptr, n); |
902 | return a; |
903 | } |
904 | |
905 | // switches on bit N in the given vector. |
906 | static really_inline |
907 | void setbit384(m384 *ptr, unsigned int n) { |
908 | assert(n < sizeof(*ptr) * 8); |
909 | m128 *sub; |
910 | if (n < 128) { |
911 | sub = &ptr->lo; |
912 | } else if (n < 256) { |
913 | sub = &ptr->mid; |
914 | } else { |
915 | sub = &ptr->hi; |
916 | } |
917 | setbit128(sub, n % 128); |
918 | } |
919 | |
920 | // switches off bit N in the given vector. |
921 | static really_inline |
922 | void clearbit384(m384 *ptr, unsigned int n) { |
923 | assert(n < sizeof(*ptr) * 8); |
924 | m128 *sub; |
925 | if (n < 128) { |
926 | sub = &ptr->lo; |
927 | } else if (n < 256) { |
928 | sub = &ptr->mid; |
929 | } else { |
930 | sub = &ptr->hi; |
931 | } |
932 | clearbit128(sub, n % 128); |
933 | } |
934 | |
935 | // tests bit N in the given vector. |
936 | static really_inline |
937 | char testbit384(m384 val, unsigned int n) { |
938 | assert(n < sizeof(val) * 8); |
939 | m128 sub; |
940 | if (n < 128) { |
941 | sub = val.lo; |
942 | } else if (n < 256) { |
943 | sub = val.mid; |
944 | } else { |
945 | sub = val.hi; |
946 | } |
947 | return testbit128(sub, n % 128); |
948 | } |
949 | |
950 | /**** |
951 | **** 512-bit Primitives |
952 | ****/ |
953 | |
954 | #define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) |
955 | #define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) |
956 | |
957 | static really_inline |
958 | m512 zeroes512(void) { |
959 | #if defined(HAVE_AVX512) |
960 | return _mm512_setzero_si512(); |
961 | #else |
962 | m512 rv = {zeroes256(), zeroes256()}; |
963 | return rv; |
964 | #endif |
965 | } |
966 | |
967 | static really_inline |
968 | m512 ones512(void) { |
969 | #if defined(HAVE_AVX512) |
970 | return _mm512_set1_epi8(0xFF); |
971 | //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); |
972 | #else |
973 | m512 rv = {ones256(), ones256()}; |
974 | return rv; |
975 | #endif |
976 | } |
977 | |
978 | #if defined(HAVE_AVX512) |
979 | static really_inline |
980 | m512 set64x8(u8 a) { |
981 | return _mm512_set1_epi8(a); |
982 | } |
983 | |
984 | static really_inline |
985 | m512 set8x64(u64a a) { |
986 | return _mm512_set1_epi64(a); |
987 | } |
988 | |
989 | static really_inline |
990 | m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, |
991 | u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { |
992 | return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, |
993 | lo_3, lo_2, lo_1, lo_0); |
994 | } |
995 | |
996 | static really_inline |
997 | m512 swap256in512(m512 a) { |
998 | m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); |
999 | return vpermq512(idx, a); |
1000 | } |
1001 | |
1002 | static really_inline |
1003 | m512 set4x128(m128 a) { |
1004 | return _mm512_broadcast_i32x4(a); |
1005 | } |
1006 | #endif |
1007 | |
1008 | static really_inline |
1009 | m512 and512(m512 a, m512 b) { |
1010 | #if defined(HAVE_AVX512) |
1011 | return _mm512_and_si512(a, b); |
1012 | #else |
1013 | m512 rv; |
1014 | rv.lo = and256(a.lo, b.lo); |
1015 | rv.hi = and256(a.hi, b.hi); |
1016 | return rv; |
1017 | #endif |
1018 | } |
1019 | |
1020 | static really_inline |
1021 | m512 or512(m512 a, m512 b) { |
1022 | #if defined(HAVE_AVX512) |
1023 | return _mm512_or_si512(a, b); |
1024 | #else |
1025 | m512 rv; |
1026 | rv.lo = or256(a.lo, b.lo); |
1027 | rv.hi = or256(a.hi, b.hi); |
1028 | return rv; |
1029 | #endif |
1030 | } |
1031 | |
1032 | static really_inline |
1033 | m512 xor512(m512 a, m512 b) { |
1034 | #if defined(HAVE_AVX512) |
1035 | return _mm512_xor_si512(a, b); |
1036 | #else |
1037 | m512 rv; |
1038 | rv.lo = xor256(a.lo, b.lo); |
1039 | rv.hi = xor256(a.hi, b.hi); |
1040 | return rv; |
1041 | #endif |
1042 | } |
1043 | |
1044 | static really_inline |
1045 | m512 not512(m512 a) { |
1046 | #if defined(HAVE_AVX512) |
1047 | return _mm512_xor_si512(a, ones512()); |
1048 | #else |
1049 | m512 rv; |
1050 | rv.lo = not256(a.lo); |
1051 | rv.hi = not256(a.hi); |
1052 | return rv; |
1053 | #endif |
1054 | } |
1055 | |
1056 | static really_inline |
1057 | m512 andnot512(m512 a, m512 b) { |
1058 | #if defined(HAVE_AVX512) |
1059 | return _mm512_andnot_si512(a, b); |
1060 | #else |
1061 | m512 rv; |
1062 | rv.lo = andnot256(a.lo, b.lo); |
1063 | rv.hi = andnot256(a.hi, b.hi); |
1064 | return rv; |
1065 | #endif |
1066 | } |
1067 | |
1068 | #if defined(HAVE_AVX512) |
1069 | static really_really_inline |
1070 | m512 lshift64_m512(m512 a, unsigned b) { |
1071 | #if defined(HAVE__BUILTIN_CONSTANT_P) |
1072 | if (__builtin_constant_p(b)) { |
1073 | return _mm512_slli_epi64(a, b); |
1074 | } |
1075 | #endif |
1076 | m128 x = _mm_cvtsi32_si128(b); |
1077 | return _mm512_sll_epi64(a, x); |
1078 | } |
1079 | #else |
1080 | static really_really_inline |
1081 | m512 lshift64_m512(m512 a, unsigned b) { |
1082 | m512 rv; |
1083 | rv.lo = lshift64_m256(a.lo, b); |
1084 | rv.hi = lshift64_m256(a.hi, b); |
1085 | return rv; |
1086 | } |
1087 | #endif |
1088 | |
1089 | #if defined(HAVE_AVX512) |
1090 | #define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) |
1091 | #define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) |
1092 | #define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) |
1093 | #endif |
1094 | |
1095 | #if !defined(_MM_CMPINT_NE) |
1096 | #define _MM_CMPINT_NE 0x4 |
1097 | #endif |
1098 | |
1099 | static really_inline |
1100 | int diff512(m512 a, m512 b) { |
1101 | #if defined(HAVE_AVX512) |
1102 | return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); |
1103 | #else |
1104 | return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); |
1105 | #endif |
1106 | } |
1107 | |
1108 | static really_inline |
1109 | int isnonzero512(m512 a) { |
1110 | #if defined(HAVE_AVX512) |
1111 | return diff512(a, zeroes512()); |
1112 | #elif defined(HAVE_AVX2) |
1113 | m256 x = or256(a.lo, a.hi); |
1114 | return !!diff256(x, zeroes256()); |
1115 | #else |
1116 | m128 x = or128(a.lo.lo, a.lo.hi); |
1117 | m128 y = or128(a.hi.lo, a.hi.hi); |
1118 | return isnonzero128(or128(x, y)); |
1119 | #endif |
1120 | } |
1121 | |
1122 | /** |
1123 | * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit |
1124 | * mask indicating which 32-bit words contain differences. |
1125 | */ |
1126 | static really_inline |
1127 | u32 diffrich512(m512 a, m512 b) { |
1128 | #if defined(HAVE_AVX512) |
1129 | return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); |
1130 | #elif defined(HAVE_AVX2) |
1131 | return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); |
1132 | #else |
1133 | a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); |
1134 | a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); |
1135 | a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); |
1136 | a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); |
1137 | m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), |
1138 | _mm_packs_epi32(a.hi.lo, a.hi.hi)); |
1139 | return ~(_mm_movemask_epi8(packed)) & 0xffff; |
1140 | #endif |
1141 | } |
1142 | |
1143 | /** |
1144 | * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and |
1145 | * returns a 16-bit mask indicating which 64-bit words contain differences. |
1146 | */ |
1147 | static really_inline |
1148 | u32 diffrich64_512(m512 a, m512 b) { |
1149 | //TODO: cmp_epi64? |
1150 | u32 d = diffrich512(a, b); |
1151 | return (d | (d >> 1)) & 0x55555555; |
1152 | } |
1153 | |
1154 | // aligned load |
1155 | static really_inline |
1156 | m512 load512(const void *ptr) { |
1157 | #if defined(HAVE_AVX512) |
1158 | return _mm512_load_si512(ptr); |
1159 | #else |
1160 | assert(ISALIGNED_N(ptr, alignof(m256))); |
1161 | m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; |
1162 | return rv; |
1163 | #endif |
1164 | } |
1165 | |
1166 | // aligned store |
1167 | static really_inline |
1168 | void store512(void *ptr, m512 a) { |
1169 | assert(ISALIGNED_N(ptr, alignof(m512))); |
1170 | #if defined(HAVE_AVX512) |
1171 | return _mm512_store_si512(ptr, a); |
1172 | #elif defined(HAVE_AVX2) |
1173 | m512 *x = (m512 *)ptr; |
1174 | store256(&x->lo, a.lo); |
1175 | store256(&x->hi, a.hi); |
1176 | #else |
1177 | ptr = assume_aligned(ptr, 16); |
1178 | *(m512 *)ptr = a; |
1179 | #endif |
1180 | } |
1181 | |
1182 | // unaligned load |
1183 | static really_inline |
1184 | m512 loadu512(const void *ptr) { |
1185 | #if defined(HAVE_AVX512) |
1186 | return _mm512_loadu_si512(ptr); |
1187 | #else |
1188 | m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; |
1189 | return rv; |
1190 | #endif |
1191 | } |
1192 | |
1193 | #if defined(HAVE_AVX512) |
1194 | static really_inline |
1195 | m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { |
1196 | return _mm512_maskz_loadu_epi8(k, ptr); |
1197 | } |
1198 | |
1199 | static really_inline |
1200 | m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { |
1201 | return _mm512_mask_loadu_epi8(src, k, ptr); |
1202 | } |
1203 | |
1204 | static really_inline |
1205 | m512 set_mask_m512(__mmask64 k) { |
1206 | return _mm512_movm_epi8(k); |
1207 | } |
1208 | #endif |
1209 | |
1210 | // packed unaligned store of first N bytes |
1211 | static really_inline |
1212 | void storebytes512(void *ptr, m512 a, unsigned int n) { |
1213 | assert(n <= sizeof(a)); |
1214 | memcpy(ptr, &a, n); |
1215 | } |
1216 | |
1217 | // packed unaligned load of first N bytes, pad with zero |
1218 | static really_inline |
1219 | m512 loadbytes512(const void *ptr, unsigned int n) { |
1220 | m512 a = zeroes512(); |
1221 | assert(n <= sizeof(a)); |
1222 | memcpy(&a, ptr, n); |
1223 | return a; |
1224 | } |
1225 | |
1226 | static really_inline |
1227 | m512 mask1bit512(unsigned int n) { |
1228 | assert(n < sizeof(m512) * 8); |
1229 | u32 mask_idx = ((n % 8) * 64) + 95; |
1230 | mask_idx -= n / 8; |
1231 | return loadu512(&simd_onebit_masks[mask_idx]); |
1232 | } |
1233 | |
1234 | // switches on bit N in the given vector. |
1235 | static really_inline |
1236 | void setbit512(m512 *ptr, unsigned int n) { |
1237 | assert(n < sizeof(*ptr) * 8); |
1238 | #if !defined(HAVE_AVX2) |
1239 | m128 *sub; |
1240 | if (n < 128) { |
1241 | sub = &ptr->lo.lo; |
1242 | } else if (n < 256) { |
1243 | sub = &ptr->lo.hi; |
1244 | } else if (n < 384) { |
1245 | sub = &ptr->hi.lo; |
1246 | } else { |
1247 | sub = &ptr->hi.hi; |
1248 | } |
1249 | setbit128(sub, n % 128); |
1250 | #elif defined(HAVE_AVX512) |
1251 | *ptr = or512(mask1bit512(n), *ptr); |
1252 | #else |
1253 | m256 *sub; |
1254 | if (n < 256) { |
1255 | sub = &ptr->lo; |
1256 | } else { |
1257 | sub = &ptr->hi; |
1258 | n -= 256; |
1259 | } |
1260 | setbit256(sub, n); |
1261 | #endif |
1262 | } |
1263 | |
1264 | // switches off bit N in the given vector. |
1265 | static really_inline |
1266 | void clearbit512(m512 *ptr, unsigned int n) { |
1267 | assert(n < sizeof(*ptr) * 8); |
1268 | #if !defined(HAVE_AVX2) |
1269 | m128 *sub; |
1270 | if (n < 128) { |
1271 | sub = &ptr->lo.lo; |
1272 | } else if (n < 256) { |
1273 | sub = &ptr->lo.hi; |
1274 | } else if (n < 384) { |
1275 | sub = &ptr->hi.lo; |
1276 | } else { |
1277 | sub = &ptr->hi.hi; |
1278 | } |
1279 | clearbit128(sub, n % 128); |
1280 | #elif defined(HAVE_AVX512) |
1281 | *ptr = andnot512(mask1bit512(n), *ptr); |
1282 | #else |
1283 | m256 *sub; |
1284 | if (n < 256) { |
1285 | sub = &ptr->lo; |
1286 | } else { |
1287 | sub = &ptr->hi; |
1288 | n -= 256; |
1289 | } |
1290 | clearbit256(sub, n); |
1291 | #endif |
1292 | } |
1293 | |
1294 | // tests bit N in the given vector. |
1295 | static really_inline |
1296 | char testbit512(m512 val, unsigned int n) { |
1297 | assert(n < sizeof(val) * 8); |
1298 | #if !defined(HAVE_AVX2) |
1299 | m128 sub; |
1300 | if (n < 128) { |
1301 | sub = val.lo.lo; |
1302 | } else if (n < 256) { |
1303 | sub = val.lo.hi; |
1304 | } else if (n < 384) { |
1305 | sub = val.hi.lo; |
1306 | } else { |
1307 | sub = val.hi.hi; |
1308 | } |
1309 | return testbit128(sub, n % 128); |
1310 | #elif defined(HAVE_AVX512) |
1311 | const m512 mask = mask1bit512(n); |
1312 | return !!_mm512_test_epi8_mask(mask, val); |
1313 | #else |
1314 | m256 sub; |
1315 | if (n < 256) { |
1316 | sub = val.lo; |
1317 | } else { |
1318 | sub = val.hi; |
1319 | n -= 256; |
1320 | } |
1321 | return testbit256(sub, n); |
1322 | #endif |
1323 | } |
1324 | |
1325 | #endif |
1326 | |