/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief SIMD types and primitive operations.
 */

#ifndef SIMD_UTILS
#define SIMD_UTILS

#if !defined(_WIN32) && !defined(__SSSE3__)
#error SSSE3 instructions must be enabled
#endif

#include "config.h"
#include "ue2common.h"
#include "simd_types.h"
#include "unaligned.h"
#include "util/arch.h"
#include "util/intrinsics.h"

#include <string.h> // for memcpy

// Define a common assume_aligned using an appropriate compiler built-in, if
// it's available. Note that we need to handle C or C++ compilation.
#ifdef __cplusplus
# ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED
#  define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
# endif
#else
# ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED
#  define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
# endif
#endif

// Fall back to the identity case.
#ifndef assume_aligned
#define assume_aligned(x, y) (x)
#endif

#ifdef __cplusplus
extern "C" {
#endif
extern const char vbs_mask_data[];
#ifdef __cplusplus
}
#endif

static really_inline m128 ones128(void) {
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
    /* gcc gets this right */
    return _mm_set1_epi8(0xFF);
#else
    /* trick from Intel's optimization guide to generate all-ones.
     * ICC converts this to the single cmpeq instruction */
    return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128());
#endif
}

static really_inline m128 zeroes128(void) {
    return _mm_setzero_si128();
}

/** \brief Bitwise not for m128. */
static really_inline m128 not128(m128 a) {
    return _mm_xor_si128(a, ones128());
}

/** \brief Return 1 if a and b are different, otherwise 0. */
static really_inline int diff128(m128 a, m128 b) {
    return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff);
}

static really_inline int isnonzero128(m128 a) {
    return !!diff128(a, zeroes128());
}

/**
 * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit
 * mask indicating which 32-bit words contain differences.
 */
static really_inline u32 diffrich128(m128 a, m128 b) {
    a = _mm_cmpeq_epi32(a, b);
    return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf;
}
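
/*
 * Illustrative sketch (not part of the original API): using the diffrich128()
 * mask to find the first differing 32-bit word. The helper name
 * first_diff_word128 is hypothetical, and __builtin_ctz assumes a GCC-style
 * compiler.
 */
static really_inline
int first_diff_word128(m128 a, m128 b) {
    u32 mask = diffrich128(a, b); // bit i set => 32-bit word i differs
    if (!mask) {
        return -1; // vectors are identical
    }
    return (int)__builtin_ctz(mask); // index of the lowest differing word
}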

/**
 * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and
 * returns a 4-bit mask indicating which 64-bit words contain differences.
 */
static really_inline u32 diffrich64_128(m128 a, m128 b) {
#if defined(HAVE_SSE41)
    a = _mm_cmpeq_epi64(a, b);
    return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5;
#else
    u32 d = diffrich128(a, b);
    return (d | (d >> 1)) & 0x5;
#endif
}
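
/*
 * Note on the fold above (also used by the wider diffrich64_*() variants):
 * each 64-bit lane covers two adjacent bits of the per-32-bit-word mask, so
 * OR-ing in a copy shifted right by one collapses both bits onto the even
 * position before masking.
 */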

static really_really_inline
m128 lshift64_m128(m128 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
    if (__builtin_constant_p(b)) {
        return _mm_slli_epi64(a, b);
    }
#endif
    m128 x = _mm_cvtsi32_si128(b);
    return _mm_sll_epi64(a, x);
}

#define rshift64_m128(a, b) _mm_srli_epi64((a), (b))
#define eq128(a, b) _mm_cmpeq_epi8((a), (b))
#define movemask128(a) ((u32)_mm_movemask_epi8((a)))

static really_inline m128 set16x8(u8 c) {
    return _mm_set1_epi8(c);
}

static really_inline m128 set4x32(u32 c) {
    return _mm_set1_epi32(c);
}

static really_inline u32 movd(const m128 in) {
    return _mm_cvtsi128_si32(in);
}

static really_inline u64a movq(const m128 in) {
#if defined(ARCH_X86_64)
    return _mm_cvtsi128_si64(in);
#else // 32-bit - this is horrific
    u32 lo = movd(in);
    u32 hi = movd(_mm_srli_epi64(in, 32));
    return (u64a)hi << 32 | lo;
#endif
}

/* another form of movq */
static really_inline
m128 load_m128_from_u64a(const u64a *p) {
    return _mm_set_epi64x(0LL, *p);
}

#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed)
#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed)

#if defined(HAVE_SSE41)
#define extract32from128(a, imm) _mm_extract_epi32(a, imm)
#define extract64from128(a, imm) _mm_extract_epi64(a, imm)
#else
#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2))
#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3))
#endif

#if !defined(HAVE_AVX2)
// TODO: this entire file needs restructuring - this carveout is awful
#define extractlow64from256(a) movq(a.lo)
#define extractlow32from256(a) movd(a.lo)
#if defined(HAVE_SSE41)
#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4)
#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2)
#else
#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4))
#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8))
#endif

#endif // !AVX2

static really_inline m128 and128(m128 a, m128 b) {
    return _mm_and_si128(a, b);
}

static really_inline m128 xor128(m128 a, m128 b) {
    return _mm_xor_si128(a, b);
}

static really_inline m128 or128(m128 a, m128 b) {
    return _mm_or_si128(a, b);
}

static really_inline m128 andnot128(m128 a, m128 b) {
    return _mm_andnot_si128(a, b);
}

// aligned load
static really_inline m128 load128(const void *ptr) {
    assert(ISALIGNED_N(ptr, alignof(m128)));
    ptr = assume_aligned(ptr, 16);
    return _mm_load_si128((const m128 *)ptr);
}

// aligned store
static really_inline void store128(void *ptr, m128 a) {
    assert(ISALIGNED_N(ptr, alignof(m128)));
    ptr = assume_aligned(ptr, 16);
    *(m128 *)ptr = a;
}

// unaligned load
static really_inline m128 loadu128(const void *ptr) {
    return _mm_loadu_si128((const m128 *)ptr);
}

// unaligned store
static really_inline void storeu128(void *ptr, m128 a) {
    _mm_storeu_si128((m128 *)ptr, a);
}

// packed unaligned store of first N bytes
static really_inline
void storebytes128(void *ptr, m128 a, unsigned int n) {
    assert(n <= sizeof(a));
    memcpy(ptr, &a, n);
}

// packed unaligned load of first N bytes, pad with zero
static really_inline
m128 loadbytes128(const void *ptr, unsigned int n) {
    m128 a = zeroes128();
    assert(n <= sizeof(a));
    memcpy(&a, ptr, n);
    return a;
}
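
/*
 * Illustrative sketch (not part of the original API): round-tripping only the
 * live prefix of a vector through a byte buffer with the two helpers above.
 * The name copy_partial_m128 is hypothetical.
 */
static really_inline
m128 copy_partial_m128(m128 in, unsigned int n) {
    char tmp[sizeof(m128)];
    assert(n <= sizeof(tmp));
    storebytes128(tmp, in, n);   // write only the first n bytes
    return loadbytes128(tmp, n); // reload them; the rest is zero-padded
}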

#ifdef __cplusplus
extern "C" {
#endif
extern const u8 simd_onebit_masks[];
#ifdef __cplusplus
}
#endif

static really_inline
m128 mask1bit128(unsigned int n) {
    assert(n < sizeof(m128) * 8);
    u32 mask_idx = ((n % 8) * 64) + 95;
    mask_idx -= n / 8;
    return loadu128(&simd_onebit_masks[mask_idx]);
}
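
/*
 * Note on the indexing above (the table itself lives in simd_utils.c): n % 8
 * picks one of eight 64-byte blocks (the bit position within a byte), while
 * n / 8 slides the 16-byte load window down one byte per byte lane; the table
 * is laid out so that the loaded vector has only bit n set.
 */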

// switches on bit N in the given vector.
static really_inline
void setbit128(m128 *ptr, unsigned int n) {
    *ptr = or128(mask1bit128(n), *ptr);
}

// switches off bit N in the given vector.
static really_inline
void clearbit128(m128 *ptr, unsigned int n) {
    *ptr = andnot128(mask1bit128(n), *ptr);
}

// tests bit N in the given vector.
static really_inline
char testbit128(m128 val, unsigned int n) {
    const m128 mask = mask1bit128(n);
#if defined(HAVE_SSE41)
    return !_mm_testz_si128(mask, val);
#else
    return isnonzero128(and128(mask, val));
#endif
}
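
/*
 * Illustrative sketch (not part of the original API): the set/clear/test trio
 * above round-trips a single bit. The helper name toggle_check128 is
 * hypothetical.
 */
static really_inline
char toggle_check128(unsigned int n) {
    m128 v = zeroes128();
    setbit128(&v, n);
    char was_set = testbit128(v, n); // expected: non-zero
    clearbit128(&v, n);
    return was_set && !isnonzero128(v); // vector is all-zero again
}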

// offset must be an immediate
#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset)

static really_inline
m128 pshufb_m128(m128 a, m128 b) {
    m128 result;
    result = _mm_shuffle_epi8(a, b);
    return result;
}

static really_inline
m256 pshufb_m256(m256 a, m256 b) {
#if defined(HAVE_AVX2)
    return _mm256_shuffle_epi8(a, b);
#else
    m256 rv;
    rv.lo = pshufb_m128(a.lo, b.lo);
    rv.hi = pshufb_m128(a.hi, b.hi);
    return rv;
#endif
}

#if defined(HAVE_AVX512)
static really_inline
m512 pshufb_m512(m512 a, m512 b) {
    return _mm512_shuffle_epi8(a, b);
}

static really_inline
m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) {
    return _mm512_maskz_shuffle_epi8(k, a, b);
}
#endif

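/*
 * Descriptive note (the table contents live in simd_utils.c): the byte shift
 * below is a pshufb whose control mask is a 16-byte window of vbs_mask_data
 * selected by amount; the table is laid out so that lanes shifted in are
 * filled with zero.
 */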
static really_inline
m128 variable_byte_shift_m128(m128 in, s32 amount) {
    assert(amount >= -16 && amount <= 16);
    m128 shift_mask = loadu128(vbs_mask_data + 16 - amount);
    return pshufb_m128(in, shift_mask);
}

static really_inline
m128 max_u8_m128(m128 a, m128 b) {
    return _mm_max_epu8(a, b);
}

static really_inline
m128 min_u8_m128(m128 a, m128 b) {
    return _mm_min_epu8(a, b);
}

static really_inline
m128 sadd_u8_m128(m128 a, m128 b) {
    return _mm_adds_epu8(a, b);
}

static really_inline
m128 sub_u8_m128(m128 a, m128 b) {
    return _mm_sub_epi8(a, b);
}

static really_inline
m128 set64x2(u64a hi, u64a lo) {
    return _mm_set_epi64x(hi, lo);
}

/****
 **** 256-bit Primitives
 ****/

#if defined(HAVE_AVX2)

static really_really_inline
m256 lshift64_m256(m256 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
    if (__builtin_constant_p(b)) {
        return _mm256_slli_epi64(a, b);
    }
#endif
    m128 x = _mm_cvtsi32_si128(b);
    return _mm256_sll_epi64(a, x);
}

#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b))

static really_inline
m256 set32x8(u32 in) {
    return _mm256_set1_epi8(in);
}

#define eq256(a, b) _mm256_cmpeq_epi8((a), (b))
#define movemask256(a) ((u32)_mm256_movemask_epi8((a)))

static really_inline
m256 set2x128(m128 a) {
    return _mm256_broadcastsi128_si256(a);
}

#else

static really_really_inline
m256 lshift64_m256(m256 a, int b) {
    m256 rv = a;
    rv.lo = lshift64_m128(rv.lo, b);
    rv.hi = lshift64_m128(rv.hi, b);
    return rv;
}

static really_inline
m256 rshift64_m256(m256 a, int b) {
    m256 rv = a;
    rv.lo = rshift64_m128(rv.lo, b);
    rv.hi = rshift64_m128(rv.hi, b);
    return rv;
}

static really_inline
m256 set32x8(u32 in) {
    m256 rv;
    rv.lo = set16x8((u8) in);
    rv.hi = rv.lo;
    return rv;
}

static really_inline
m256 eq256(m256 a, m256 b) {
    m256 rv;
    rv.lo = eq128(a.lo, b.lo);
    rv.hi = eq128(a.hi, b.hi);
    return rv;
}

static really_inline
u32 movemask256(m256 a) {
    u32 lo_mask = movemask128(a.lo);
    u32 hi_mask = movemask128(a.hi);
    return lo_mask | (hi_mask << 16);
}

static really_inline
m256 set2x128(m128 a) {
    m256 rv = {a, a};
    return rv;
}
#endif

static really_inline m256 zeroes256(void) {
#if defined(HAVE_AVX2)
    return _mm256_setzero_si256();
#else
    m256 rv = {zeroes128(), zeroes128()};
    return rv;
#endif
}

static really_inline m256 ones256(void) {
#if defined(HAVE_AVX2)
    m256 rv = _mm256_set1_epi8(0xFF);
#else
    m256 rv = {ones128(), ones128()};
#endif
    return rv;
}

#if defined(HAVE_AVX2)
static really_inline m256 and256(m256 a, m256 b) {
    return _mm256_and_si256(a, b);
}
#else
static really_inline m256 and256(m256 a, m256 b) {
    m256 rv;
    rv.lo = and128(a.lo, b.lo);
    rv.hi = and128(a.hi, b.hi);
    return rv;
}
#endif

#if defined(HAVE_AVX2)
static really_inline m256 or256(m256 a, m256 b) {
    return _mm256_or_si256(a, b);
}
#else
static really_inline m256 or256(m256 a, m256 b) {
    m256 rv;
    rv.lo = or128(a.lo, b.lo);
    rv.hi = or128(a.hi, b.hi);
    return rv;
}
#endif

#if defined(HAVE_AVX2)
static really_inline m256 xor256(m256 a, m256 b) {
    return _mm256_xor_si256(a, b);
}
#else
static really_inline m256 xor256(m256 a, m256 b) {
    m256 rv;
    rv.lo = xor128(a.lo, b.lo);
    rv.hi = xor128(a.hi, b.hi);
    return rv;
}
#endif

#if defined(HAVE_AVX2)
static really_inline m256 not256(m256 a) {
    return _mm256_xor_si256(a, ones256());
}
#else
static really_inline m256 not256(m256 a) {
    m256 rv;
    rv.lo = not128(a.lo);
    rv.hi = not128(a.hi);
    return rv;
}
#endif

#if defined(HAVE_AVX2)
static really_inline m256 andnot256(m256 a, m256 b) {
    return _mm256_andnot_si256(a, b);
}
#else
static really_inline m256 andnot256(m256 a, m256 b) {
    m256 rv;
    rv.lo = andnot128(a.lo, b.lo);
    rv.hi = andnot128(a.hi, b.hi);
    return rv;
}
#endif

static really_inline int diff256(m256 a, m256 b) {
#if defined(HAVE_AVX2)
    return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1);
#else
    return diff128(a.lo, b.lo) || diff128(a.hi, b.hi);
#endif
}

static really_inline int isnonzero256(m256 a) {
#if defined(HAVE_AVX2)
    return !!diff256(a, zeroes256());
#else
    return isnonzero128(or128(a.lo, a.hi));
#endif
}

/**
 * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit
 * mask indicating which 32-bit words contain differences.
 */
static really_inline u32 diffrich256(m256 a, m256 b) {
#if defined(HAVE_AVX2)
    a = _mm256_cmpeq_epi32(a, b);
    return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF;
#else
    m128 z = zeroes128();
    a.lo = _mm_cmpeq_epi32(a.lo, b.lo);
    a.hi = _mm_cmpeq_epi32(a.hi, b.hi);
    m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z);
    return ~(_mm_movemask_epi8(packed)) & 0xff;
#endif
}

/**
 * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and
 * returns an 8-bit mask indicating which 64-bit words contain differences.
 */
static really_inline u32 diffrich64_256(m256 a, m256 b) {
    u32 d = diffrich256(a, b);
    return (d | (d >> 1)) & 0x55555555;
}

// aligned load
static really_inline m256 load256(const void *ptr) {
    assert(ISALIGNED_N(ptr, alignof(m256)));
#if defined(HAVE_AVX2)
    return _mm256_load_si256((const m256 *)ptr);
#else
    m256 rv = { load128(ptr), load128((const char *)ptr + 16) };
    return rv;
#endif
}

// aligned load of 128-bit value to low and high part of 256-bit value
static really_inline m256 load2x128(const void *ptr) {
#if defined(HAVE_AVX2)
    return set2x128(load128(ptr));
#else
    assert(ISALIGNED_N(ptr, alignof(m128)));
    m256 rv;
    rv.hi = rv.lo = load128(ptr);
    return rv;
#endif
}

static really_inline m256 loadu2x128(const void *ptr) {
    return set2x128(loadu128(ptr));
}

// aligned store
static really_inline void store256(void *ptr, m256 a) {
    assert(ISALIGNED_N(ptr, alignof(m256)));
#if defined(HAVE_AVX2)
    _mm256_store_si256((m256 *)ptr, a);
#else
    ptr = assume_aligned(ptr, 16);
    *(m256 *)ptr = a;
#endif
}

// unaligned load
static really_inline m256 loadu256(const void *ptr) {
#if defined(HAVE_AVX2)
    return _mm256_loadu_si256((const m256 *)ptr);
#else
    m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) };
    return rv;
#endif
}

// unaligned store
static really_inline void storeu256(void *ptr, m256 a) {
#if defined(HAVE_AVX2)
    _mm256_storeu_si256((m256 *)ptr, a);
#else
    storeu128(ptr, a.lo);
    storeu128((char *)ptr + 16, a.hi);
#endif
}

// packed unaligned store of first N bytes
static really_inline
void storebytes256(void *ptr, m256 a, unsigned int n) {
    assert(n <= sizeof(a));
    memcpy(ptr, &a, n);
}

// packed unaligned load of first N bytes, pad with zero
static really_inline
m256 loadbytes256(const void *ptr, unsigned int n) {
    m256 a = zeroes256();
    assert(n <= sizeof(a));
    memcpy(&a, ptr, n);
    return a;
}
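
/*
 * Illustrative sketch (not part of the original API): locating the first
 * occurrence of byte c in a 32-byte block with eq256()/movemask256(). The
 * helper name first_match32 and its -1 "no match" convention are
 * hypothetical; __builtin_ctz assumes a GCC-style compiler.
 */
static really_inline
int first_match32(const void *block, u8 c) {
    m256 data = loadu256(block); // 32 bytes, unaligned is fine
    u32 mask = movemask256(eq256(data, set32x8(c)));
    return mask ? (int)__builtin_ctz(mask) : -1;
}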

static really_inline
m256 mask1bit256(unsigned int n) {
    assert(n < sizeof(m256) * 8);
    u32 mask_idx = ((n % 8) * 64) + 95;
    mask_idx -= n / 8;
    return loadu256(&simd_onebit_masks[mask_idx]);
}

static really_inline
m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) {
#if defined(HAVE_AVX2)
    return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0);
#else
    m256 rv;
    rv.hi = set64x2(hi_1, hi_0);
    rv.lo = set64x2(lo_1, lo_0);
    return rv;
#endif
}

#if !defined(HAVE_AVX2)
// switches on bit N in the given vector.
static really_inline
void setbit256(m256 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo;
    } else {
        sub = &ptr->hi;
        n -= 128;
    }
    setbit128(sub, n);
}

// switches off bit N in the given vector.
static really_inline
void clearbit256(m256 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo;
    } else {
        sub = &ptr->hi;
        n -= 128;
    }
    clearbit128(sub, n);
}

// tests bit N in the given vector.
static really_inline
char testbit256(m256 val, unsigned int n) {
    assert(n < sizeof(val) * 8);
    m128 sub;
    if (n < 128) {
        sub = val.lo;
    } else {
        sub = val.hi;
        n -= 128;
    }
    return testbit128(sub, n);
}

static really_really_inline
m128 movdq_hi(m256 x) {
    return x.hi;
}

static really_really_inline
m128 movdq_lo(m256 x) {
    return x.lo;
}

static really_inline
m256 combine2x128(m128 hi, m128 lo) {
    m256 rv = {lo, hi};
    return rv;
}

#else // AVX2

// switches on bit N in the given vector.
static really_inline
void setbit256(m256 *ptr, unsigned int n) {
    *ptr = or256(mask1bit256(n), *ptr);
}

static really_inline
void clearbit256(m256 *ptr, unsigned int n) {
    *ptr = andnot256(mask1bit256(n), *ptr);
}

// tests bit N in the given vector.
static really_inline
char testbit256(m256 val, unsigned int n) {
    const m256 mask = mask1bit256(n);
    return !_mm256_testz_si256(mask, val);
}

static really_really_inline
m128 movdq_hi(m256 x) {
    return _mm256_extracti128_si256(x, 1);
}

static really_really_inline
m128 movdq_lo(m256 x) {
    return _mm256_extracti128_si256(x, 0);
}

#define cast256to128(a) _mm256_castsi256_si128(a)
#define cast128to256(a) _mm256_castsi128_si256(a)
#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E)
#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm)
#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed)
#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed)
#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2)
#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4)
#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a))
#define extractlow32from256(a) movd(cast256to128(a))
#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b)
#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b)
#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset)

static really_inline
m256 combine2x128(m128 hi, m128 lo) {
#if defined(_mm256_set_m128i)
    return _mm256_set_m128i(hi, lo);
#else
    return insert128to256(cast128to256(lo), hi, 1);
#endif
}
#endif // AVX2

#if defined(HAVE_AVX512)
#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm)
#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b)
#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b)
#define set2x256(a) _mm512_broadcast_i64x4(a)
#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a)
#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a)
#endif

/****
 **** 384-bit Primitives
 ****/

static really_inline m384 and384(m384 a, m384 b) {
    m384 rv;
    rv.lo = and128(a.lo, b.lo);
    rv.mid = and128(a.mid, b.mid);
    rv.hi = and128(a.hi, b.hi);
    return rv;
}

static really_inline m384 or384(m384 a, m384 b) {
    m384 rv;
    rv.lo = or128(a.lo, b.lo);
    rv.mid = or128(a.mid, b.mid);
    rv.hi = or128(a.hi, b.hi);
    return rv;
}

static really_inline m384 xor384(m384 a, m384 b) {
    m384 rv;
    rv.lo = xor128(a.lo, b.lo);
    rv.mid = xor128(a.mid, b.mid);
    rv.hi = xor128(a.hi, b.hi);
    return rv;
}

static really_inline m384 not384(m384 a) {
    m384 rv;
    rv.lo = not128(a.lo);
    rv.mid = not128(a.mid);
    rv.hi = not128(a.hi);
    return rv;
}

static really_inline m384 andnot384(m384 a, m384 b) {
    m384 rv;
    rv.lo = andnot128(a.lo, b.lo);
    rv.mid = andnot128(a.mid, b.mid);
    rv.hi = andnot128(a.hi, b.hi);
    return rv;
}

static really_really_inline
m384 lshift64_m384(m384 a, unsigned b) {
    m384 rv;
    rv.lo = lshift64_m128(a.lo, b);
    rv.mid = lshift64_m128(a.mid, b);
    rv.hi = lshift64_m128(a.hi, b);
    return rv;
}

static really_inline m384 zeroes384(void) {
    m384 rv = {zeroes128(), zeroes128(), zeroes128()};
    return rv;
}

static really_inline m384 ones384(void) {
    m384 rv = {ones128(), ones128(), ones128()};
    return rv;
}

static really_inline int diff384(m384 a, m384 b) {
    return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi);
}

static really_inline int isnonzero384(m384 a) {
    return isnonzero128(or128(or128(a.lo, a.mid), a.hi));
}

/**
 * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit
 * mask indicating which 32-bit words contain differences.
 */
static really_inline u32 diffrich384(m384 a, m384 b) {
    m128 z = zeroes128();
    a.lo = _mm_cmpeq_epi32(a.lo, b.lo);
    a.mid = _mm_cmpeq_epi32(a.mid, b.mid);
    a.hi = _mm_cmpeq_epi32(a.hi, b.hi);
    m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid),
                                  _mm_packs_epi32(a.hi, z));
    return ~(_mm_movemask_epi8(packed)) & 0xfff;
}

/**
 * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and
 * returns a 12-bit mask indicating which 64-bit words contain differences.
 */
static really_inline u32 diffrich64_384(m384 a, m384 b) {
    u32 d = diffrich384(a, b);
    return (d | (d >> 1)) & 0x55555555;
}

// aligned load
static really_inline m384 load384(const void *ptr) {
    assert(ISALIGNED_16(ptr));
    m384 rv = { load128(ptr), load128((const char *)ptr + 16),
                load128((const char *)ptr + 32) };
    return rv;
}

// aligned store
static really_inline void store384(void *ptr, m384 a) {
    assert(ISALIGNED_16(ptr));
    ptr = assume_aligned(ptr, 16);
    *(m384 *)ptr = a;
}

// unaligned load
static really_inline m384 loadu384(const void *ptr) {
    m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16),
                loadu128((const char *)ptr + 32) };
    return rv;
}

// packed unaligned store of first N bytes
static really_inline
void storebytes384(void *ptr, m384 a, unsigned int n) {
    assert(n <= sizeof(a));
    memcpy(ptr, &a, n);
}

// packed unaligned load of first N bytes, pad with zero
static really_inline
m384 loadbytes384(const void *ptr, unsigned int n) {
    m384 a = zeroes384();
    assert(n <= sizeof(a));
    memcpy(&a, ptr, n);
    return a;
}

// switches on bit N in the given vector.
static really_inline
void setbit384(m384 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo;
    } else if (n < 256) {
        sub = &ptr->mid;
    } else {
        sub = &ptr->hi;
    }
    setbit128(sub, n % 128);
}

// switches off bit N in the given vector.
static really_inline
void clearbit384(m384 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo;
    } else if (n < 256) {
        sub = &ptr->mid;
    } else {
        sub = &ptr->hi;
    }
    clearbit128(sub, n % 128);
}

// tests bit N in the given vector.
static really_inline
char testbit384(m384 val, unsigned int n) {
    assert(n < sizeof(val) * 8);
    m128 sub;
    if (n < 128) {
        sub = val.lo;
    } else if (n < 256) {
        sub = val.mid;
    } else {
        sub = val.hi;
    }
    return testbit128(sub, n % 128);
}

/****
 **** 512-bit Primitives
 ****/

#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b))
#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b))

static really_inline
m512 zeroes512(void) {
#if defined(HAVE_AVX512)
    return _mm512_setzero_si512();
#else
    m512 rv = {zeroes256(), zeroes256()};
    return rv;
#endif
}

static really_inline
m512 ones512(void) {
#if defined(HAVE_AVX512)
    return _mm512_set1_epi8(0xFF);
    //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512());
#else
    m512 rv = {ones256(), ones256()};
    return rv;
#endif
}

#if defined(HAVE_AVX512)
static really_inline
m512 set64x8(u8 a) {
    return _mm512_set1_epi8(a);
}

static really_inline
m512 set8x64(u64a a) {
    return _mm512_set1_epi64(a);
}

static really_inline
m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0,
               u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) {
    return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0,
                            lo_3, lo_2, lo_1, lo_0);
}

static really_inline
m512 swap256in512(m512 a) {
    m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL);
    return vpermq512(idx, a);
}
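
/*
 * Note: set512_64() fills elements 7..0 in argument order, so the index
 * vector above is {4, 5, 6, 7, 0, 1, 2, 3} and the permute exchanges the low
 * and high 256-bit halves of a.
 */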

static really_inline
m512 set4x128(m128 a) {
    return _mm512_broadcast_i32x4(a);
}
#endif

static really_inline
m512 and512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return _mm512_and_si512(a, b);
#else
    m512 rv;
    rv.lo = and256(a.lo, b.lo);
    rv.hi = and256(a.hi, b.hi);
    return rv;
#endif
}

static really_inline
m512 or512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return _mm512_or_si512(a, b);
#else
    m512 rv;
    rv.lo = or256(a.lo, b.lo);
    rv.hi = or256(a.hi, b.hi);
    return rv;
#endif
}

static really_inline
m512 xor512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return _mm512_xor_si512(a, b);
#else
    m512 rv;
    rv.lo = xor256(a.lo, b.lo);
    rv.hi = xor256(a.hi, b.hi);
    return rv;
#endif
}

static really_inline
m512 not512(m512 a) {
#if defined(HAVE_AVX512)
    return _mm512_xor_si512(a, ones512());
#else
    m512 rv;
    rv.lo = not256(a.lo);
    rv.hi = not256(a.hi);
    return rv;
#endif
}

static really_inline
m512 andnot512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return _mm512_andnot_si512(a, b);
#else
    m512 rv;
    rv.lo = andnot256(a.lo, b.lo);
    rv.hi = andnot256(a.hi, b.hi);
    return rv;
#endif
}

#if defined(HAVE_AVX512)
static really_really_inline
m512 lshift64_m512(m512 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
    if (__builtin_constant_p(b)) {
        return _mm512_slli_epi64(a, b);
    }
#endif
    m128 x = _mm_cvtsi32_si128(b);
    return _mm512_sll_epi64(a, x);
}
#else
static really_really_inline
m512 lshift64_m512(m512 a, unsigned b) {
    m512 rv;
    rv.lo = lshift64_m256(a.lo, b);
    rv.hi = lshift64_m256(a.hi, b);
    return rv;
}
#endif

#if defined(HAVE_AVX512)
#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b))
#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed)
#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed)
#endif

#if !defined(_MM_CMPINT_NE)
#define _MM_CMPINT_NE 0x4
#endif

static really_inline
int diff512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE);
#else
    return diff256(a.lo, b.lo) || diff256(a.hi, b.hi);
#endif
}

static really_inline
int isnonzero512(m512 a) {
#if defined(HAVE_AVX512)
    return diff512(a, zeroes512());
#elif defined(HAVE_AVX2)
    m256 x = or256(a.lo, a.hi);
    return !!diff256(x, zeroes256());
#else
    m128 x = or128(a.lo.lo, a.lo.hi);
    m128 y = or128(a.hi.lo, a.hi.hi);
    return isnonzero128(or128(x, y));
#endif
}

/**
 * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit
 * mask indicating which 32-bit words contain differences.
 */
static really_inline
u32 diffrich512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE);
#elif defined(HAVE_AVX2)
    return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8);
#else
    a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo);
    a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi);
    a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo);
    a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi);
    m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi),
                                  _mm_packs_epi32(a.hi.lo, a.hi.hi));
    return ~(_mm_movemask_epi8(packed)) & 0xffff;
#endif
}

/**
 * "Rich" version of diff512(), 64-bit variant. Takes two vectors a and b and
 * returns a 16-bit mask indicating which 64-bit words contain differences.
 */
static really_inline
u32 diffrich64_512(m512 a, m512 b) {
    //TODO: cmp_epi64?
    u32 d = diffrich512(a, b);
    return (d | (d >> 1)) & 0x55555555;
}

// aligned load
static really_inline
m512 load512(const void *ptr) {
#if defined(HAVE_AVX512)
    return _mm512_load_si512(ptr);
#else
    assert(ISALIGNED_N(ptr, alignof(m256)));
    m512 rv = { load256(ptr), load256((const char *)ptr + 32) };
    return rv;
#endif
}

// aligned store
static really_inline
void store512(void *ptr, m512 a) {
    assert(ISALIGNED_N(ptr, alignof(m512)));
#if defined(HAVE_AVX512)
    return _mm512_store_si512(ptr, a);
#elif defined(HAVE_AVX2)
    m512 *x = (m512 *)ptr;
    store256(&x->lo, a.lo);
    store256(&x->hi, a.hi);
#else
    ptr = assume_aligned(ptr, 16);
    *(m512 *)ptr = a;
#endif
}

// unaligned load
static really_inline
m512 loadu512(const void *ptr) {
#if defined(HAVE_AVX512)
    return _mm512_loadu_si512(ptr);
#else
    m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) };
    return rv;
#endif
}

#if defined(HAVE_AVX512)
static really_inline
m512 loadu_maskz_m512(__mmask64 k, const void *ptr) {
    return _mm512_maskz_loadu_epi8(k, ptr);
}

static really_inline
m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) {
    return _mm512_mask_loadu_epi8(src, k, ptr);
}

static really_inline
m512 set_mask_m512(__mmask64 k) {
    return _mm512_movm_epi8(k);
}
#endif

// packed unaligned store of first N bytes
static really_inline
void storebytes512(void *ptr, m512 a, unsigned int n) {
    assert(n <= sizeof(a));
    memcpy(ptr, &a, n);
}

// packed unaligned load of first N bytes, pad with zero
static really_inline
m512 loadbytes512(const void *ptr, unsigned int n) {
    m512 a = zeroes512();
    assert(n <= sizeof(a));
    memcpy(&a, ptr, n);
    return a;
}

static really_inline
m512 mask1bit512(unsigned int n) {
    assert(n < sizeof(m512) * 8);
    u32 mask_idx = ((n % 8) * 64) + 95;
    mask_idx -= n / 8;
    return loadu512(&simd_onebit_masks[mask_idx]);
}

// switches on bit N in the given vector.
static really_inline
void setbit512(m512 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
#if !defined(HAVE_AVX2)
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo.lo;
    } else if (n < 256) {
        sub = &ptr->lo.hi;
    } else if (n < 384) {
        sub = &ptr->hi.lo;
    } else {
        sub = &ptr->hi.hi;
    }
    setbit128(sub, n % 128);
#elif defined(HAVE_AVX512)
    *ptr = or512(mask1bit512(n), *ptr);
#else
    m256 *sub;
    if (n < 256) {
        sub = &ptr->lo;
    } else {
        sub = &ptr->hi;
        n -= 256;
    }
    setbit256(sub, n);
#endif
}

// switches off bit N in the given vector.
static really_inline
void clearbit512(m512 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
#if !defined(HAVE_AVX2)
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo.lo;
    } else if (n < 256) {
        sub = &ptr->lo.hi;
    } else if (n < 384) {
        sub = &ptr->hi.lo;
    } else {
        sub = &ptr->hi.hi;
    }
    clearbit128(sub, n % 128);
#elif defined(HAVE_AVX512)
    *ptr = andnot512(mask1bit512(n), *ptr);
#else
    m256 *sub;
    if (n < 256) {
        sub = &ptr->lo;
    } else {
        sub = &ptr->hi;
        n -= 256;
    }
    clearbit256(sub, n);
#endif
}

// tests bit N in the given vector.
static really_inline
char testbit512(m512 val, unsigned int n) {
    assert(n < sizeof(val) * 8);
#if !defined(HAVE_AVX2)
    m128 sub;
    if (n < 128) {
        sub = val.lo.lo;
    } else if (n < 256) {
        sub = val.lo.hi;
    } else if (n < 384) {
        sub = val.hi.lo;
    } else {
        sub = val.hi.hi;
    }
    return testbit128(sub, n % 128);
#elif defined(HAVE_AVX512)
    const m512 mask = mask1bit512(n);
    return !!_mm512_test_epi8_mask(mask, val);
#else
    m256 sub;
    if (n < 256) {
        sub = val.lo;
    } else {
        sub = val.hi;
        n -= 256;
    }
    return testbit256(sub, n);
#endif
}

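/*
 * Illustrative sketch (not part of the original API): the same single-bit
 * round-trip as the 128-bit example, at 512-bit width; it exercises whichever
 * of the code paths above was compiled in. The helper name is hypothetical.
 */
static really_inline
char toggle_check512(unsigned int n) {
    m512 v = zeroes512();
    setbit512(&v, n);
    char was_set = testbit512(v, n); // expected: non-zero
    clearbit512(&v, n);
    return was_set && !isnonzero512(v); // vector is all-zero again
}
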
#endif