/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief SIMD types and primitive operations.
 */

#ifndef SIMD_UTILS
#define SIMD_UTILS

#if !defined(_WIN32) && !defined(__SSSE3__)
#error SSSE3 instructions must be enabled
#endif

#include "config.h"
#include "ue2common.h"
#include "simd_types.h"
#include "unaligned.h"
#include "util/arch.h"
#include "util/intrinsics.h"

#include <string.h> // for memcpy

// Define a common assume_aligned using an appropriate compiler built-in, if
// it's available. Note that we need to handle C or C++ compilation.
#ifdef __cplusplus
# ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED
#  define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
# endif
#else
# ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED
#  define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
# endif
#endif

// Fall back to the identity case.
#ifndef assume_aligned
#define assume_aligned(x, y) (x)
#endif

#ifdef __cplusplus
extern "C" {
#endif
extern const char vbs_mask_data[];
#ifdef __cplusplus
}
#endif

static really_inline m128 ones128(void) {
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
    /* gcc gets this right */
    return _mm_set1_epi8(0xFF);
#else
    /* trick from Intel's optimization guide to generate all-ones.
     * ICC converts this to the single cmpeq instruction */
    return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128());
#endif
}

static really_inline m128 zeroes128(void) {
    return _mm_setzero_si128();
}

/** \brief Bitwise not for m128. */
static really_inline m128 not128(m128 a) {
    return _mm_xor_si128(a, ones128());
}

/** \brief Return 1 if a and b are different, otherwise 0. */
static really_inline int diff128(m128 a, m128 b) {
    return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff);
}

static really_inline int isnonzero128(m128 a) {
    return !!diff128(a, zeroes128());
}

/**
 * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit
 * mask indicating which 32-bit words contain differences.
 */
static really_inline u32 diffrich128(m128 a, m128 b) {
    a = _mm_cmpeq_epi32(a, b);
    return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf;
}
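
/*
 * Illustrative sketch (not part of the original API): using the diffrich128()
 * mask to find the first differing 32-bit word. The helper name
 * first_diff_word128 is hypothetical, and __builtin_ctz assumes a GCC-style
 * compiler.
 */
static really_inline
int first_diff_word128(m128 a, m128 b) {
    u32 mask = diffrich128(a, b); // bit i set => 32-bit word i differs
    if (!mask) {
        return -1; // vectors are identical
    }
    return (int)__builtin_ctz(mask); // index of the lowest differing word
}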

/**
 * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and
 * returns a 4-bit mask indicating which 64-bit words contain differences.
 */
static really_inline u32 diffrich64_128(m128 a, m128 b) {
#if defined(HAVE_SSE41)
    a = _mm_cmpeq_epi64(a, b);
    return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5;
#else
    u32 d = diffrich128(a, b);
    return (d | (d >> 1)) & 0x5;
#endif
}
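
/*
 * Note on the fold above (also used by the wider diffrich64_*() variants):
 * each 64-bit lane covers two adjacent bits of the per-32-bit-word mask, so
 * OR-ing in a copy shifted right by one collapses both bits onto the even
 * position before masking.
 */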

static really_really_inline
m128 lshift64_m128(m128 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
    if (__builtin_constant_p(b)) {
        return _mm_slli_epi64(a, b);
    }
#endif
    m128 x = _mm_cvtsi32_si128(b);
    return _mm_sll_epi64(a, x);
}

#define rshift64_m128(a, b) _mm_srli_epi64((a), (b))
#define eq128(a, b) _mm_cmpeq_epi8((a), (b))
#define movemask128(a) ((u32)_mm_movemask_epi8((a)))

static really_inline m128 set16x8(u8 c) {
    return _mm_set1_epi8(c);
}

static really_inline m128 set4x32(u32 c) {
    return _mm_set1_epi32(c);
}

static really_inline u32 movd(const m128 in) {
    return _mm_cvtsi128_si32(in);
}

static really_inline u64a movq(const m128 in) {
#if defined(ARCH_X86_64)
    return _mm_cvtsi128_si64(in);
#else // 32-bit - this is horrific
    u32 lo = movd(in);
    u32 hi = movd(_mm_srli_epi64(in, 32));
    return (u64a)hi << 32 | lo;
#endif
}

/* another form of movq */
static really_inline
m128 load_m128_from_u64a(const u64a *p) {
    return _mm_set_epi64x(0LL, *p);
}

#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed)
#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed)

#if defined(HAVE_SSE41)
#define extract32from128(a, imm) _mm_extract_epi32(a, imm)
#define extract64from128(a, imm) _mm_extract_epi64(a, imm)
#else
#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2))
#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3))
#endif

#if !defined(HAVE_AVX2)
// TODO: this entire file needs restructuring - this carveout is awful
#define extractlow64from256(a) movq(a.lo)
#define extractlow32from256(a) movd(a.lo)
#if defined(HAVE_SSE41)
#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4)
#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2)
#else
#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4))
#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8))
#endif

#endif // !AVX2

static really_inline m128 and128(m128 a, m128 b) {
    return _mm_and_si128(a, b);
}

static really_inline m128 xor128(m128 a, m128 b) {
    return _mm_xor_si128(a, b);
}

static really_inline m128 or128(m128 a, m128 b) {
    return _mm_or_si128(a, b);
}

static really_inline m128 andnot128(m128 a, m128 b) {
    return _mm_andnot_si128(a, b);
}

// aligned load
static really_inline m128 load128(const void *ptr) {
    assert(ISALIGNED_N(ptr, alignof(m128)));
    ptr = assume_aligned(ptr, 16);
    return _mm_load_si128((const m128 *)ptr);
}

// aligned store
static really_inline void store128(void *ptr, m128 a) {
    assert(ISALIGNED_N(ptr, alignof(m128)));
    ptr = assume_aligned(ptr, 16);
    *(m128 *)ptr = a;
}

// unaligned load
static really_inline m128 loadu128(const void *ptr) {
    return _mm_loadu_si128((const m128 *)ptr);
}

// unaligned store
static really_inline void storeu128(void *ptr, m128 a) {
    _mm_storeu_si128((m128 *)ptr, a);
}

// packed unaligned store of first N bytes
static really_inline
void storebytes128(void *ptr, m128 a, unsigned int n) {
    assert(n <= sizeof(a));
    memcpy(ptr, &a, n);
}

// packed unaligned load of first N bytes, pad with zero
static really_inline
m128 loadbytes128(const void *ptr, unsigned int n) {
    m128 a = zeroes128();
    assert(n <= sizeof(a));
    memcpy(&a, ptr, n);
    return a;
}
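
/*
 * Illustrative sketch (not part of the original API): round-tripping only the
 * live prefix of a vector through a byte buffer with the two helpers above.
 * The name copy_partial_m128 is hypothetical.
 */
static really_inline
m128 copy_partial_m128(m128 in, unsigned int n) {
    char tmp[sizeof(m128)];
    assert(n <= sizeof(tmp));
    storebytes128(tmp, in, n);   // write only the first n bytes
    return loadbytes128(tmp, n); // reload them; the rest is zero-padded
}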

#ifdef __cplusplus
extern "C" {
#endif
extern const u8 simd_onebit_masks[];
#ifdef __cplusplus
}
#endif

static really_inline
m128 mask1bit128(unsigned int n) {
    assert(n < sizeof(m128) * 8);
    u32 mask_idx = ((n % 8) * 64) + 95;
    mask_idx -= n / 8;
    return loadu128(&simd_onebit_masks[mask_idx]);
}
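
/*
 * Note on the indexing above (the table itself lives in simd_utils.c): n % 8
 * picks one of eight 64-byte blocks (the bit position within a byte), while
 * n / 8 slides the 16-byte load window down one byte per byte lane; the table
 * is laid out so that the loaded vector has only bit n set.
 */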

// switches on bit N in the given vector.
static really_inline
void setbit128(m128 *ptr, unsigned int n) {
    *ptr = or128(mask1bit128(n), *ptr);
}

// switches off bit N in the given vector.
static really_inline
void clearbit128(m128 *ptr, unsigned int n) {
    *ptr = andnot128(mask1bit128(n), *ptr);
}

// tests bit N in the given vector.
static really_inline
char testbit128(m128 val, unsigned int n) {
    const m128 mask = mask1bit128(n);
#if defined(HAVE_SSE41)
    return !_mm_testz_si128(mask, val);
#else
    return isnonzero128(and128(mask, val));
#endif
}
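
/*
 * Illustrative sketch (not part of the original API): the set/clear/test trio
 * above round-trips a single bit. The helper name toggle_check128 is
 * hypothetical.
 */
static really_inline
char toggle_check128(unsigned int n) {
    m128 v = zeroes128();
    setbit128(&v, n);
    char was_set = testbit128(v, n); // expected: non-zero
    clearbit128(&v, n);
    return was_set && !isnonzero128(v); // vector is all-zero again
}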

// offset must be an immediate
#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset)

static really_inline
m128 pshufb_m128(m128 a, m128 b) {
    m128 result;
    result = _mm_shuffle_epi8(a, b);
    return result;
}

static really_inline
m256 pshufb_m256(m256 a, m256 b) {
#if defined(HAVE_AVX2)
    return _mm256_shuffle_epi8(a, b);
#else
    m256 rv;
    rv.lo = pshufb_m128(a.lo, b.lo);
    rv.hi = pshufb_m128(a.hi, b.hi);
    return rv;
#endif
}

#if defined(HAVE_AVX512)
static really_inline
m512 pshufb_m512(m512 a, m512 b) {
    return _mm512_shuffle_epi8(a, b);
}

static really_inline
m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) {
    return _mm512_maskz_shuffle_epi8(k, a, b);
}
#endif

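/*
 * Descriptive note (the table contents live in simd_utils.c): the byte shift
 * below is a pshufb whose control mask is a 16-byte window of vbs_mask_data
 * selected by amount; the table is laid out so that lanes shifted in are
 * filled with zero.
 */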
static really_inline
m128 variable_byte_shift_m128(m128 in, s32 amount) {
    assert(amount >= -16 && amount <= 16);
    m128 shift_mask = loadu128(vbs_mask_data + 16 - amount);
    return pshufb_m128(in, shift_mask);
}

static really_inline
m128 max_u8_m128(m128 a, m128 b) {
    return _mm_max_epu8(a, b);
}

static really_inline
m128 min_u8_m128(m128 a, m128 b) {
    return _mm_min_epu8(a, b);
}

static really_inline
m128 sadd_u8_m128(m128 a, m128 b) {
    return _mm_adds_epu8(a, b);
}

static really_inline
m128 sub_u8_m128(m128 a, m128 b) {
    return _mm_sub_epi8(a, b);
}

static really_inline
m128 set64x2(u64a hi, u64a lo) {
    return _mm_set_epi64x(hi, lo);
}

/****
 **** 256-bit Primitives
 ****/

#if defined(HAVE_AVX2)

static really_really_inline
m256 lshift64_m256(m256 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
    if (__builtin_constant_p(b)) {
        return _mm256_slli_epi64(a, b);
    }
#endif
    m128 x = _mm_cvtsi32_si128(b);
    return _mm256_sll_epi64(a, x);
}

#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b))

static really_inline
m256 set32x8(u32 in) {
    return _mm256_set1_epi8(in);
}

#define eq256(a, b) _mm256_cmpeq_epi8((a), (b))
#define movemask256(a) ((u32)_mm256_movemask_epi8((a)))

static really_inline
m256 set2x128(m128 a) {
    return _mm256_broadcastsi128_si256(a);
}

#else

static really_really_inline
m256 lshift64_m256(m256 a, int b) {
    m256 rv = a;
    rv.lo = lshift64_m128(rv.lo, b);
    rv.hi = lshift64_m128(rv.hi, b);
    return rv;
}

static really_inline
m256 rshift64_m256(m256 a, int b) {
    m256 rv = a;
    rv.lo = rshift64_m128(rv.lo, b);
    rv.hi = rshift64_m128(rv.hi, b);
    return rv;
}

static really_inline
m256 set32x8(u32 in) {
    m256 rv;
    rv.lo = set16x8((u8) in);
    rv.hi = rv.lo;
    return rv;
}

static really_inline
m256 eq256(m256 a, m256 b) {
    m256 rv;
    rv.lo = eq128(a.lo, b.lo);
    rv.hi = eq128(a.hi, b.hi);
    return rv;
}

static really_inline
u32 movemask256(m256 a) {
    u32 lo_mask = movemask128(a.lo);
    u32 hi_mask = movemask128(a.hi);
    return lo_mask | (hi_mask << 16);
}

static really_inline
m256 set2x128(m128 a) {
    m256 rv = {a, a};
    return rv;
}
#endif

static really_inline m256 zeroes256(void) {
#if defined(HAVE_AVX2)
    return _mm256_setzero_si256();
#else
    m256 rv = {zeroes128(), zeroes128()};
    return rv;
#endif
}

static really_inline m256 ones256(void) {
#if defined(HAVE_AVX2)
    m256 rv = _mm256_set1_epi8(0xFF);
#else
    m256 rv = {ones128(), ones128()};
#endif
    return rv;
}

#if defined(HAVE_AVX2)
static really_inline m256 and256(m256 a, m256 b) {
    return _mm256_and_si256(a, b);
}
#else
static really_inline m256 and256(m256 a, m256 b) {
    m256 rv;
    rv.lo = and128(a.lo, b.lo);
    rv.hi = and128(a.hi, b.hi);
    return rv;
}
#endif

#if defined(HAVE_AVX2)
static really_inline m256 or256(m256 a, m256 b) {
    return _mm256_or_si256(a, b);
}
#else
static really_inline m256 or256(m256 a, m256 b) {
    m256 rv;
    rv.lo = or128(a.lo, b.lo);
    rv.hi = or128(a.hi, b.hi);
    return rv;
}
#endif

#if defined(HAVE_AVX2)
static really_inline m256 xor256(m256 a, m256 b) {
    return _mm256_xor_si256(a, b);
}
#else
static really_inline m256 xor256(m256 a, m256 b) {
    m256 rv;
    rv.lo = xor128(a.lo, b.lo);
    rv.hi = xor128(a.hi, b.hi);
    return rv;
}
#endif

#if defined(HAVE_AVX2)
static really_inline m256 not256(m256 a) {
    return _mm256_xor_si256(a, ones256());
}
#else
static really_inline m256 not256(m256 a) {
    m256 rv;
    rv.lo = not128(a.lo);
    rv.hi = not128(a.hi);
    return rv;
}
#endif

#if defined(HAVE_AVX2)
static really_inline m256 andnot256(m256 a, m256 b) {
    return _mm256_andnot_si256(a, b);
}
#else
static really_inline m256 andnot256(m256 a, m256 b) {
    m256 rv;
    rv.lo = andnot128(a.lo, b.lo);
    rv.hi = andnot128(a.hi, b.hi);
    return rv;
}
#endif

static really_inline int diff256(m256 a, m256 b) {
#if defined(HAVE_AVX2)
    return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1);
#else
    return diff128(a.lo, b.lo) || diff128(a.hi, b.hi);
#endif
}

static really_inline int isnonzero256(m256 a) {
#if defined(HAVE_AVX2)
    return !!diff256(a, zeroes256());
#else
    return isnonzero128(or128(a.lo, a.hi));
#endif
}

/**
 * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit
 * mask indicating which 32-bit words contain differences.
 */
static really_inline u32 diffrich256(m256 a, m256 b) {
#if defined(HAVE_AVX2)
    a = _mm256_cmpeq_epi32(a, b);
    return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF;
#else
    m128 z = zeroes128();
    a.lo = _mm_cmpeq_epi32(a.lo, b.lo);
    a.hi = _mm_cmpeq_epi32(a.hi, b.hi);
    m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z);
    return ~(_mm_movemask_epi8(packed)) & 0xff;
#endif
}

/**
 * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and
 * returns an 8-bit mask indicating which 64-bit words contain differences.
 */
static really_inline u32 diffrich64_256(m256 a, m256 b) {
    u32 d = diffrich256(a, b);
    return (d | (d >> 1)) & 0x55555555;
}

// aligned load
static really_inline m256 load256(const void *ptr) {
    assert(ISALIGNED_N(ptr, alignof(m256)));
#if defined(HAVE_AVX2)
    return _mm256_load_si256((const m256 *)ptr);
#else
    m256 rv = { load128(ptr), load128((const char *)ptr + 16) };
    return rv;
#endif
}

// aligned load of 128-bit value to low and high part of 256-bit value
static really_inline m256 load2x128(const void *ptr) {
#if defined(HAVE_AVX2)
    return set2x128(load128(ptr));
#else
    assert(ISALIGNED_N(ptr, alignof(m128)));
    m256 rv;
    rv.hi = rv.lo = load128(ptr);
    return rv;
#endif
}

static really_inline m256 loadu2x128(const void *ptr) {
    return set2x128(loadu128(ptr));
}

// aligned store
static really_inline void store256(void *ptr, m256 a) {
    assert(ISALIGNED_N(ptr, alignof(m256)));
#if defined(HAVE_AVX2)
    _mm256_store_si256((m256 *)ptr, a);
#else
    ptr = assume_aligned(ptr, 16);
    *(m256 *)ptr = a;
#endif
}

// unaligned load
static really_inline m256 loadu256(const void *ptr) {
#if defined(HAVE_AVX2)
    return _mm256_loadu_si256((const m256 *)ptr);
#else
    m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) };
    return rv;
#endif
}

// unaligned store
static really_inline void storeu256(void *ptr, m256 a) {
#if defined(HAVE_AVX2)
    _mm256_storeu_si256((m256 *)ptr, a);
#else
    storeu128(ptr, a.lo);
    storeu128((char *)ptr + 16, a.hi);
#endif
}

// packed unaligned store of first N bytes
static really_inline
void storebytes256(void *ptr, m256 a, unsigned int n) {
    assert(n <= sizeof(a));
    memcpy(ptr, &a, n);
}

// packed unaligned load of first N bytes, pad with zero
static really_inline
m256 loadbytes256(const void *ptr, unsigned int n) {
    m256 a = zeroes256();
    assert(n <= sizeof(a));
    memcpy(&a, ptr, n);
    return a;
}
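
/*
 * Illustrative sketch (not part of the original API): locating the first
 * occurrence of byte c in a 32-byte block with eq256()/movemask256(). The
 * helper name first_match32 and its -1 "no match" convention are
 * hypothetical; __builtin_ctz assumes a GCC-style compiler.
 */
static really_inline
int first_match32(const void *block, u8 c) {
    m256 data = loadu256(block); // 32 bytes, unaligned is fine
    u32 mask = movemask256(eq256(data, set32x8(c)));
    return mask ? (int)__builtin_ctz(mask) : -1;
}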

static really_inline
m256 mask1bit256(unsigned int n) {
    assert(n < sizeof(m256) * 8);
    u32 mask_idx = ((n % 8) * 64) + 95;
    mask_idx -= n / 8;
    return loadu256(&simd_onebit_masks[mask_idx]);
}

static really_inline
m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) {
#if defined(HAVE_AVX2)
    return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0);
#else
    m256 rv;
    rv.hi = set64x2(hi_1, hi_0);
    rv.lo = set64x2(lo_1, lo_0);
    return rv;
#endif
}

#if !defined(HAVE_AVX2)
// switches on bit N in the given vector.
static really_inline
void setbit256(m256 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo;
    } else {
        sub = &ptr->hi;
        n -= 128;
    }
    setbit128(sub, n);
}

// switches off bit N in the given vector.
static really_inline
void clearbit256(m256 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo;
    } else {
        sub = &ptr->hi;
        n -= 128;
    }
    clearbit128(sub, n);
}

// tests bit N in the given vector.
static really_inline
char testbit256(m256 val, unsigned int n) {
    assert(n < sizeof(val) * 8);
    m128 sub;
    if (n < 128) {
        sub = val.lo;
    } else {
        sub = val.hi;
        n -= 128;
    }
    return testbit128(sub, n);
}

static really_really_inline
m128 movdq_hi(m256 x) {
    return x.hi;
}

static really_really_inline
m128 movdq_lo(m256 x) {
    return x.lo;
}

static really_inline
m256 combine2x128(m128 hi, m128 lo) {
    m256 rv = {lo, hi};
    return rv;
}

#else // AVX2

// switches on bit N in the given vector.
static really_inline
void setbit256(m256 *ptr, unsigned int n) {
    *ptr = or256(mask1bit256(n), *ptr);
}

static really_inline
void clearbit256(m256 *ptr, unsigned int n) {
    *ptr = andnot256(mask1bit256(n), *ptr);
}

// tests bit N in the given vector.
static really_inline
char testbit256(m256 val, unsigned int n) {
    const m256 mask = mask1bit256(n);
    return !_mm256_testz_si256(mask, val);
}

static really_really_inline
m128 movdq_hi(m256 x) {
    return _mm256_extracti128_si256(x, 1);
}

static really_really_inline
m128 movdq_lo(m256 x) {
    return _mm256_extracti128_si256(x, 0);
}

#define cast256to128(a) _mm256_castsi256_si128(a)
#define cast128to256(a) _mm256_castsi128_si256(a)
#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E)
#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm)
#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed)
#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed)
#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2)
#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4)
#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a))
#define extractlow32from256(a) movd(cast256to128(a))
#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b)
#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b)
#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset)

static really_inline
m256 combine2x128(m128 hi, m128 lo) {
#if defined(_mm256_set_m128i)
    return _mm256_set_m128i(hi, lo);
#else
    return insert128to256(cast128to256(lo), hi, 1);
#endif
}
#endif // AVX2

#if defined(HAVE_AVX512)
#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm)
#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b)
#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b)
#define set2x256(a) _mm512_broadcast_i64x4(a)
#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a)
#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a)
#endif

/****
 **** 384-bit Primitives
 ****/

static really_inline m384 and384(m384 a, m384 b) {
    m384 rv;
    rv.lo = and128(a.lo, b.lo);
    rv.mid = and128(a.mid, b.mid);
    rv.hi = and128(a.hi, b.hi);
    return rv;
}

static really_inline m384 or384(m384 a, m384 b) {
    m384 rv;
    rv.lo = or128(a.lo, b.lo);
    rv.mid = or128(a.mid, b.mid);
    rv.hi = or128(a.hi, b.hi);
    return rv;
}

static really_inline m384 xor384(m384 a, m384 b) {
    m384 rv;
    rv.lo = xor128(a.lo, b.lo);
    rv.mid = xor128(a.mid, b.mid);
    rv.hi = xor128(a.hi, b.hi);
    return rv;
}

static really_inline m384 not384(m384 a) {
    m384 rv;
    rv.lo = not128(a.lo);
    rv.mid = not128(a.mid);
    rv.hi = not128(a.hi);
    return rv;
}

static really_inline m384 andnot384(m384 a, m384 b) {
    m384 rv;
    rv.lo = andnot128(a.lo, b.lo);
    rv.mid = andnot128(a.mid, b.mid);
    rv.hi = andnot128(a.hi, b.hi);
    return rv;
}

static really_really_inline
m384 lshift64_m384(m384 a, unsigned b) {
    m384 rv;
    rv.lo = lshift64_m128(a.lo, b);
    rv.mid = lshift64_m128(a.mid, b);
    rv.hi = lshift64_m128(a.hi, b);
    return rv;
}

static really_inline m384 zeroes384(void) {
    m384 rv = {zeroes128(), zeroes128(), zeroes128()};
    return rv;
}

static really_inline m384 ones384(void) {
    m384 rv = {ones128(), ones128(), ones128()};
    return rv;
}

static really_inline int diff384(m384 a, m384 b) {
    return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi);
}

static really_inline int isnonzero384(m384 a) {
    return isnonzero128(or128(or128(a.lo, a.mid), a.hi));
}

/**
 * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit
 * mask indicating which 32-bit words contain differences.
 */
static really_inline u32 diffrich384(m384 a, m384 b) {
    m128 z = zeroes128();
    a.lo = _mm_cmpeq_epi32(a.lo, b.lo);
    a.mid = _mm_cmpeq_epi32(a.mid, b.mid);
    a.hi = _mm_cmpeq_epi32(a.hi, b.hi);
    m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid),
                                  _mm_packs_epi32(a.hi, z));
    return ~(_mm_movemask_epi8(packed)) & 0xfff;
}

/**
 * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and
 * returns a 12-bit mask indicating which 64-bit words contain differences.
 */
static really_inline u32 diffrich64_384(m384 a, m384 b) {
    u32 d = diffrich384(a, b);
    return (d | (d >> 1)) & 0x55555555;
}

// aligned load
static really_inline m384 load384(const void *ptr) {
    assert(ISALIGNED_16(ptr));
    m384 rv = { load128(ptr), load128((const char *)ptr + 16),
                load128((const char *)ptr + 32) };
    return rv;
}

// aligned store
static really_inline void store384(void *ptr, m384 a) {
    assert(ISALIGNED_16(ptr));
    ptr = assume_aligned(ptr, 16);
    *(m384 *)ptr = a;
}

// unaligned load
static really_inline m384 loadu384(const void *ptr) {
    m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16),
                loadu128((const char *)ptr + 32) };
    return rv;
}

// packed unaligned store of first N bytes
static really_inline
void storebytes384(void *ptr, m384 a, unsigned int n) {
    assert(n <= sizeof(a));
    memcpy(ptr, &a, n);
}

// packed unaligned load of first N bytes, pad with zero
static really_inline
m384 loadbytes384(const void *ptr, unsigned int n) {
    m384 a = zeroes384();
    assert(n <= sizeof(a));
    memcpy(&a, ptr, n);
    return a;
}

// switches on bit N in the given vector.
static really_inline
void setbit384(m384 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo;
    } else if (n < 256) {
        sub = &ptr->mid;
    } else {
        sub = &ptr->hi;
    }
    setbit128(sub, n % 128);
}

// switches off bit N in the given vector.
static really_inline
void clearbit384(m384 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo;
    } else if (n < 256) {
        sub = &ptr->mid;
    } else {
        sub = &ptr->hi;
    }
    clearbit128(sub, n % 128);
}

// tests bit N in the given vector.
static really_inline
char testbit384(m384 val, unsigned int n) {
    assert(n < sizeof(val) * 8);
    m128 sub;
    if (n < 128) {
        sub = val.lo;
    } else if (n < 256) {
        sub = val.mid;
    } else {
        sub = val.hi;
    }
    return testbit128(sub, n % 128);
}

/****
 **** 512-bit Primitives
 ****/

#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b))
#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b))

static really_inline
m512 zeroes512(void) {
#if defined(HAVE_AVX512)
    return _mm512_setzero_si512();
#else
    m512 rv = {zeroes256(), zeroes256()};
    return rv;
#endif
}

static really_inline
m512 ones512(void) {
#if defined(HAVE_AVX512)
    return _mm512_set1_epi8(0xFF);
    //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512());
#else
    m512 rv = {ones256(), ones256()};
    return rv;
#endif
}

#if defined(HAVE_AVX512)
static really_inline
m512 set64x8(u8 a) {
    return _mm512_set1_epi8(a);
}

static really_inline
m512 set8x64(u64a a) {
    return _mm512_set1_epi64(a);
}

static really_inline
m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0,
               u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) {
    return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0,
                            lo_3, lo_2, lo_1, lo_0);
}

static really_inline
m512 swap256in512(m512 a) {
    m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL);
    return vpermq512(idx, a);
}
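
/*
 * Note: set512_64() fills elements 7..0 in argument order, so the index
 * vector above is {4, 5, 6, 7, 0, 1, 2, 3} and the permute exchanges the low
 * and high 256-bit halves of a.
 */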

static really_inline
m512 set4x128(m128 a) {
    return _mm512_broadcast_i32x4(a);
}
#endif

static really_inline
m512 and512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return _mm512_and_si512(a, b);
#else
    m512 rv;
    rv.lo = and256(a.lo, b.lo);
    rv.hi = and256(a.hi, b.hi);
    return rv;
#endif
}

static really_inline
m512 or512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return _mm512_or_si512(a, b);
#else
    m512 rv;
    rv.lo = or256(a.lo, b.lo);
    rv.hi = or256(a.hi, b.hi);
    return rv;
#endif
}

static really_inline
m512 xor512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return _mm512_xor_si512(a, b);
#else
    m512 rv;
    rv.lo = xor256(a.lo, b.lo);
    rv.hi = xor256(a.hi, b.hi);
    return rv;
#endif
}

static really_inline
m512 not512(m512 a) {
#if defined(HAVE_AVX512)
    return _mm512_xor_si512(a, ones512());
#else
    m512 rv;
    rv.lo = not256(a.lo);
    rv.hi = not256(a.hi);
    return rv;
#endif
}

static really_inline
m512 andnot512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return _mm512_andnot_si512(a, b);
#else
    m512 rv;
    rv.lo = andnot256(a.lo, b.lo);
    rv.hi = andnot256(a.hi, b.hi);
    return rv;
#endif
}

#if defined(HAVE_AVX512)
static really_really_inline
m512 lshift64_m512(m512 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
    if (__builtin_constant_p(b)) {
        return _mm512_slli_epi64(a, b);
    }
#endif
    m128 x = _mm_cvtsi32_si128(b);
    return _mm512_sll_epi64(a, x);
}
#else
static really_really_inline
m512 lshift64_m512(m512 a, unsigned b) {
    m512 rv;
    rv.lo = lshift64_m256(a.lo, b);
    rv.hi = lshift64_m256(a.hi, b);
    return rv;
}
#endif

#if defined(HAVE_AVX512)
#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b))
#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed)
#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed)
#endif

#if !defined(_MM_CMPINT_NE)
#define _MM_CMPINT_NE 0x4
#endif

static really_inline
int diff512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE);
#else
    return diff256(a.lo, b.lo) || diff256(a.hi, b.hi);
#endif
}

static really_inline
int isnonzero512(m512 a) {
#if defined(HAVE_AVX512)
    return diff512(a, zeroes512());
#elif defined(HAVE_AVX2)
    m256 x = or256(a.lo, a.hi);
    return !!diff256(x, zeroes256());
#else
    m128 x = or128(a.lo.lo, a.lo.hi);
    m128 y = or128(a.hi.lo, a.hi.hi);
    return isnonzero128(or128(x, y));
#endif
}

/**
 * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit
 * mask indicating which 32-bit words contain differences.
 */
static really_inline
u32 diffrich512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE);
#elif defined(HAVE_AVX2)
    return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8);
#else
    a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo);
    a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi);
    a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo);
    a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi);
    m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi),
                                  _mm_packs_epi32(a.hi.lo, a.hi.hi));
    return ~(_mm_movemask_epi8(packed)) & 0xffff;
#endif
}

/**
 * "Rich" version of diff512(), 64-bit variant. Takes two vectors a and b and
 * returns a 16-bit mask indicating which 64-bit words contain differences.
 */
static really_inline
u32 diffrich64_512(m512 a, m512 b) {
    //TODO: cmp_epi64?
    u32 d = diffrich512(a, b);
    return (d | (d >> 1)) & 0x55555555;
}

// aligned load
static really_inline
m512 load512(const void *ptr) {
#if defined(HAVE_AVX512)
    return _mm512_load_si512(ptr);
#else
    assert(ISALIGNED_N(ptr, alignof(m256)));
    m512 rv = { load256(ptr), load256((const char *)ptr + 32) };
    return rv;
#endif
}

// aligned store
static really_inline
void store512(void *ptr, m512 a) {
    assert(ISALIGNED_N(ptr, alignof(m512)));
#if defined(HAVE_AVX512)
    return _mm512_store_si512(ptr, a);
#elif defined(HAVE_AVX2)
    m512 *x = (m512 *)ptr;
    store256(&x->lo, a.lo);
    store256(&x->hi, a.hi);
#else
    ptr = assume_aligned(ptr, 16);
    *(m512 *)ptr = a;
#endif
}

// unaligned load
static really_inline
m512 loadu512(const void *ptr) {
#if defined(HAVE_AVX512)
    return _mm512_loadu_si512(ptr);
#else
    m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) };
    return rv;
#endif
}

#if defined(HAVE_AVX512)
static really_inline
m512 loadu_maskz_m512(__mmask64 k, const void *ptr) {
    return _mm512_maskz_loadu_epi8(k, ptr);
}

static really_inline
m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) {
    return _mm512_mask_loadu_epi8(src, k, ptr);
}

static really_inline
m512 set_mask_m512(__mmask64 k) {
    return _mm512_movm_epi8(k);
}
#endif

// packed unaligned store of first N bytes
static really_inline
void storebytes512(void *ptr, m512 a, unsigned int n) {
    assert(n <= sizeof(a));
    memcpy(ptr, &a, n);
}

// packed unaligned load of first N bytes, pad with zero
static really_inline
m512 loadbytes512(const void *ptr, unsigned int n) {
    m512 a = zeroes512();
    assert(n <= sizeof(a));
    memcpy(&a, ptr, n);
    return a;
}

static really_inline
m512 mask1bit512(unsigned int n) {
    assert(n < sizeof(m512) * 8);
    u32 mask_idx = ((n % 8) * 64) + 95;
    mask_idx -= n / 8;
    return loadu512(&simd_onebit_masks[mask_idx]);
}

// switches on bit N in the given vector.
static really_inline
void setbit512(m512 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
#if !defined(HAVE_AVX2)
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo.lo;
    } else if (n < 256) {
        sub = &ptr->lo.hi;
    } else if (n < 384) {
        sub = &ptr->hi.lo;
    } else {
        sub = &ptr->hi.hi;
    }
    setbit128(sub, n % 128);
#elif defined(HAVE_AVX512)
    *ptr = or512(mask1bit512(n), *ptr);
#else
    m256 *sub;
    if (n < 256) {
        sub = &ptr->lo;
    } else {
        sub = &ptr->hi;
        n -= 256;
    }
    setbit256(sub, n);
#endif
}

// switches off bit N in the given vector.
static really_inline
void clearbit512(m512 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
#if !defined(HAVE_AVX2)
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo.lo;
    } else if (n < 256) {
        sub = &ptr->lo.hi;
    } else if (n < 384) {
        sub = &ptr->hi.lo;
    } else {
        sub = &ptr->hi.hi;
    }
    clearbit128(sub, n % 128);
#elif defined(HAVE_AVX512)
    *ptr = andnot512(mask1bit512(n), *ptr);
#else
    m256 *sub;
    if (n < 256) {
        sub = &ptr->lo;
    } else {
        sub = &ptr->hi;
        n -= 256;
    }
    clearbit256(sub, n);
#endif
}

// tests bit N in the given vector.
static really_inline
char testbit512(m512 val, unsigned int n) {
    assert(n < sizeof(val) * 8);
#if !defined(HAVE_AVX2)
    m128 sub;
    if (n < 128) {
        sub = val.lo.lo;
    } else if (n < 256) {
        sub = val.lo.hi;
    } else if (n < 384) {
        sub = val.hi.lo;
    } else {
        sub = val.hi.hi;
    }
    return testbit128(sub, n % 128);
#elif defined(HAVE_AVX512)
    const m512 mask = mask1bit512(n);
    return !!_mm512_test_epi8_mask(mask, val);
#else
    m256 sub;
    if (n < 256) {
        sub = val.lo;
    } else {
        sub = val.hi;
        n -= 256;
    }
    return testbit256(sub, n);
#endif
}

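/*
 * Illustrative sketch (not part of the original API): the same single-bit
 * round-trip as the 128-bit example, at 512-bit width; it exercises whichever
 * of the code paths above was compiled in. The helper name is hypothetical.
 */
static really_inline
char toggle_check512(unsigned int n) {
    m512 v = zeroes512();
    setbit512(&v, n);
    char was_set = testbit512(v, n); // expected: non-zero
    clearbit512(&v, n);
    return was_set && !isnonzero512(v); // vector is all-zero again
}
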
#endif