// [Blend2D]
// 2D Vector Graphics Powered by a JIT Compiler.
//
// [License]
// Zlib - See LICENSE.md file in the package.

#ifndef BLEND2D_BLSIMD_X86_P_H
#define BLEND2D_BLSIMD_X86_P_H

#include "./blsupport_p.h"
#include "./bltables_p.h"

#if defined(_MSC_VER)
  #include <intrin.h>
#endif

#if defined(BL_TARGET_OPT_SSE)
  #include <xmmintrin.h>
#endif

#if defined(BL_TARGET_OPT_SSE2)
  #include <emmintrin.h>
#endif

#if defined(BL_TARGET_OPT_SSE3) && !defined(_MSC_VER)
  #include <pmmintrin.h>
#endif

#if defined(BL_TARGET_OPT_SSSE3)
  #include <tmmintrin.h>
#endif

#if defined(BL_TARGET_OPT_SSE4_1)
  #include <smmintrin.h>
#endif

#if defined(BL_TARGET_OPT_SSE4_2)
  #include <nmmintrin.h>
#endif

#if defined(BL_TARGET_OPT_AVX) || defined(BL_TARGET_OPT_AVX2)
  #include <immintrin.h>
#endif

#if defined(BL_TARGET_OPT_NEON)
  #include <arm_neon.h>
#endif

//! \cond INTERNAL
//! \addtogroup blend2d_internal
//! \{

//! The SIMD namespace contains helper functions that wrap SIMD intrinsics.
//! The names of these functions correspond to the names of functions used by
//! the pipeline generator (BLPipe).
namespace SIMD {

// ============================================================================
// [BLSIMD - Features]
// ============================================================================

#if defined(BL_TARGET_OPT_AVX2)
  #define BL_TARGET_SIMD_I 256
  #define BL_TARGET_SIMD_F 256
  #define BL_TARGET_SIMD_D 256
#elif defined(BL_TARGET_OPT_AVX)
  #define BL_TARGET_SIMD_I 128
  #define BL_TARGET_SIMD_F 256
  #define BL_TARGET_SIMD_D 256
#elif defined(BL_TARGET_OPT_SSE2)
  #define BL_TARGET_SIMD_I 128
  #define BL_TARGET_SIMD_F 128
  #define BL_TARGET_SIMD_D 128
#else
  #define BL_TARGET_SIMD_I 0
  #define BL_TARGET_SIMD_F 0
  #define BL_TARGET_SIMD_D 0
#endif
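
// A minimal illustration (not part of this header's API) of how these width
// macros are meant to be consumed - compile a wider integer path only when
// the target guarantees 256-bit integer SIMD:
//
//   #if BL_TARGET_SIMD_I >= 256
//   // ... process the loop using I256 vectors ...
//   #else
//   // ... fall back to the 128-bit I128 path ...
//   #endif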

// ============================================================================
// [BLSIMD - Types]
// ============================================================================

#if defined(BL_TARGET_OPT_SSE2)
typedef __m128i I128;
typedef __m128 F128;
typedef __m128d D128;
#endif

// 256-bit types (including integers) are accessible through AVX as AVX also
// includes conversion instructions between integer and FP types.
#if defined(BL_TARGET_OPT_AVX)
typedef __m256i I256;
typedef __m256 F256;
typedef __m256d D256;
#endif

// Must be in an anonymous namespace.
namespace {

// ============================================================================
// [BLSIMD - Cast]
// ============================================================================

template<typename Out, typename In>
BL_INLINE const Out& v_const_as(const In* c) noexcept {
  return *reinterpret_cast<const Out*>(c);
}

template<typename DstT, typename SrcT>
BL_INLINE DstT vcast(const SrcT& x) noexcept { return x; }

#if defined(BL_TARGET_OPT_SSE2)
template<> BL_INLINE F128 vcast(const I128& x) noexcept { return _mm_castsi128_ps(x); }
template<> BL_INLINE D128 vcast(const I128& x) noexcept { return _mm_castsi128_pd(x); }
template<> BL_INLINE I128 vcast(const F128& x) noexcept { return _mm_castps_si128(x); }
template<> BL_INLINE D128 vcast(const F128& x) noexcept { return _mm_castps_pd(x); }
template<> BL_INLINE I128 vcast(const D128& x) noexcept { return _mm_castpd_si128(x); }
template<> BL_INLINE F128 vcast(const D128& x) noexcept { return _mm_castpd_ps(x); }
#endif

#if defined(BL_TARGET_OPT_AVX)
template<> BL_INLINE I128 vcast(const I256& x) noexcept { return _mm256_castsi256_si128(x); }
template<> BL_INLINE I256 vcast(const I128& x) noexcept { return _mm256_castsi128_si256(x); }

template<> BL_INLINE F128 vcast(const F256& x) noexcept { return _mm256_castps256_ps128(x); }
template<> BL_INLINE F256 vcast(const F128& x) noexcept { return _mm256_castps128_ps256(x); }

template<> BL_INLINE D128 vcast(const D256& x) noexcept { return _mm256_castpd256_pd128(x); }
template<> BL_INLINE D256 vcast(const D128& x) noexcept { return _mm256_castpd128_pd256(x); }

template<> BL_INLINE D256 vcast(const F256& x) noexcept { return _mm256_castps_pd(x); }
template<> BL_INLINE F256 vcast(const D256& x) noexcept { return _mm256_castpd_ps(x); }

template<> BL_INLINE F256 vcast(const I256& x) noexcept { return _mm256_castsi256_ps(x); }
template<> BL_INLINE I256 vcast(const F256& x) noexcept { return _mm256_castps_si256(x); }

template<> BL_INLINE D256 vcast(const I256& x) noexcept { return _mm256_castsi256_pd(x); }
template<> BL_INLINE I256 vcast(const D256& x) noexcept { return _mm256_castpd_si256(x); }
#endif
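
// Usage sketch: `vcast` only reinterprets bits, it never converts values.
// Casting 256 -> 128 returns the low half; casting 128 -> 256 leaves the high
// half undefined. For example (`y256` is a hypothetical I256 value):
//
//   I128 lo = vcast<I128>(y256);         // low 128 bits of a 256-bit vector.
//   F128 f  = vcast<F128>(vzeroi128());  // reinterpret an integer vector.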

// ============================================================================
// [BLSIMD - I128]
// ============================================================================

#if defined(BL_TARGET_OPT_SSE2)
BL_INLINE I128 vzeroi128() noexcept { return _mm_setzero_si128(); }

BL_INLINE I128 vseti128i8(int8_t x) noexcept { return _mm_set1_epi8(x); }
BL_INLINE I128 vseti128i16(int16_t x) noexcept { return _mm_set1_epi16(x); }
BL_INLINE I128 vseti128i32(int32_t x) noexcept { return _mm_set1_epi32(x); }

BL_INLINE I128 vseti128i32(int32_t x1, int32_t x0) noexcept { return _mm_set_epi32(x1, x0, x1, x0); }
BL_INLINE I128 vseti128i32(int32_t x3, int32_t x2, int32_t x1, int32_t x0) noexcept { return _mm_set_epi32(x3, x2, x1, x0); }

BL_INLINE I128 vseti128i64(int64_t x) noexcept {
#if BL_TARGET_ARCH_BITS >= 64
  return _mm_set1_epi64x(x);
#else
  return vseti128i32(int32_t(uint64_t(x) >> 32), int32_t(x & 0xFFFFFFFFu));
#endif
}

BL_INLINE I128 vseti128i64(int64_t x1, int64_t x0) noexcept {
  return vseti128i32(int32_t(uint64_t(x1) >> 32), int32_t(x1 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x0) >> 32), int32_t(x0 & 0xFFFFFFFFu));
}

BL_INLINE I128 vseti128u8(uint8_t x) noexcept { return vseti128i8(int8_t(x)); }
BL_INLINE I128 vseti128u16(uint16_t x) noexcept { return vseti128i16(int16_t(x)); }
BL_INLINE I128 vseti128u32(uint32_t x) noexcept { return vseti128i32(int32_t(x)); }
BL_INLINE I128 vseti128u32(uint32_t x1, uint32_t x0) noexcept { return vseti128i32(int32_t(x1), int32_t(x0), int32_t(x1), int32_t(x0)); }
BL_INLINE I128 vseti128u32(uint32_t x3, uint32_t x2, uint32_t x1, uint32_t x0) noexcept { return vseti128i32(int32_t(x3), int32_t(x2), int32_t(x1), int32_t(x0)); }
BL_INLINE I128 vseti128u64(uint64_t x) noexcept { return vseti128i64(int64_t(x)); }
BL_INLINE I128 vseti128u64(uint64_t x1, uint64_t x0) noexcept { return vseti128i64(int64_t(x1), int64_t(x0)); }

BL_INLINE I128 vcvti32i128(int32_t x) noexcept { return _mm_cvtsi32_si128(int(x)); }
BL_INLINE I128 vcvtu32i128(uint32_t x) noexcept { return _mm_cvtsi32_si128(int(x)); }

BL_INLINE int32_t vcvti128i32(const I128& x) noexcept { return int32_t(_mm_cvtsi128_si32(x)); }
BL_INLINE uint32_t vcvti128u32(const I128& x) noexcept { return uint32_t(_mm_cvtsi128_si32(x)); }

BL_INLINE I128 vcvti64i128(int64_t x) noexcept {
#if BL_TARGET_ARCH_BITS >= 64
  return _mm_cvtsi64_si128(x);
#else
  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&x));
#endif
}

BL_INLINE int64_t vcvti128i64(const I128& x) noexcept {
#if BL_TARGET_ARCH_BITS >= 64
  return int64_t(_mm_cvtsi128_si64(x));
#else
  int64_t result;
  _mm_storel_epi64(reinterpret_cast<__m128i*>(&result), x);
  return result;
#endif
}

BL_INLINE I128 vcvtu64i128(uint64_t x) noexcept { return vcvti64i128(int64_t(x)); }
BL_INLINE uint64_t vcvti128u64(const I128& x) noexcept { return uint64_t(vcvti128i64(x)); }

template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
BL_INLINE I128 vswizli16(const I128& x) noexcept { return _mm_shufflelo_epi16(x, _MM_SHUFFLE(A, B, C, D)); }
template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
BL_INLINE I128 vswizhi16(const I128& x) noexcept { return _mm_shufflehi_epi16(x, _MM_SHUFFLE(A, B, C, D)); }

template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
BL_INLINE I128 vswizi16(const I128& x) noexcept { return vswizhi16<A, B, C, D>(vswizli16<A, B, C, D>(x)); }
template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
BL_INLINE I128 vswizi32(const I128& x) noexcept { return _mm_shuffle_epi32(x, _MM_SHUFFLE(A, B, C, D)); }
template<int A, int B>
BL_INLINE I128 vswizi64(const I128& x) noexcept { return vswizi32<A*2 + 1, A*2, B*2 + 1, B*2>(x); }
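
// Note on swizzle ordering: template arguments follow `_MM_SHUFFLE`, i.e. they
// are given from the most significant element to the least significant one,
// and each argument selects a source element. For example `vswizi32<3, 2, 1, 0>(x)`
// is identity and `vswizi32<0, 1, 2, 3>(x)` reverses all four 32-bit elements.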

#if defined(BL_TARGET_OPT_SSSE3)
BL_INLINE I128 vpshufb(const I128& x, const I128& y) noexcept { return _mm_shuffle_epi8(x, y); }

template<int N_BYTES>
BL_INLINE I128 vpalignr(const I128& x, const I128& y) noexcept { return _mm_alignr_epi8(x, y, N_BYTES); }
#endif

BL_INLINE I128 vswapi64(const I128& x) noexcept { return vswizi64<0, 1>(x); }
BL_INLINE I128 vdupli64(const I128& x) noexcept { return vswizi64<0, 0>(x); }
BL_INLINE I128 vduphi64(const I128& x) noexcept { return vswizi64<1, 1>(x); }

BL_INLINE I128 vmovli64u8u16(const I128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1)
  return _mm_cvtepu8_epi16(x);
#else
  return _mm_unpacklo_epi8(x, _mm_setzero_si128());
#endif
}

BL_INLINE I128 vmovli64u16u32(const I128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1)
  return _mm_cvtepu16_epi32(x);
#else
  return _mm_unpacklo_epi16(x, _mm_setzero_si128());
#endif
}

BL_INLINE I128 vmovli64u32u64(const I128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1)
  return _mm_cvtepu32_epi64(x);
#else
  return _mm_unpacklo_epi32(x, _mm_setzero_si128());
#endif
}

BL_INLINE I128 vmovhi64u8u16(const I128& x) noexcept { return _mm_unpackhi_epi8(x, _mm_setzero_si128()); }
BL_INLINE I128 vmovhi64u16u32(const I128& x) noexcept { return _mm_unpackhi_epi16(x, _mm_setzero_si128()); }
BL_INLINE I128 vmovhi64u32u64(const I128& x) noexcept { return _mm_unpackhi_epi32(x, _mm_setzero_si128()); }

BL_INLINE I128 vpacki16i8(const I128& x, const I128& y) noexcept { return _mm_packs_epi16(x, y); }
BL_INLINE I128 vpacki16u8(const I128& x, const I128& y) noexcept { return _mm_packus_epi16(x, y); }
BL_INLINE I128 vpacki32i16(const I128& x, const I128& y) noexcept { return _mm_packs_epi32(x, y); }

BL_INLINE I128 vpacki16i8(const I128& x) noexcept { return vpacki16i8(x, x); }
BL_INLINE I128 vpacki16u8(const I128& x) noexcept { return vpacki16u8(x, x); }
BL_INLINE I128 vpacki32i16(const I128& x) noexcept { return vpacki32i16(x, x); }

BL_INLINE I128 vpacki32u16(const I128& x, const I128& y) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1)
  return _mm_packus_epi32(x, y);
#else
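  // There is no `packus_epi32` before SSE4.1. Sign-extending the low 16 bits
  // of each 32-bit element makes the following signed pack reproduce those
  // low 16 bits exactly, which matches `packus` for inputs in the 0..65535
  // range (the range callers are expected to provide).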
  I128 xShifted = _mm_srai_epi32(_mm_slli_epi32(x, 16), 16);
  I128 yShifted = _mm_srai_epi32(_mm_slli_epi32(y, 16), 16);
  return _mm_packs_epi32(xShifted, yShifted);
#endif
}

BL_INLINE I128 vpacki32u16(const I128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1)
  return vpacki32u16(x, x);
#else
  I128 xShifted = _mm_srai_epi32(_mm_slli_epi32(x, 16), 16);
  return _mm_packs_epi32(xShifted, xShifted);
#endif
}

BL_INLINE I128 vpacki32i8(const I128& x) noexcept { return vpacki16i8(vpacki32i16(x)); }
BL_INLINE I128 vpacki32i8(const I128& x, const I128& y) noexcept { return vpacki16i8(vpacki32i16(x, y)); }
BL_INLINE I128 vpacki32i8(const I128& x, const I128& y, const I128& z, const I128& w) noexcept { return vpacki16i8(vpacki32i16(x, y), vpacki32i16(z, w)); }

BL_INLINE I128 vpacki32u8(const I128& x) noexcept { return vpacki16u8(vpacki32i16(x)); }
BL_INLINE I128 vpacki32u8(const I128& x, const I128& y) noexcept { return vpacki16u8(vpacki32i16(x, y)); }
BL_INLINE I128 vpacki32u8(const I128& x, const I128& y, const I128& z, const I128& w) noexcept { return vpacki16u8(vpacki32i16(x, y), vpacki32i16(z, w)); }

// These assume that HI bytes of all inputs are always zero, so the implementation
// can decide between packing with signed/unsigned saturation or vector swizzling.
BL_INLINE I128 vpackzzwb(const I128& x) noexcept { return vpacki16u8(x); }
BL_INLINE I128 vpackzzwb(const I128& x, const I128& y) noexcept { return vpacki16u8(x, y); }

BL_INLINE I128 vpackzzdw(const I128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1) || !defined(BL_TARGET_OPT_SSSE3)
  return vpacki32u16(x);
#else
  return vpshufb(x, v_const_as<I128>(blCommonTable.i128_pshufb_u32_to_u16_lo));
#endif
}

BL_INLINE I128 vpackzzdw(const I128& x, const I128& y) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1) || !defined(BL_TARGET_OPT_SSSE3)
  return vpacki32u16(x, y);
#else
  I128 xLo = vpshufb(x, v_const_as<I128>(blCommonTable.i128_pshufb_u32_to_u16_lo));
  I128 yLo = vpshufb(y, v_const_as<I128>(blCommonTable.i128_pshufb_u32_to_u16_lo));
  return _mm_unpacklo_epi64(xLo, yLo);
#endif
}

BL_INLINE I128 vpackzzdb(const I128& x) noexcept {
#if defined(BL_TARGET_OPT_SSSE3)
  return vpshufb(x, v_const_as<I128>(blCommonTable.i128_pshufb_u32_to_u8_lo));
#else
  return vpacki16u8(vpacki32i16(x));
#endif
}

BL_INLINE I128 vpackzzdb(const I128& x, const I128& y) noexcept { return vpacki16u8(vpacki32i16(x, y)); }
BL_INLINE I128 vpackzzdb(const I128& x, const I128& y, const I128& z, const I128& w) noexcept { return vpacki16u8(vpacki32i16(x, y), vpacki32i16(z, w)); }
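
// Usage sketch (hypothetical `src`, `dst`, and `alpha` values): packing
// zero-extended channels back to bytes after 16-bit arithmetic, assuming the
// HI bytes stay zero throughout:
//
//   I128 px = vmovli64u8u16(vloadi128_32(src)); // 4 x u8  -> 4 x u16.
//   px = vdiv255u16(vmulu16(px, alpha));        // 16-bit math, HI bytes zero.
//   vstorei32(dst, vpackzzwb(px));              // 4 x u16 -> 4 x u8.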

BL_INLINE I128 vunpackli8(const I128& x, const I128& y) noexcept { return _mm_unpacklo_epi8(x, y); }
BL_INLINE I128 vunpackhi8(const I128& x, const I128& y) noexcept { return _mm_unpackhi_epi8(x, y); }

BL_INLINE I128 vunpackli16(const I128& x, const I128& y) noexcept { return _mm_unpacklo_epi16(x, y); }
BL_INLINE I128 vunpackhi16(const I128& x, const I128& y) noexcept { return _mm_unpackhi_epi16(x, y); }

BL_INLINE I128 vunpackli32(const I128& x, const I128& y) noexcept { return _mm_unpacklo_epi32(x, y); }
BL_INLINE I128 vunpackhi32(const I128& x, const I128& y) noexcept { return _mm_unpackhi_epi32(x, y); }

BL_INLINE I128 vunpackli64(const I128& x, const I128& y) noexcept { return _mm_unpacklo_epi64(x, y); }
BL_INLINE I128 vunpackhi64(const I128& x, const I128& y) noexcept { return _mm_unpackhi_epi64(x, y); }

BL_INLINE I128 vor(const I128& x, const I128& y) noexcept { return _mm_or_si128(x, y); }
BL_INLINE I128 vxor(const I128& x, const I128& y) noexcept { return _mm_xor_si128(x, y); }
BL_INLINE I128 vand(const I128& x, const I128& y) noexcept { return _mm_and_si128(x, y); }
BL_INLINE I128 vandnot_a(const I128& x, const I128& y) noexcept { return _mm_andnot_si128(x, y); }
BL_INLINE I128 vandnot_b(const I128& x, const I128& y) noexcept { return _mm_andnot_si128(y, x); }
BL_INLINE I128 vblendmask(const I128& x, const I128& y, const I128& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); }

//! Blend BITs or BYTEs, taking advantage of `pblendvb` (SSE4.1), if possible.
BL_INLINE I128 vblendx(const I128& x, const I128& y, const I128& mask) noexcept {
#if defined(BL_TARGET_OPT_SSE4_1)
  return _mm_blendv_epi8(x, y, mask);
#else
  return vblendmask(x, y, mask);
#endif
}
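
// Blend semantics: elements of `y` are selected where `mask` is set, elements
// of `x` where it is clear. A minimal sketch that clamps hypothetical values:
//
//   I128 gt = vcmpgti32(val, limit);        // all-ones where val > limit.
//   I128 clamped = vblendx(val, limit, gt); // take limit where val > limit.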

BL_INLINE I128 vaddi8(const I128& x, const I128& y) noexcept { return _mm_add_epi8(x, y); }
BL_INLINE I128 vaddi16(const I128& x, const I128& y) noexcept { return _mm_add_epi16(x, y); }
BL_INLINE I128 vaddi32(const I128& x, const I128& y) noexcept { return _mm_add_epi32(x, y); }
BL_INLINE I128 vaddi64(const I128& x, const I128& y) noexcept { return _mm_add_epi64(x, y); }

BL_INLINE I128 vaddsi8(const I128& x, const I128& y) noexcept { return _mm_adds_epi8(x, y); }
BL_INLINE I128 vaddsu8(const I128& x, const I128& y) noexcept { return _mm_adds_epu8(x, y); }
BL_INLINE I128 vaddsi16(const I128& x, const I128& y) noexcept { return _mm_adds_epi16(x, y); }
BL_INLINE I128 vaddsu16(const I128& x, const I128& y) noexcept { return _mm_adds_epu16(x, y); }

BL_INLINE I128 vsubi8(const I128& x, const I128& y) noexcept { return _mm_sub_epi8(x, y); }
BL_INLINE I128 vsubi16(const I128& x, const I128& y) noexcept { return _mm_sub_epi16(x, y); }
BL_INLINE I128 vsubi32(const I128& x, const I128& y) noexcept { return _mm_sub_epi32(x, y); }
BL_INLINE I128 vsubi64(const I128& x, const I128& y) noexcept { return _mm_sub_epi64(x, y); }

BL_INLINE I128 vsubsi8(const I128& x, const I128& y) noexcept { return _mm_subs_epi8(x, y); }
BL_INLINE I128 vsubsu8(const I128& x, const I128& y) noexcept { return _mm_subs_epu8(x, y); }
BL_INLINE I128 vsubsi16(const I128& x, const I128& y) noexcept { return _mm_subs_epi16(x, y); }
BL_INLINE I128 vsubsu16(const I128& x, const I128& y) noexcept { return _mm_subs_epu16(x, y); }

BL_INLINE I128 vmuli16(const I128& x, const I128& y) noexcept { return _mm_mullo_epi16(x, y); }
BL_INLINE I128 vmulu16(const I128& x, const I128& y) noexcept { return _mm_mullo_epi16(x, y); }
BL_INLINE I128 vmulhi16(const I128& x, const I128& y) noexcept { return _mm_mulhi_epi16(x, y); }
BL_INLINE I128 vmulhu16(const I128& x, const I128& y) noexcept { return _mm_mulhi_epu16(x, y); }

template<uint8_t N_BITS> BL_INLINE I128 vslli16(const I128& x) noexcept { return _mm_slli_epi16(x, N_BITS); }
template<uint8_t N_BITS> BL_INLINE I128 vslli32(const I128& x) noexcept { return _mm_slli_epi32(x, N_BITS); }
template<uint8_t N_BITS> BL_INLINE I128 vslli64(const I128& x) noexcept { return _mm_slli_epi64(x, N_BITS); }

template<uint8_t N_BITS> BL_INLINE I128 vsrli16(const I128& x) noexcept { return _mm_srli_epi16(x, N_BITS); }
template<uint8_t N_BITS> BL_INLINE I128 vsrli32(const I128& x) noexcept { return _mm_srli_epi32(x, N_BITS); }
template<uint8_t N_BITS> BL_INLINE I128 vsrli64(const I128& x) noexcept { return _mm_srli_epi64(x, N_BITS); }

template<uint8_t N_BITS> BL_INLINE I128 vsrai16(const I128& x) noexcept { return _mm_srai_epi16(x, N_BITS); }
template<uint8_t N_BITS> BL_INLINE I128 vsrai32(const I128& x) noexcept { return _mm_srai_epi32(x, N_BITS); }

template<uint8_t N_BYTES> BL_INLINE I128 vslli128b(const I128& x) noexcept { return _mm_slli_si128(x, N_BYTES); }
template<uint8_t N_BYTES> BL_INLINE I128 vsrli128b(const I128& x) noexcept { return _mm_srli_si128(x, N_BYTES); }

#if defined(BL_TARGET_OPT_SSE4_1)
BL_INLINE I128 vmini8(const I128& x, const I128& y) noexcept { return _mm_min_epi8(x, y); }
BL_INLINE I128 vmaxi8(const I128& x, const I128& y) noexcept { return _mm_max_epi8(x, y); }
#else
BL_INLINE I128 vmini8(const I128& x, const I128& y) noexcept { return vblendmask(x, y, _mm_cmpgt_epi8(x, y)); }
BL_INLINE I128 vmaxi8(const I128& x, const I128& y) noexcept { return vblendmask(y, x, _mm_cmpgt_epi8(x, y)); }
#endif

BL_INLINE I128 vminu8(const I128& x, const I128& y) noexcept { return _mm_min_epu8(x, y); }
BL_INLINE I128 vmaxu8(const I128& x, const I128& y) noexcept { return _mm_max_epu8(x, y); }

BL_INLINE I128 vmini16(const I128& x, const I128& y) noexcept { return _mm_min_epi16(x, y); }
BL_INLINE I128 vmaxi16(const I128& x, const I128& y) noexcept { return _mm_max_epi16(x, y); }

#if defined(BL_TARGET_OPT_SSE4_1)
BL_INLINE I128 vminu16(const I128& x, const I128& y) noexcept { return _mm_min_epu16(x, y); }
BL_INLINE I128 vmaxu16(const I128& x, const I128& y) noexcept { return _mm_max_epu16(x, y); }
#else
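  // SSE2 has no unsigned 16-bit min/max, but saturating subtraction gives
  // `subs_epu16(x, y) == max(x - y, 0)`, so `x - max(x - y, 0) == min(x, y)`
  // and `x + max(y - x, 0) == max(x, y)`.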
BL_INLINE I128 vminu16(const I128& x, const I128& y) noexcept { return _mm_sub_epi16(x, _mm_subs_epu16(x, y)); }
BL_INLINE I128 vmaxu16(const I128& x, const I128& y) noexcept { return _mm_add_epi16(x, _mm_subs_epu16(y, x)); }
#endif

#if defined(BL_TARGET_OPT_SSE4_1)
BL_INLINE I128 vmini32(const I128& x, const I128& y) noexcept { return _mm_min_epi32(x, y); }
BL_INLINE I128 vmaxi32(const I128& x, const I128& y) noexcept { return _mm_max_epi32(x, y); }
#else
BL_INLINE I128 vmini32(const I128& x, const I128& y) noexcept { return vblendmask(x, y, _mm_cmpgt_epi32(x, y)); }
BL_INLINE I128 vmaxi32(const I128& x, const I128& y) noexcept { return vblendmask(y, x, _mm_cmpgt_epi32(x, y)); }
#endif

BL_INLINE I128 vcmpeqi8(const I128& x, const I128& y) noexcept { return _mm_cmpeq_epi8(x, y); }
BL_INLINE I128 vcmpgti8(const I128& x, const I128& y) noexcept { return _mm_cmpgt_epi8(x, y); }

BL_INLINE I128 vcmpeqi16(const I128& x, const I128& y) noexcept { return _mm_cmpeq_epi16(x, y); }
BL_INLINE I128 vcmpgti16(const I128& x, const I128& y) noexcept { return _mm_cmpgt_epi16(x, y); }

BL_INLINE I128 vcmpeqi32(const I128& x, const I128& y) noexcept { return _mm_cmpeq_epi32(x, y); }
BL_INLINE I128 vcmpgti32(const I128& x, const I128& y) noexcept { return _mm_cmpgt_epi32(x, y); }

#if defined(BL_TARGET_OPT_SSSE3)
BL_INLINE I128 vabsi8(const I128& x) noexcept { return _mm_abs_epi8(x); }
BL_INLINE I128 vabsi16(const I128& x) noexcept { return _mm_abs_epi16(x); }
BL_INLINE I128 vabsi32(const I128& x) noexcept { return _mm_abs_epi32(x); }
#else
BL_INLINE I128 vabsi8(const I128& x) noexcept { return vminu8(vsubi8(vzeroi128(), x), x); }
BL_INLINE I128 vabsi16(const I128& x) noexcept { return vmaxi16(vsubi16(vzeroi128(), x), x); }
BL_INLINE I128 vabsi32(const I128& x) noexcept { I128 y = vsrai32<31>(x); return vsubi32(vxor(x, y), y); }
#endif

BL_INLINE I128 vloadi128_32(const void* p) noexcept { return _mm_cvtsi32_si128(int(*(BLMisalignedUInt<uint32_t, 1>::T*)(p))); }
BL_INLINE I128 vloadi128_64(const void* p) noexcept { return _mm_loadl_epi64(static_cast<const I128*>(p)); }
BL_INLINE I128 vloadi128a(const void* p) noexcept { return _mm_load_si128(static_cast<const I128*>(p)); }
BL_INLINE I128 vloadi128u(const void* p) noexcept { return _mm_loadu_si128(static_cast<const I128*>(p)); }

BL_INLINE I128 vloadi128_l64(const I128& x, const void* p) noexcept { return vcast<I128>(_mm_loadl_pd(vcast<D128>(x), static_cast<const double*>(p))); }
BL_INLINE I128 vloadi128_h64(const I128& x, const void* p) noexcept { return vcast<I128>(_mm_loadh_pd(vcast<D128>(x), static_cast<const double*>(p))); }

BL_INLINE void vstorei32(void* p, const I128& x) noexcept { static_cast<int*>(p)[0] = _mm_cvtsi128_si32(x); }
BL_INLINE void vstorei64(void* p, const I128& x) noexcept { _mm_storel_epi64(static_cast<I128*>(p), x); }
BL_INLINE void vstorei128a(void* p, const I128& x) noexcept { _mm_store_si128(static_cast<I128*>(p), x); }
BL_INLINE void vstorei128u(void* p, const I128& x) noexcept { _mm_storeu_si128(static_cast<I128*>(p), x); }

BL_INLINE void vstoreli64(void* p, const I128& x) noexcept { _mm_storel_epi64(static_cast<I128*>(p), x); }
BL_INLINE void vstorehi64(void* p, const I128& x) noexcept { _mm_storeh_pd(static_cast<double*>(p), vcast<D128>(x)); }

BL_INLINE bool vhasmaski8(const I128& x, int bits0_15) noexcept { return _mm_movemask_epi8(vcast<I128>(x)) == bits0_15; }
BL_INLINE bool vhasmaski8(const F128& x, int bits0_15) noexcept { return _mm_movemask_epi8(vcast<I128>(x)) == bits0_15; }
BL_INLINE bool vhasmaski8(const D128& x, int bits0_15) noexcept { return _mm_movemask_epi8(vcast<I128>(x)) == bits0_15; }

BL_INLINE bool vhasmaski32(const I128& x, int bits0_3) noexcept { return _mm_movemask_ps(vcast<F128>(x)) == bits0_3; }
BL_INLINE bool vhasmaski64(const I128& x, int bits0_1) noexcept { return _mm_movemask_pd(vcast<D128>(x)) == bits0_1; }

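// Division by 255 via multiplication: for inputs that fit with headroom,
// `x / 255 == (x + 128) * 257 >> 16` (rounded to nearest). Adding 0x0080 and
// taking the high 16 bits of the multiplication by 0x0101 computes exactly
// that. The intermediate `x + 0x80` must not overflow 16 bits, which holds
// for the typical `x <= 255 * 255` range produced by byte multiplication.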
BL_INLINE I128 vdiv255u16(const I128& x) noexcept {
  I128 y = vaddi16(x, v_const_as<I128>(blCommonTable.i128_0080008000800080));
  return vmulhu16(y, v_const_as<I128>(blCommonTable.i128_0101010101010101));
}
#endif

// ============================================================================
// [BLSIMD - F128]
// ============================================================================

#if defined(BL_TARGET_OPT_SSE)
BL_INLINE F128 vzerof128() noexcept { return _mm_setzero_ps(); }

BL_INLINE F128 vsetf128(float x) noexcept { return _mm_set1_ps(x); }
BL_INLINE F128 vsetf128(float x3, float x2, float x1, float x0) noexcept { return _mm_set_ps(x3, x2, x1, x0); }

//! Cast a scalar `float` to `F128` vector type.
BL_INLINE F128 vcvtf32f128(float x) noexcept {
#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
  // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70708
  F128 reg;
  __asm__("" : "=x" (reg) : "0" (x));
  return reg;
#else
  return _mm_set_ss(x);
#endif
}
BL_INLINE float vcvtf128f32(const F128& x) noexcept { return _mm_cvtss_f32(x); }

BL_INLINE F128 vcvti32f128(int32_t x) noexcept { return _mm_cvtsi32_ss(vzerof128(), x); }
BL_INLINE int32_t vcvtf128i32(const F128& x) noexcept { return _mm_cvtss_si32(x); }
BL_INLINE int32_t vcvttf128i32(const F128& x) noexcept { return _mm_cvttss_si32(x); }

#if BL_TARGET_ARCH_BITS >= 64
BL_INLINE F128 vcvti64f128(int64_t x) noexcept { return _mm_cvtsi64_ss(vzerof128(), x); }
BL_INLINE int64_t vcvtf128i64(const F128& x) noexcept { return _mm_cvtss_si64(x); }
BL_INLINE int64_t vcvttf128i64(const F128& x) noexcept { return _mm_cvttss_si64(x); }
#endif

template<int A, int B, int C, int D>
BL_INLINE F128 vshuff32(const F128& x, const F128& y) noexcept { return _mm_shuffle_ps(x, y, _MM_SHUFFLE(A, B, C, D)); }

template<int A, int B, int C, int D>
BL_INLINE F128 vswizf32(const F128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE2) && !defined(BL_TARGET_OPT_AVX)
  return vcast<F128>(vswizi32<A, B, C, D>(vcast<I128>(x)));
#else
  return vshuff32<A, B, C, D>(x, x);
#endif
}

template<int A, int B>
BL_INLINE F128 vswizf64(const F128& x) noexcept {
#if defined(BL_TARGET_OPT_SSE2) && !defined(BL_TARGET_OPT_AVX)
  return vcast<F128>(vswizi64<A, B>(vcast<I128>(x)));
#else
  return vswizf32<A*2 + 1, A*2, B*2 + 1, B*2>(x);
#endif
}

BL_INLINE F128 vduplf32(const F128& x) noexcept { return vswizf32<2, 2, 0, 0>(x); }
BL_INLINE F128 vduphf32(const F128& x) noexcept { return vswizf32<3, 3, 1, 1>(x); }

BL_INLINE F128 vswapf64(const F128& x) noexcept { return vswizf64<0, 1>(x); }
BL_INLINE F128 vduplf64(const F128& x) noexcept { return vswizf64<0, 0>(x); }
BL_INLINE F128 vduphf64(const F128& x) noexcept { return vswizf64<1, 1>(x); }

BL_INLINE F128 vunpacklf32(const F128& x, const F128& y) noexcept { return _mm_unpacklo_ps(x, y); }
BL_INLINE F128 vunpackhf32(const F128& x, const F128& y) noexcept { return _mm_unpackhi_ps(x, y); }

BL_INLINE F128 vor(const F128& x, const F128& y) noexcept { return _mm_or_ps(x, y); }
BL_INLINE F128 vxor(const F128& x, const F128& y) noexcept { return _mm_xor_ps(x, y); }
BL_INLINE F128 vand(const F128& x, const F128& y) noexcept { return _mm_and_ps(x, y); }
BL_INLINE F128 vandnot_a(const F128& x, const F128& y) noexcept { return _mm_andnot_ps(x, y); }
BL_INLINE F128 vandnot_b(const F128& x, const F128& y) noexcept { return _mm_andnot_ps(y, x); }
BL_INLINE F128 vblendmask(const F128& x, const F128& y, const F128& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); }

BL_INLINE F128 vaddss(const F128& x, const F128& y) noexcept { return _mm_add_ss(x, y); }
BL_INLINE F128 vaddps(const F128& x, const F128& y) noexcept { return _mm_add_ps(x, y); }

BL_INLINE F128 vsubss(const F128& x, const F128& y) noexcept { return _mm_sub_ss(x, y); }
BL_INLINE F128 vsubps(const F128& x, const F128& y) noexcept { return _mm_sub_ps(x, y); }

BL_INLINE F128 vmulss(const F128& x, const F128& y) noexcept { return _mm_mul_ss(x, y); }
BL_INLINE F128 vmulps(const F128& x, const F128& y) noexcept { return _mm_mul_ps(x, y); }

BL_INLINE F128 vdivss(const F128& x, const F128& y) noexcept { return _mm_div_ss(x, y); }
BL_INLINE F128 vdivps(const F128& x, const F128& y) noexcept { return _mm_div_ps(x, y); }

BL_INLINE F128 vminss(const F128& x, const F128& y) noexcept { return _mm_min_ss(x, y); }
BL_INLINE F128 vminps(const F128& x, const F128& y) noexcept { return _mm_min_ps(x, y); }

BL_INLINE F128 vmaxss(const F128& x, const F128& y) noexcept { return _mm_max_ss(x, y); }
BL_INLINE F128 vmaxps(const F128& x, const F128& y) noexcept { return _mm_max_ps(x, y); }

BL_INLINE F128 vcmpeqss(const F128& x, const F128& y) noexcept { return _mm_cmpeq_ss(x, y); }
BL_INLINE F128 vcmpeqps(const F128& x, const F128& y) noexcept { return _mm_cmpeq_ps(x, y); }

BL_INLINE F128 vcmpness(const F128& x, const F128& y) noexcept { return _mm_cmpneq_ss(x, y); }
BL_INLINE F128 vcmpneps(const F128& x, const F128& y) noexcept { return _mm_cmpneq_ps(x, y); }

BL_INLINE F128 vcmpgess(const F128& x, const F128& y) noexcept { return _mm_cmpge_ss(x, y); }
BL_INLINE F128 vcmpgeps(const F128& x, const F128& y) noexcept { return _mm_cmpge_ps(x, y); }

BL_INLINE F128 vcmpgtss(const F128& x, const F128& y) noexcept { return _mm_cmpgt_ss(x, y); }
BL_INLINE F128 vcmpgtps(const F128& x, const F128& y) noexcept { return _mm_cmpgt_ps(x, y); }

BL_INLINE F128 vcmpless(const F128& x, const F128& y) noexcept { return _mm_cmple_ss(x, y); }
BL_INLINE F128 vcmpleps(const F128& x, const F128& y) noexcept { return _mm_cmple_ps(x, y); }

BL_INLINE F128 vcmpltss(const F128& x, const F128& y) noexcept { return _mm_cmplt_ss(x, y); }
BL_INLINE F128 vcmpltps(const F128& x, const F128& y) noexcept { return _mm_cmplt_ps(x, y); }

BL_INLINE F128 vsqrtss(const F128& x) noexcept { return _mm_sqrt_ss(x); }
BL_INLINE F128 vsqrtps(const F128& x) noexcept { return _mm_sqrt_ps(x); }

BL_INLINE F128 vloadf128_32(const void* p) noexcept { return _mm_load_ss(static_cast<const float*>(p)); }
BL_INLINE F128 vloadf128_64(const void* p) noexcept { return vcast<F128>(vloadi128_64(p)); }

BL_INLINE F128 vloadf128a(const void* p) noexcept { return _mm_load_ps(static_cast<const float*>(p)); }
BL_INLINE F128 vloadf128u(const void* p) noexcept { return _mm_loadu_ps(static_cast<const float*>(p)); }

BL_INLINE F128 vloadf128_l64(const F128& x, const void* p) noexcept { return _mm_loadl_pi(x, static_cast<const __m64*>(p)); }
BL_INLINE F128 vloadf128_h64(const F128& x, const void* p) noexcept { return _mm_loadh_pi(x, static_cast<const __m64*>(p)); }

BL_INLINE void vstoref32(void* p, const F128& x) noexcept { _mm_store_ss(static_cast<float*>(p), x); }
BL_INLINE void vstoref64(void* p, const F128& x) noexcept { _mm_storel_pi(static_cast<__m64*>(p), x); }
BL_INLINE void vstorelf64(void* p, const F128& x) noexcept { _mm_storel_pi(static_cast<__m64*>(p), x); }
BL_INLINE void vstorehf64(void* p, const F128& x) noexcept { _mm_storeh_pi(static_cast<__m64*>(p), x); }
BL_INLINE void vstoref128a(void* p, const F128& x) noexcept { _mm_store_ps(static_cast<float*>(p), x); }
BL_INLINE void vstoref128u(void* p, const F128& x) noexcept { _mm_storeu_ps(static_cast<float*>(p), x); }

BL_INLINE F128 vbroadcastf128_64(const void* p) noexcept {
#if defined(BL_TARGET_OPT_SSE3)
  return vcast<F128>(_mm_loaddup_pd(static_cast<const double*>(p)));
#else
  return vduplf64(vloadf128_64(p));
#endif
}

BL_INLINE bool vhasmaskf32(const F128& x, int bits0_3) noexcept { return _mm_movemask_ps(vcast<F128>(x)) == bits0_3; }
BL_INLINE bool vhasmaskf64(const F128& x, int bits0_1) noexcept { return _mm_movemask_pd(vcast<D128>(x)) == bits0_1; }
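
// The `vhasmask*` helpers compare the sign-bit mask of a vector (typically a
// comparison result) against an expected bit pattern. For example, to test
// whether all four lanes compare equal (a minimal sketch):
//
//   if (vhasmaskf32(vcmpeqps(a, b), 0xF)) { /* all four lanes are equal */ }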

// ============================================================================
// [BLSIMD - D128]
// ============================================================================

#if defined(BL_TARGET_OPT_SSE2)
BL_INLINE D128 vzerod128() noexcept { return _mm_setzero_pd(); }

BL_INLINE D128 vsetd128(double x) noexcept { return _mm_set1_pd(x); }
BL_INLINE D128 vsetd128(double x1, double x0) noexcept { return _mm_set_pd(x1, x0); }

//! Cast a scalar `double` to `D128` vector type.
BL_INLINE D128 vcvtd64d128(double x) noexcept {
#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
  // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70708
  D128 reg;
  __asm__("" : "=x" (reg) : "0" (x));
  return reg;
#else
  return _mm_set_sd(x);
#endif
}
BL_INLINE double vcvtd128d64(const D128& x) noexcept { return _mm_cvtsd_f64(x); }

BL_INLINE D128 vcvti32d128(int32_t x) noexcept { return _mm_cvtsi32_sd(vzerod128(), x); }
BL_INLINE int32_t vcvtd128i32(const D128& x) noexcept { return _mm_cvtsd_si32(x); }
BL_INLINE int32_t vcvttd128i32(const D128& x) noexcept { return _mm_cvttsd_si32(x); }

#if BL_TARGET_ARCH_BITS >= 64
BL_INLINE D128 vcvti64d128(int64_t x) noexcept { return _mm_cvtsi64_sd(vzerod128(), x); }
BL_INLINE int64_t vcvtd128i64(const D128& x) noexcept { return _mm_cvtsd_si64(x); }
BL_INLINE int64_t vcvttd128i64(const D128& x) noexcept { return _mm_cvttsd_si64(x); }
#endif

BL_INLINE D128 vcvtf128d128(const F128& x) noexcept { return _mm_cvtps_pd(x); }
BL_INLINE F128 vcvtd128f128(const D128& x) noexcept { return _mm_cvtpd_ps(x); }

BL_INLINE F128 vcvti128f128(const I128& x) noexcept { return _mm_cvtepi32_ps(x); }
BL_INLINE D128 vcvti128d128(const I128& x) noexcept { return _mm_cvtepi32_pd(x); }

BL_INLINE I128 vcvtf128i128(const F128& x) noexcept { return _mm_cvtps_epi32(x); }
BL_INLINE I128 vcvttf128i128(const F128& x) noexcept { return _mm_cvttps_epi32(x); }

BL_INLINE I128 vcvtd128i128(const D128& x) noexcept { return _mm_cvtpd_epi32(x); }
BL_INLINE I128 vcvttd128i128(const D128& x) noexcept { return _mm_cvttpd_epi32(x); }

template<int A, int B>
BL_INLINE D128 vshufd64(const D128& x, const D128& y) noexcept { return _mm_shuffle_pd(x, y, (A << 1) | B); }

template<int A, int B>
BL_INLINE D128 vswizd64(const D128& x) noexcept {
#if !defined(BL_TARGET_OPT_AVX)
  return vcast<D128>(vswizi64<A, B>(vcast<I128>(x)));
#else
  return vshufd64<A, B>(x, x);
#endif
}

BL_INLINE D128 vswapd64(const D128& x) noexcept { return vswizd64<0, 1>(x); }
BL_INLINE D128 vdupld64(const D128& x) noexcept { return vswizd64<0, 0>(x); }
BL_INLINE D128 vduphd64(const D128& x) noexcept { return vswizd64<1, 1>(x); }

BL_INLINE D128 vunpackld64(const D128& x, const D128& y) noexcept { return _mm_unpacklo_pd(x, y); }
BL_INLINE D128 vunpackhd64(const D128& x, const D128& y) noexcept { return _mm_unpackhi_pd(x, y); }

BL_INLINE D128 vor(const D128& x, const D128& y) noexcept { return _mm_or_pd(x, y); }
BL_INLINE D128 vxor(const D128& x, const D128& y) noexcept { return _mm_xor_pd(x, y); }
BL_INLINE D128 vand(const D128& x, const D128& y) noexcept { return _mm_and_pd(x, y); }
BL_INLINE D128 vandnot_a(const D128& x, const D128& y) noexcept { return _mm_andnot_pd(x, y); }
BL_INLINE D128 vandnot_b(const D128& x, const D128& y) noexcept { return _mm_andnot_pd(y, x); }
BL_INLINE D128 vblendmask(const D128& x, const D128& y, const D128& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); }

BL_INLINE D128 vaddsd(const D128& x, const D128& y) noexcept { return _mm_add_sd(x, y); }
BL_INLINE D128 vaddpd(const D128& x, const D128& y) noexcept { return _mm_add_pd(x, y); }

BL_INLINE D128 vsubsd(const D128& x, const D128& y) noexcept { return _mm_sub_sd(x, y); }
BL_INLINE D128 vsubpd(const D128& x, const D128& y) noexcept { return _mm_sub_pd(x, y); }

BL_INLINE D128 vmulsd(const D128& x, const D128& y) noexcept { return _mm_mul_sd(x, y); }
BL_INLINE D128 vmulpd(const D128& x, const D128& y) noexcept { return _mm_mul_pd(x, y); }

BL_INLINE D128 vdivsd(const D128& x, const D128& y) noexcept { return _mm_div_sd(x, y); }
BL_INLINE D128 vdivpd(const D128& x, const D128& y) noexcept { return _mm_div_pd(x, y); }

BL_INLINE D128 vminsd(const D128& x, const D128& y) noexcept { return _mm_min_sd(x, y); }
BL_INLINE D128 vminpd(const D128& x, const D128& y) noexcept { return _mm_min_pd(x, y); }

BL_INLINE D128 vmaxsd(const D128& x, const D128& y) noexcept { return _mm_max_sd(x, y); }
BL_INLINE D128 vmaxpd(const D128& x, const D128& y) noexcept { return _mm_max_pd(x, y); }

BL_INLINE D128 vcmpeqsd(const D128& x, const D128& y) noexcept { return _mm_cmpeq_sd(x, y); }
BL_INLINE D128 vcmpeqpd(const D128& x, const D128& y) noexcept { return _mm_cmpeq_pd(x, y); }

BL_INLINE D128 vcmpnesd(const D128& x, const D128& y) noexcept { return _mm_cmpneq_sd(x, y); }
BL_INLINE D128 vcmpnepd(const D128& x, const D128& y) noexcept { return _mm_cmpneq_pd(x, y); }

BL_INLINE D128 vcmpgesd(const D128& x, const D128& y) noexcept { return _mm_cmpge_sd(x, y); }
BL_INLINE D128 vcmpgepd(const D128& x, const D128& y) noexcept { return _mm_cmpge_pd(x, y); }

BL_INLINE D128 vcmpgtsd(const D128& x, const D128& y) noexcept { return _mm_cmpgt_sd(x, y); }
BL_INLINE D128 vcmpgtpd(const D128& x, const D128& y) noexcept { return _mm_cmpgt_pd(x, y); }

BL_INLINE D128 vcmplesd(const D128& x, const D128& y) noexcept { return _mm_cmple_sd(x, y); }
BL_INLINE D128 vcmplepd(const D128& x, const D128& y) noexcept { return _mm_cmple_pd(x, y); }

BL_INLINE D128 vcmpltsd(const D128& x, const D128& y) noexcept { return _mm_cmplt_sd(x, y); }
BL_INLINE D128 vcmpltpd(const D128& x, const D128& y) noexcept { return _mm_cmplt_pd(x, y); }

BL_INLINE D128 vsqrtsd(const D128& x) noexcept { return _mm_sqrt_sd(x, x); }
BL_INLINE D128 vsqrtpd(const D128& x) noexcept { return _mm_sqrt_pd(x); }

BL_INLINE D128 vloadd128_64(const void* p) noexcept { return _mm_load_sd(static_cast<const double*>(p)); }
BL_INLINE D128 vloadd128a(const void* p) noexcept { return _mm_load_pd(static_cast<const double*>(p)); }
BL_INLINE D128 vloadd128u(const void* p) noexcept { return _mm_loadu_pd(static_cast<const double*>(p)); }

BL_INLINE D128 vloadd128_l64(const D128& x, const void* p) noexcept { return _mm_loadl_pd(x, static_cast<const double*>(p)); }
BL_INLINE D128 vloadd128_h64(const D128& x, const void* p) noexcept { return _mm_loadh_pd(x, static_cast<const double*>(p)); }

BL_INLINE D128 vbroadcastd128_64(const void* p) noexcept {
#if defined(BL_TARGET_OPT_SSE3)
  return _mm_loaddup_pd(static_cast<const double*>(p));
#else
  return vdupld64(vloadd128_64(p));
#endif
}

BL_INLINE void vstored64(void* p, const D128& x) noexcept { _mm_store_sd(static_cast<double*>(p), x); }
BL_INLINE void vstoreld64(void* p, const D128& x) noexcept { _mm_storel_pd(static_cast<double*>(p), x); }
BL_INLINE void vstorehd64(void* p, const D128& x) noexcept { _mm_storeh_pd(static_cast<double*>(p), x); }
BL_INLINE void vstored128a(void* p, const D128& x) noexcept { _mm_store_pd(static_cast<double*>(p), x); }
BL_INLINE void vstored128u(void* p, const D128& x) noexcept { _mm_storeu_pd(static_cast<double*>(p), x); }

BL_INLINE bool vhasmaskd64(const D128& x, int bits0_1) noexcept { return _mm_movemask_pd(vcast<D128>(x)) == bits0_1; }
#endif

// ============================================================================
// [BLSIMD - I256]
// ============================================================================

#if defined(BL_TARGET_OPT_AVX)
BL_INLINE I256 vzeroi256() noexcept { return _mm256_setzero_si256(); }

BL_INLINE F256 vcvti256f256(const I256& x) noexcept { return _mm256_cvtepi32_ps(x); }
BL_INLINE D256 vcvti128d256(const I128& x) noexcept { return _mm256_cvtepi32_pd(vcast<I128>(x)); }
BL_INLINE D256 vcvti256d256(const I256& x) noexcept { return _mm256_cvtepi32_pd(vcast<I128>(x)); }
#endif

#if defined(BL_TARGET_OPT_AVX2)
BL_INLINE I256 vseti256i8(int8_t x) noexcept { return _mm256_set1_epi8(x); }
BL_INLINE I256 vseti256i16(int16_t x) noexcept { return _mm256_set1_epi16(x); }

BL_INLINE I256 vseti256i32(int32_t x) noexcept { return _mm256_set1_epi32(x); }
BL_INLINE I256 vseti256i32(int32_t x1, int32_t x0) noexcept { return _mm256_set_epi32(x1, x0, x1, x0, x1, x0, x1, x0); }
BL_INLINE I256 vseti256i32(int32_t x3, int32_t x2, int32_t x1, int32_t x0) noexcept { return _mm256_set_epi32(x3, x2, x1, x0, x3, x2, x1, x0); }
BL_INLINE I256 vseti256i32(int32_t x7, int32_t x6, int32_t x5, int32_t x4, int32_t x3, int32_t x2, int32_t x1, int32_t x0) noexcept { return _mm256_set_epi32(x7, x6, x5, x4, x3, x2, x1, x0); }

BL_INLINE I256 vseti256i64(int64_t x) noexcept {
#if BL_TARGET_ARCH_BITS >= 64
  return _mm256_set1_epi64x(x);
#else
  return vseti256i32(int32_t(uint64_t(x) >> 32), int32_t(x & 0xFFFFFFFFu));
#endif
}

BL_INLINE I256 vseti256i64(int64_t x1, int64_t x0) noexcept {
  return vseti256i32(int32_t(uint64_t(x1) >> 32), int32_t(x1 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x0) >> 32), int32_t(x0 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x1) >> 32), int32_t(x1 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x0) >> 32), int32_t(x0 & 0xFFFFFFFFu));
}

BL_INLINE I256 vseti256i64(int64_t x3, int64_t x2, int64_t x1, int64_t x0) noexcept {
  return vseti256i32(int32_t(uint64_t(x3) >> 32), int32_t(x3 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x2) >> 32), int32_t(x2 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x1) >> 32), int32_t(x1 & 0xFFFFFFFFu),
                     int32_t(uint64_t(x0) >> 32), int32_t(x0 & 0xFFFFFFFFu));
}

BL_INLINE I256 vseti256u8(uint8_t x) noexcept { return vseti256i8(int8_t(x)); }
BL_INLINE I256 vseti256u16(uint16_t x) noexcept { return vseti256i16(int16_t(x)); }
BL_INLINE I256 vseti256u32(uint32_t x) noexcept { return vseti256i32(int32_t(x)); }
BL_INLINE I256 vseti256u64(uint64_t x) noexcept { return vseti256i64(int64_t(x)); }

BL_INLINE I256 vseti256u32(uint32_t x1, uint32_t x0) noexcept {
  return vseti256i32(int32_t(x1), int32_t(x0), int32_t(x1), int32_t(x0),
                     int32_t(x1), int32_t(x0), int32_t(x1), int32_t(x0));
}

BL_INLINE I256 vseti256u32(uint32_t x3, uint32_t x2, uint32_t x1, uint32_t x0) noexcept {
  return vseti256i32(int32_t(x3), int32_t(x2), int32_t(x1), int32_t(x0),
                     int32_t(x3), int32_t(x2), int32_t(x1), int32_t(x0));
}

BL_INLINE I256 vseti256u32(uint32_t x7, uint32_t x6, uint32_t x5, uint32_t x4, uint32_t x3, uint32_t x2, uint32_t x1, uint32_t x0) noexcept {
  return vseti256i32(int32_t(x7), int32_t(x6), int32_t(x5), int32_t(x4),
                     int32_t(x3), int32_t(x2), int32_t(x1), int32_t(x0));
}

BL_INLINE I256 vseti256u64(uint64_t x1, uint64_t x0) noexcept {
  return vseti256i64(int64_t(x1), int64_t(x0));
}

BL_INLINE I256 vseti256u64(uint64_t x3, uint64_t x2, uint64_t x1, uint64_t x0) noexcept {
  return vseti256i64(int64_t(x3), int64_t(x2), int64_t(x1), int64_t(x0));
}

BL_INLINE I256 vcvti32i256(int32_t x) noexcept { return vcast<I256>(vcvti32i128(x)); }
BL_INLINE I256 vcvtu32i256(uint32_t x) noexcept { return vcast<I256>(vcvtu32i128(x)); }

BL_INLINE int32_t vcvti256i32(const I256& x) noexcept { return vcvti128i32(vcast<I128>(x)); }
BL_INLINE uint32_t vcvti256u32(const I256& x) noexcept { return vcvti128u32(vcast<I128>(x)); }

BL_INLINE I256 vcvti64i256(int64_t x) noexcept { return vcast<I256>(vcvti64i128(x)); }
BL_INLINE I256 vcvtu64i256(uint64_t x) noexcept { return vcast<I256>(vcvtu64i128(x)); }

BL_INLINE int64_t vcvti256i64(const I256& x) noexcept { return vcvti128i64(vcast<I128>(x)); }
BL_INLINE uint64_t vcvti256u64(const I256& x) noexcept { return vcvti128u64(vcast<I128>(x)); }

template<int A, int B>
BL_INLINE I256 vpermi128(const I256& x, const I256& y) noexcept { return _mm256_permute2x128_si256(x, y, ((A & 0xF) << 4) + (B & 0xF)); }
template<int A, int B>
BL_INLINE I256 vpermi128(const I256& x) noexcept { return vpermi128<A, B>(x, x); }
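
// Selector encoding of `vpermi128<A, B>`: `B` picks the destination's low
// 128-bit lane and `A` the high one; values 0-1 select the low/high lane of
// `x`, 2-3 the low/high lane of `y`, and setting bit 3 (value 8) zeroes the
// lane. E.g. `vpermi128<0, 1>(x)` swaps the two halves of `x`.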
827
828template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
829BL_INLINE I256 vswizli16(const I256& x) noexcept { return _mm256_shufflelo_epi16(x, _MM_SHUFFLE(A, B, C, D)); }
830template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
831BL_INLINE I256 vswizhi16(const I256& x) noexcept { return _mm256_shufflehi_epi16(x, _MM_SHUFFLE(A, B, C, D)); }
832
833template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
834BL_INLINE I256 vswizi16(const I256& x) noexcept { return vswizhi16<A, B, C, D>(vswizli16<A, B, C, D>(x)); }
835template<uint8_t A, uint8_t B, uint8_t C, uint8_t D>
836BL_INLINE I256 vswizi32(const I256& x) noexcept { return _mm256_shuffle_epi32(x, _MM_SHUFFLE(A, B, C, D)); }
837template<int A, int B>
838BL_INLINE I256 vswizi64(const I256& x) noexcept { return vswizi32<A*2 + 1, A*2, B*2 + 1, B*2>(x); }
839
840BL_INLINE I256 vpshufb(const I256& x, const I256& y) noexcept { return _mm256_shuffle_epi8(x, y); }
841
842template<int N_BYTES>
843BL_INLINE I256 vpalignr(const I256& x, const I256& y) noexcept { return _mm256_alignr_epi8(x, y, N_BYTES); }
844
845BL_INLINE I256 vsplati8i256(const I128& x) noexcept { return _mm256_broadcastb_epi8(vcast<I128>(x)); }
846BL_INLINE I256 vsplati8i256(const I256& x) noexcept { return _mm256_broadcastb_epi8(vcast<I128>(x)); }
847
848BL_INLINE I256 vsplati16i256(const I128& x) noexcept { return _mm256_broadcastw_epi16(vcast<I128>(x)); }
849BL_INLINE I256 vsplati16i256(const I256& x) noexcept { return _mm256_broadcastw_epi16(vcast<I128>(x)); }
850
851BL_INLINE I256 vsplati32i256(const I128& x) noexcept { return _mm256_broadcastd_epi32(vcast<I128>(x)); }
852BL_INLINE I256 vsplati32i256(const I256& x) noexcept { return _mm256_broadcastd_epi32(vcast<I128>(x)); }
853
854BL_INLINE I256 vsplati64i256(const I128& x) noexcept { return _mm256_broadcastq_epi64(vcast<I128>(x)); }
855BL_INLINE I256 vsplati64i256(const I256& x) noexcept { return _mm256_broadcastq_epi64(vcast<I128>(x)); }
856
857BL_INLINE I256 vswapi64(const I256& x) noexcept { return vswizi64<0, 1>(x); }
858BL_INLINE I256 vdupli64(const I256& x) noexcept { return vswizi64<0, 0>(x); }
859BL_INLINE I256 vduphi64(const I256& x) noexcept { return vswizi64<1, 1>(x); }
860
861BL_INLINE I256 vswapi128(const I256& x) noexcept { return vpermi128<0, 1>(x); }
862BL_INLINE I256 vdupli128(const I128& x) noexcept { return vpermi128<0, 0>(vcast<I256>(x)); }
863BL_INLINE I256 vdupli128(const I256& x) noexcept { return vpermi128<0, 0>(x); }
864BL_INLINE I256 vduphi128(const I256& x) noexcept { return vpermi128<1, 1>(x); }
865
866BL_INLINE I256 vmovli128u8u16(const I128& x) noexcept { return _mm256_cvtepu8_epi16(x); }
867BL_INLINE I256 vmovli128u8u32(const I128& x) noexcept { return _mm256_cvtepu8_epi32(x); }
868BL_INLINE I256 vmovli128u8u64(const I128& x) noexcept { return _mm256_cvtepu8_epi64(x); }
869BL_INLINE I256 vmovli128u16u32(const I128& x) noexcept { return _mm256_cvtepu16_epi32(x); }
870BL_INLINE I256 vmovli128u16u64(const I128& x) noexcept { return _mm256_cvtepu16_epi64(x); }
871BL_INLINE I256 vmovli128u32u64(const I128& x) noexcept { return _mm256_cvtepu32_epi64(x); }
872
873BL_INLINE I256 vpacki16i8(const I256& x, const I256& y) noexcept { return _mm256_packs_epi16(x, y); }
874BL_INLINE I256 vpacki16u8(const I256& x, const I256& y) noexcept { return _mm256_packus_epi16(x, y); }
875BL_INLINE I256 vpacki32i16(const I256& x, const I256& y) noexcept { return _mm256_packs_epi32(x, y); }
876BL_INLINE I256 vpacki32u16(const I256& x, const I256& y) noexcept { return _mm256_packus_epi32(x, y); }
877
878BL_INLINE I256 vpacki16i8(const I256& x) noexcept { return vpacki16i8(x, x); }
879BL_INLINE I256 vpacki16u8(const I256& x) noexcept { return vpacki16u8(x, x); }
880BL_INLINE I256 vpacki32i16(const I256& x) noexcept { return vpacki32i16(x, x); }
881BL_INLINE I256 vpacki32u16(const I256& x) noexcept { return vpacki32u16(x, x); }
882
883BL_INLINE I256 vpacki32i8(const I256& x) noexcept { return vpacki16i8(vpacki32i16(x)); }
884BL_INLINE I256 vpacki32i8(const I256& x, const I256& y) noexcept { return vpacki16i8(vpacki32i16(x, y)); }
885BL_INLINE I256 vpacki32i8(const I256& x, const I256& y, const I256& z, const I256& w) noexcept { return vpacki16i8(vpacki32i16(x, y), vpacki32i16(z, w)); }
886
887BL_INLINE I256 vpacki32u8(const I256& x) noexcept { return vpacki16u8(vpacki32i16(x)); }
888BL_INLINE I256 vpacki32u8(const I256& x, const I256& y) noexcept { return vpacki16u8(vpacki32i16(x, y)); }
889BL_INLINE I256 vpacki32u8(const I256& x, const I256& y, const I256& z, const I256& w) noexcept { return vpacki16u8(vpacki32i16(x, y), vpacki32i16(z, w)); }
890
891BL_INLINE I256 vpackzzdb(const I256& x, const I256& y) noexcept { return vpacki16u8(vpacki32i16(x, y)); }
892BL_INLINE I256 vpackzzdb(const I256& x, const I256& y, const I256& z, const I256& w) noexcept { return vpacki16u8(vpacki32i16(x, y), vpacki32i16(z, w)); }
893
894BL_INLINE I256 vunpackli8(const I256& x, const I256& y) noexcept { return _mm256_unpacklo_epi8(x, y); }
895BL_INLINE I256 vunpackhi8(const I256& x, const I256& y) noexcept { return _mm256_unpackhi_epi8(x, y); }
896
897BL_INLINE I256 vunpackli16(const I256& x, const I256& y) noexcept { return _mm256_unpacklo_epi16(x, y); }
898BL_INLINE I256 vunpackhi16(const I256& x, const I256& y) noexcept { return _mm256_unpackhi_epi16(x, y); }
899
900BL_INLINE I256 vunpackli32(const I256& x, const I256& y) noexcept { return _mm256_unpacklo_epi32(x, y); }
901BL_INLINE I256 vunpackhi32(const I256& x, const I256& y) noexcept { return _mm256_unpackhi_epi32(x, y); }
902
903BL_INLINE I256 vunpackli64(const I256& x, const I256& y) noexcept { return _mm256_unpacklo_epi64(x, y); }
904BL_INLINE I256 vunpackhi64(const I256& x, const I256& y) noexcept { return _mm256_unpackhi_epi64(x, y); }
905
906BL_INLINE I256 vor(const I256& x, const I256& y) noexcept { return _mm256_or_si256(x, y); }
907BL_INLINE I256 vxor(const I256& x, const I256& y) noexcept { return _mm256_xor_si256(x, y); }
908BL_INLINE I256 vand(const I256& x, const I256& y) noexcept { return _mm256_and_si256(x, y); }
909BL_INLINE I256 vandnot_a(const I256& x, const I256& y) noexcept { return _mm256_andnot_si256(x, y); }
910BL_INLINE I256 vandnot_b(const I256& x, const I256& y) noexcept { return _mm256_andnot_si256(y, x); }
911
912BL_INLINE I256 vblendmask(const I256& x, const I256& y, const I256& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); }
913BL_INLINE I256 vblendx(const I256& x, const I256& y, const I256& mask) noexcept { return _mm256_blendv_epi8(x, y, mask); }
914
915BL_INLINE I256 vaddi8(const I256& x, const I256& y) noexcept { return _mm256_add_epi8(x, y); }
916BL_INLINE I256 vaddi16(const I256& x, const I256& y) noexcept { return _mm256_add_epi16(x, y); }
917BL_INLINE I256 vaddi32(const I256& x, const I256& y) noexcept { return _mm256_add_epi32(x, y); }
918BL_INLINE I256 vaddi64(const I256& x, const I256& y) noexcept { return _mm256_add_epi64(x, y); }
919
920BL_INLINE I256 vaddsi8(const I256& x, const I256& y) noexcept { return _mm256_adds_epi8(x, y); }
921BL_INLINE I256 vaddsu8(const I256& x, const I256& y) noexcept { return _mm256_adds_epu8(x, y); }
922BL_INLINE I256 vaddsi16(const I256& x, const I256& y) noexcept { return _mm256_adds_epi16(x, y); }
923BL_INLINE I256 vaddsu16(const I256& x, const I256& y) noexcept { return _mm256_adds_epu16(x, y); }
924
925BL_INLINE I256 vsubi8(const I256& x, const I256& y) noexcept { return _mm256_sub_epi8(x, y); }
926BL_INLINE I256 vsubi16(const I256& x, const I256& y) noexcept { return _mm256_sub_epi16(x, y); }
927BL_INLINE I256 vsubi32(const I256& x, const I256& y) noexcept { return _mm256_sub_epi32(x, y); }
928BL_INLINE I256 vsubi64(const I256& x, const I256& y) noexcept { return _mm256_sub_epi64(x, y); }
929
930BL_INLINE I256 vsubsi8(const I256& x, const I256& y) noexcept { return _mm256_subs_epi8(x, y); }
931BL_INLINE I256 vsubsu8(const I256& x, const I256& y) noexcept { return _mm256_subs_epu8(x, y); }
932BL_INLINE I256 vsubsi16(const I256& x, const I256& y) noexcept { return _mm256_subs_epi16(x, y); }
933BL_INLINE I256 vsubsu16(const I256& x, const I256& y) noexcept { return _mm256_subs_epu16(x, y); }
934
935BL_INLINE I256 vmuli16(const I256& x, const I256& y) noexcept { return _mm256_mullo_epi16(x, y); }
936BL_INLINE I256 vmulu16(const I256& x, const I256& y) noexcept { return _mm256_mullo_epi16(x, y); }
937BL_INLINE I256 vmulhi16(const I256& x, const I256& y) noexcept { return _mm256_mulhi_epi16(x, y); }
938BL_INLINE I256 vmulhu16(const I256& x, const I256& y) noexcept { return _mm256_mulhi_epu16(x, y); }
939
940template<uint8_t N_BITS> BL_INLINE I256 vslli16(const I256& x) noexcept { return _mm256_slli_epi16(x, N_BITS); }
941template<uint8_t N_BITS> BL_INLINE I256 vslli32(const I256& x) noexcept { return _mm256_slli_epi32(x, N_BITS); }
942template<uint8_t N_BITS> BL_INLINE I256 vslli64(const I256& x) noexcept { return _mm256_slli_epi64(x, N_BITS); }
943
944template<uint8_t N_BITS> BL_INLINE I256 vsrli16(const I256& x) noexcept { return _mm256_srli_epi16(x, N_BITS); }
945template<uint8_t N_BITS> BL_INLINE I256 vsrli32(const I256& x) noexcept { return _mm256_srli_epi32(x, N_BITS); }
946template<uint8_t N_BITS> BL_INLINE I256 vsrli64(const I256& x) noexcept { return _mm256_srli_epi64(x, N_BITS); }
947
template<uint8_t N_BITS> BL_INLINE I256 vsrai16(const I256& x) noexcept { return _mm256_srai_epi16(x, N_BITS); }
template<uint8_t N_BITS> BL_INLINE I256 vsrai32(const I256& x) noexcept { return _mm256_srai_epi32(x, N_BITS); }

template<uint8_t N_BYTES> BL_INLINE I256 vslli128b(const I256& x) noexcept { return _mm256_slli_si256(x, N_BYTES); }
template<uint8_t N_BYTES> BL_INLINE I256 vsrli128b(const I256& x) noexcept { return _mm256_srli_si256(x, N_BYTES); }

BL_INLINE I256 vmini8(const I256& x, const I256& y) noexcept { return _mm256_min_epi8(x, y); }
BL_INLINE I256 vmaxi8(const I256& x, const I256& y) noexcept { return _mm256_max_epi8(x, y); }
BL_INLINE I256 vminu8(const I256& x, const I256& y) noexcept { return _mm256_min_epu8(x, y); }
BL_INLINE I256 vmaxu8(const I256& x, const I256& y) noexcept { return _mm256_max_epu8(x, y); }

BL_INLINE I256 vmini16(const I256& x, const I256& y) noexcept { return _mm256_min_epi16(x, y); }
BL_INLINE I256 vmaxi16(const I256& x, const I256& y) noexcept { return _mm256_max_epi16(x, y); }
BL_INLINE I256 vminu16(const I256& x, const I256& y) noexcept { return _mm256_min_epu16(x, y); }
BL_INLINE I256 vmaxu16(const I256& x, const I256& y) noexcept { return _mm256_max_epu16(x, y); }

BL_INLINE I256 vmini32(const I256& x, const I256& y) noexcept { return _mm256_min_epi32(x, y); }
BL_INLINE I256 vmaxi32(const I256& x, const I256& y) noexcept { return _mm256_max_epi32(x, y); }
BL_INLINE I256 vminu32(const I256& x, const I256& y) noexcept { return _mm256_min_epu32(x, y); }
BL_INLINE I256 vmaxu32(const I256& x, const I256& y) noexcept { return _mm256_max_epu32(x, y); }

BL_INLINE I256 vcmpeqi8(const I256& x, const I256& y) noexcept { return _mm256_cmpeq_epi8(x, y); }
BL_INLINE I256 vcmpgti8(const I256& x, const I256& y) noexcept { return _mm256_cmpgt_epi8(x, y); }

BL_INLINE I256 vcmpeqi16(const I256& x, const I256& y) noexcept { return _mm256_cmpeq_epi16(x, y); }
BL_INLINE I256 vcmpgti16(const I256& x, const I256& y) noexcept { return _mm256_cmpgt_epi16(x, y); }

BL_INLINE I256 vcmpeqi32(const I256& x, const I256& y) noexcept { return _mm256_cmpeq_epi32(x, y); }
BL_INLINE I256 vcmpgti32(const I256& x, const I256& y) noexcept { return _mm256_cmpgt_epi32(x, y); }

BL_INLINE I256 vloadi256_32(const void* p) noexcept { return vcast<I256>(vloadi128_32(p)); }
BL_INLINE I256 vloadi256_64(const void* p) noexcept { return vcast<I256>(vloadi128_64(p)); }
BL_INLINE I256 vloadi256_128a(const void* p) noexcept { return vcast<I256>(vloadi128a(p)); }
BL_INLINE I256 vloadi256_128u(const void* p) noexcept { return vcast<I256>(vloadi128u(p)); }
BL_INLINE I256 vloadi256a(const void* p) noexcept { return _mm256_load_si256(static_cast<const I256*>(p)); }
BL_INLINE I256 vloadi256u(const void* p) noexcept { return _mm256_loadu_si256(static_cast<const I256*>(p)); }

BL_INLINE I256 vloadi256_l64(const I256& x, const void* p) noexcept { return vcast<I256>(vloadi128_l64(vcast<I128>(x), p)); }
BL_INLINE I256 vloadi256_h64(const I256& x, const void* p) noexcept { return vcast<I256>(vloadi128_h64(vcast<I128>(x), p)); }

BL_INLINE void vstorei32(void* p, const I256& x) noexcept { vstorei32(p, vcast<I128>(x)); }
BL_INLINE void vstorei64(void* p, const I256& x) noexcept { vstorei64(p, vcast<I128>(x)); }
BL_INLINE void vstorei128a(void* p, const I256& x) noexcept { vstorei128a(p, vcast<I128>(x)); }
BL_INLINE void vstorei128u(void* p, const I256& x) noexcept { vstorei128u(p, vcast<I128>(x)); }
BL_INLINE void vstorei256a(void* p, const I256& x) noexcept { _mm256_store_si256(static_cast<I256*>(p), x); }
BL_INLINE void vstorei256u(void* p, const I256& x) noexcept { _mm256_storeu_si256(static_cast<I256*>(p), x); }

BL_INLINE void vstoreli64(void* p, const I256& x) noexcept { vstoreli64(p, vcast<I128>(x)); }
BL_INLINE void vstorehi64(void* p, const I256& x) noexcept { vstorehi64(p, vcast<I128>(x)); }

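// The vhasmask* predicates extract the sign bit of every lane via MOVMSK and
// compare the resulting bit-field with the expected pattern (e.g. pass -1 as
// `bits0_31` to test whether all 32 bytes of an i8 comparison are all-ones).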
BL_INLINE bool vhasmaski8(const I256& x, int bits0_31) noexcept { return _mm256_movemask_epi8(vcast<I256>(x)) == bits0_31; }
BL_INLINE bool vhasmaski8(const F256& x, int bits0_31) noexcept { return _mm256_movemask_epi8(vcast<I256>(x)) == bits0_31; }
BL_INLINE bool vhasmaski8(const D256& x, int bits0_31) noexcept { return _mm256_movemask_epi8(vcast<I256>(x)) == bits0_31; }

BL_INLINE bool vhasmaski32(const I256& x, int bits0_7) noexcept { return _mm256_movemask_ps(vcast<F256>(x)) == bits0_7; }
BL_INLINE bool vhasmaski64(const I256& x, int bits0_3) noexcept { return _mm256_movemask_pd(vcast<D256>(x)) == bits0_3; }

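// Divides each unsigned 16-bit lane by 255 with rounding, the usual
// normalization step after multiplying two 8-bit values. It relies on the
// identity ((x + 128) * 257) >> 16 == ((x + 128) + ((x + 128) >> 8)) >> 8,
// implemented as an addition followed by an unsigned multiply-high by 0x0101.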
BL_INLINE I256 vdiv255u16(const I256& x) noexcept {
  I256 y = vaddi16(x, v_const_as<I256>(blCommonTable.i256_0080008000800080));
  return vmulhu16(y, v_const_as<I256>(blCommonTable.i256_0101010101010101));
}
#endif

// ============================================================================
// [BLSIMD::F256]
// ============================================================================

#if defined(BL_TARGET_OPT_AVX)
BL_INLINE F256 vzerof256() noexcept { return _mm256_setzero_ps(); }

BL_INLINE F256 vsetf256(float x) noexcept { return _mm256_set1_ps(x); }
BL_INLINE F256 vsetf256(float x1, float x0) noexcept { return _mm256_set_ps(x1, x0, x1, x0, x1, x0, x1, x0); }
BL_INLINE F256 vsetf256(float x3, float x2, float x1, float x0) noexcept { return _mm256_set_ps(x3, x2, x1, x0, x3, x2, x1, x0); }
BL_INLINE F256 vsetf256(float x7, float x6, float x5, float x4, float x3, float x2, float x1, float x0) noexcept { return _mm256_set_ps(x7, x6, x5, x4, x3, x2, x1, x0); }

BL_INLINE F256 vcvtf32f256(float x) noexcept { return vcast<F256>(vcvtf32f128(x)); }
BL_INLINE float vcvtf256f32(const F256& x) noexcept { return vcvtf128f32(vcast<F128>(x)); }

BL_INLINE F256 vcvti32f256(int32_t x) noexcept { return vcast<F256>(vcvti32f128(x)); }
BL_INLINE int32_t vcvtf256i32(const F256& x) noexcept { return vcvtf128i32(vcast<F128>(x)); }
BL_INLINE int32_t vcvttf256i32(const F256& x) noexcept { return vcvttf128i32(vcast<F128>(x)); }

#if BL_TARGET_ARCH_BITS >= 64
BL_INLINE F256 vcvti64f256(int64_t x) noexcept { return vcast<F256>(vcvti64f128(x)); }
BL_INLINE int64_t vcvtf256i64(const F256& x) noexcept { return vcvtf128i64(vcast<F128>(x)); }
BL_INLINE int64_t vcvttf256i64(const F256& x) noexcept { return vcvttf128i64(vcast<F128>(x)); }
#endif

BL_INLINE I256 vcvtf256i256(const F256& x) noexcept { return _mm256_cvtps_epi32(x); }
BL_INLINE I256 vcvttf256i256(const F256& x) noexcept { return _mm256_cvttps_epi32(x); }

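// Both overloads widen only the low four floats to doubles; for a 256-bit
// input the upper 128 bits are intentionally ignored, as VCVTPS2PD reads an
// XMM source.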
BL_INLINE D256 vcvtf128d256(const F128& x) noexcept { return _mm256_cvtps_pd(vcast<F128>(x)); }
BL_INLINE D256 vcvtf256d256(const F256& x) noexcept { return _mm256_cvtps_pd(vcast<F128>(x)); }

template<int A, int B, int C, int D>
BL_INLINE F256 vshuff32(const F256& x, const F256& y) noexcept { return _mm256_shuffle_ps(x, y, _MM_SHUFFLE(A, B, C, D)); }
template<int A, int B, int C, int D>
BL_INLINE F256 vswizf32(const F256& x) noexcept { return vshuff32<A, B, C, D>(x, x); }

template<int A, int B>
BL_INLINE F256 vswizf64(const F256& x) noexcept { return vshuff32<A*2 + 1, A*2, B*2 + 1, B*2>(x, x); }

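// VPERM2F128 immediate encoding: `A` selects the destination's high 128-bit
// half and `B` the low half, where selectors 0/1 address the low/high half of
// `x` and 2/3 the low/high half of `y` (bit 3 of a selector zeroes that half).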
template<int A, int B>
BL_INLINE F256 vpermf128(const F256& x, const F256& y) noexcept { return _mm256_permute2f128_ps(x, y, ((A & 0xF) << 4) + (B & 0xF)); }
template<int A, int B>
BL_INLINE F256 vpermf128(const F256& x) noexcept { return vpermf128<A, B>(x, x); }

BL_INLINE F256 vduplf32(const F256& x) noexcept { return vswizf32<2, 2, 0, 0>(x); }
BL_INLINE F256 vduphf32(const F256& x) noexcept { return vswizf32<3, 3, 1, 1>(x); }

BL_INLINE F256 vswapf64(const F256& x) noexcept { return vswizf64<0, 1>(x); }
BL_INLINE F256 vduplf64(const F256& x) noexcept { return vswizf64<0, 0>(x); }
BL_INLINE F256 vduphf64(const F256& x) noexcept { return vswizf64<1, 1>(x); }

BL_INLINE F256 vswapf128(const F256& x) noexcept { return vpermf128<0, 1>(x); }
BL_INLINE F256 vduplf128(const F128& x) noexcept { return vpermf128<0, 0>(vcast<F256>(x)); }
BL_INLINE F256 vduplf128(const F256& x) noexcept { return vpermf128<0, 0>(x); }
BL_INLINE F256 vduphf128(const F256& x) noexcept { return vpermf128<1, 1>(x); }

BL_INLINE F256 vunpacklf32(const F256& x, const F256& y) noexcept { return _mm256_unpacklo_ps(x, y); }
BL_INLINE F256 vunpackhf32(const F256& x, const F256& y) noexcept { return _mm256_unpackhi_ps(x, y); }

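// Broadcasts the lowest float across all eight lanes. AVX2 provides the
// register form of VBROADCASTSS; the AVX fallback emulates it with an in-lane
// swizzle followed by a cross-lane duplication.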
#if defined(BL_TARGET_OPT_AVX2)
BL_INLINE F256 vsplatf32f256(const F128& x) noexcept { return _mm256_broadcastss_ps(vcast<F128>(x)); }
BL_INLINE F256 vsplatf32f256(const F256& x) noexcept { return _mm256_broadcastss_ps(vcast<F128>(x)); }
#else
BL_INLINE F256 vsplatf32f256(const F128& x) noexcept { return vduplf128(vswizf32<0, 0, 0, 0>(vcast<F128>(x))); }
BL_INLINE F256 vsplatf32f256(const F256& x) noexcept { return vduplf128(vswizf32<0, 0, 0, 0>(vcast<F128>(x))); }
#endif

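// Bitwise helpers: vandnot_a(x, y) computes `~x & y`, vandnot_b(x, y) computes
// `x & ~y`, and vblendmask() selects `y` where `mask` bits are set and `x`
// where they are clear.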
BL_INLINE F256 vor(const F256& x, const F256& y) noexcept { return _mm256_or_ps(x, y); }
BL_INLINE F256 vxor(const F256& x, const F256& y) noexcept { return _mm256_xor_ps(x, y); }
BL_INLINE F256 vand(const F256& x, const F256& y) noexcept { return _mm256_and_ps(x, y); }
BL_INLINE F256 vandnot_a(const F256& x, const F256& y) noexcept { return _mm256_andnot_ps(x, y); }
BL_INLINE F256 vandnot_b(const F256& x, const F256& y) noexcept { return _mm256_andnot_ps(y, x); }
BL_INLINE F256 vblendmask(const F256& x, const F256& y, const F256& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); }

BL_INLINE F256 vaddss(const F256& x, const F256& y) noexcept { return vcast<F256>(vaddss(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vaddps(const F256& x, const F256& y) noexcept { return _mm256_add_ps(x, y); }

BL_INLINE F256 vsubss(const F256& x, const F256& y) noexcept { return vcast<F256>(vsubss(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vsubps(const F256& x, const F256& y) noexcept { return _mm256_sub_ps(x, y); }

BL_INLINE F256 vmulss(const F256& x, const F256& y) noexcept { return vcast<F256>(vmulss(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vmulps(const F256& x, const F256& y) noexcept { return _mm256_mul_ps(x, y); }

BL_INLINE F256 vdivss(const F256& x, const F256& y) noexcept { return vcast<F256>(vdivss(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vdivps(const F256& x, const F256& y) noexcept { return _mm256_div_ps(x, y); }

BL_INLINE F256 vminss(const F256& x, const F256& y) noexcept { return vcast<F256>(vminss(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vminps(const F256& x, const F256& y) noexcept { return _mm256_min_ps(x, y); }

BL_INLINE F256 vmaxss(const F256& x, const F256& y) noexcept { return vcast<F256>(vmaxss(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vmaxps(const F256& x, const F256& y) noexcept { return _mm256_max_ps(x, y); }

BL_INLINE F256 vcmpeqss(const F256& x, const F256& y) noexcept { return vcast<F256>(vcmpeqss(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vcmpeqps(const F256& x, const F256& y) noexcept { return _mm256_cmp_ps(x, y, _CMP_EQ_OQ); }

BL_INLINE F256 vcmpness(const F256& x, const F256& y) noexcept { return vcast<F256>(vcmpness(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vcmpneps(const F256& x, const F256& y) noexcept { return _mm256_cmp_ps(x, y, _CMP_NEQ_OQ); }

BL_INLINE F256 vcmpgess(const F256& x, const F256& y) noexcept { return vcast<F256>(vcmpgess(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vcmpgeps(const F256& x, const F256& y) noexcept { return _mm256_cmp_ps(x, y, _CMP_GE_OQ); }

BL_INLINE F256 vcmpgtss(const F256& x, const F256& y) noexcept { return vcast<F256>(vcmpgtss(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vcmpgtps(const F256& x, const F256& y) noexcept { return _mm256_cmp_ps(x, y, _CMP_GT_OQ); }

BL_INLINE F256 vcmpless(const F256& x, const F256& y) noexcept { return vcast<F256>(vcmpless(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vcmpleps(const F256& x, const F256& y) noexcept { return _mm256_cmp_ps(x, y, _CMP_LE_OQ); }

BL_INLINE F256 vcmpltss(const F256& x, const F256& y) noexcept { return vcast<F256>(vcmpltss(vcast<F128>(x), vcast<F128>(y))); }
BL_INLINE F256 vcmpltps(const F256& x, const F256& y) noexcept { return _mm256_cmp_ps(x, y, _CMP_LT_OQ); }

BL_INLINE F256 vsqrtss(const F256& x) noexcept { return vcast<F256>(vsqrtss(vcast<F128>(x))); }
BL_INLINE F256 vsqrtps(const F256& x) noexcept { return _mm256_sqrt_ps(x); }

BL_INLINE F256 vloadf256_32(const void* p) noexcept { return vcast<F256>(vloadf128_32(p)); }
BL_INLINE F256 vloadf256_64(const void* p) noexcept { return vcast<F256>(vloadf128_64(p)); }
BL_INLINE F256 vloadf256_128a(const void* p) noexcept { return vcast<F256>(vloadf128a(p)); }
BL_INLINE F256 vloadf256_128u(const void* p) noexcept { return vcast<F256>(vloadf128u(p)); }
BL_INLINE F256 vloadf256a(const void* p) noexcept { return _mm256_load_ps(static_cast<const float*>(p)); }
BL_INLINE F256 vloadf256u(const void* p) noexcept { return _mm256_loadu_ps(static_cast<const float*>(p)); }

BL_INLINE F256 vloadf256_l64(const F256& x, const void* p) noexcept { return vcast<F256>(vloadf128_l64(vcast<F128>(x), p)); }
BL_INLINE F256 vloadf256_h64(const F256& x, const void* p) noexcept { return vcast<F256>(vloadf128_h64(vcast<F128>(x), p)); }

BL_INLINE F128 vbroadcastf128_32(const void* p) noexcept { return vcast<F128>(_mm_broadcast_ss(static_cast<const float*>(p))); }
BL_INLINE F256 vbroadcastf256_32(const void* p) noexcept { return vcast<F256>(_mm256_broadcast_ss(static_cast<const float*>(p))); }
BL_INLINE F256 vbroadcastf256_64(const void* p) noexcept { return vcast<F256>(_mm256_broadcast_sd(static_cast<const double*>(p))); }
BL_INLINE F256 vbroadcastf256_128(const void* p) noexcept { return vcast<F256>(_mm256_broadcast_ps(static_cast<const __m128*>(p))); }

BL_INLINE void vstoref32(void* p, const F256& x) noexcept { vstoref32(p, vcast<F128>(x)); }
BL_INLINE void vstoref64(void* p, const F256& x) noexcept { vstoref64(p, vcast<F128>(x)); }
BL_INLINE void vstorelf64(void* p, const F256& x) noexcept { vstorelf64(p, vcast<F128>(x)); }
BL_INLINE void vstorehf64(void* p, const F256& x) noexcept { vstorehf64(p, vcast<F128>(x)); }
BL_INLINE void vstoref128a(void* p, const F256& x) noexcept { vstoref128a(p, vcast<F128>(x)); }
BL_INLINE void vstoref128u(void* p, const F256& x) noexcept { vstoref128u(p, vcast<F128>(x)); }
BL_INLINE void vstoref256a(void* p, const F256& x) noexcept { _mm256_store_ps(static_cast<float*>(p), x); }
BL_INLINE void vstoref256u(void* p, const F256& x) noexcept { _mm256_storeu_ps(static_cast<float*>(p), x); }

BL_INLINE bool vhasmaskf32(const F256& x, int bits0_7) noexcept { return _mm256_movemask_ps(vcast<F256>(x)) == bits0_7; }
BL_INLINE bool vhasmaskf64(const F256& x, int bits0_3) noexcept { return _mm256_movemask_pd(vcast<D256>(x)) == bits0_3; }
#endif

// ============================================================================
// [BLSIMD::D256]
// ============================================================================

#if defined(BL_TARGET_OPT_AVX)
BL_INLINE D256 vzerod256() noexcept { return _mm256_setzero_pd(); }
BL_INLINE D256 vsetd256(double x) noexcept { return _mm256_set1_pd(x); }
BL_INLINE D256 vsetd256(double x1, double x0) noexcept { return _mm256_set_pd(x1, x0, x1, x0); }
BL_INLINE D256 vsetd256(double x3, double x2, double x1, double x0) noexcept { return _mm256_set_pd(x3, x2, x1, x0); }

BL_INLINE D256 vcvtd64d256(double x) noexcept { return vcast<D256>(vcvtd64d128(x)); }
BL_INLINE double vcvtd256d64(const D256& x) noexcept { return vcvtd128d64(vcast<D128>(x)); }

BL_INLINE D256 vcvti32d256(int32_t x) noexcept { return vcast<D256>(vcvti32d128(x)); }
BL_INLINE int32_t vcvtd256i32(const D256& x) noexcept { return vcvtd128i32(vcast<D128>(x)); }
BL_INLINE int32_t vcvttd256i32(const D256& x) noexcept { return vcvttd128i32(vcast<D128>(x)); }

#if BL_TARGET_ARCH_BITS >= 64
BL_INLINE D256 vcvti64d256(int64_t x) noexcept { return vcast<D256>(vcvti64d128(x)); }
BL_INLINE int64_t vcvtd256i64(const D256& x) noexcept { return vcvtd128i64(vcast<D128>(x)); }
BL_INLINE int64_t vcvttd256i64(const D256& x) noexcept { return vcvttd128i64(vcast<D128>(x)); }
#endif

BL_INLINE I128 vcvtd256i128(const D256& x) noexcept { return vcast<I128>(_mm256_cvtpd_epi32(x)); }
BL_INLINE I256 vcvtd256i256(const D256& x) noexcept { return vcast<I256>(_mm256_cvtpd_epi32(x)); }

BL_INLINE I128 vcvttd256i128(const D256& x) noexcept { return vcast<I128>(_mm256_cvttpd_epi32(x)); }
BL_INLINE I256 vcvttd256i256(const D256& x) noexcept { return vcast<I256>(_mm256_cvttpd_epi32(x)); }

BL_INLINE F128 vcvtd256f128(const D256& x) noexcept { return vcast<F128>(_mm256_cvtpd_ps(x)); }
BL_INLINE F256 vcvtd256f256(const D256& x) noexcept { return vcast<F256>(_mm256_cvtpd_ps(x)); }

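// The shuffle immediate replicates the <A, B> selection into both 128-bit
// halves, so vswizd64<A, B> applies the same swizzle to each half of the
// vector independently.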
template<int A, int B>
BL_INLINE D256 vshufd64(const D256& x, const D256& y) noexcept { return _mm256_shuffle_pd(x, y, (A << 3) | (B << 2) | (A << 1) | B); }
template<int A, int B>
BL_INLINE D256 vswizd64(const D256& x) noexcept { return vshufd64<A, B>(x, x); }

template<int A, int B>
BL_INLINE D256 vpermd128(const D256& x, const D256& y) noexcept { return _mm256_permute2f128_pd(x, y, ((A & 0xF) << 4) + (B & 0xF)); }
template<int A, int B>
BL_INLINE D256 vpermd128(const D256& x) noexcept { return vpermd128<A, B>(x, x); }

BL_INLINE D256 vswapd64(const D256& x) noexcept { return vswizd64<0, 1>(x); }
BL_INLINE D256 vdupld64(const D256& x) noexcept { return vswizd64<0, 0>(x); }
BL_INLINE D256 vduphd64(const D256& x) noexcept { return vswizd64<1, 1>(x); }

BL_INLINE D256 vswapd128(const D256& x) noexcept { return vpermd128<0, 1>(x); }
BL_INLINE D256 vdupld128(const D128& x) noexcept { return vpermd128<0, 0>(vcast<D256>(x)); }
BL_INLINE D256 vdupld128(const D256& x) noexcept { return vpermd128<0, 0>(x); }
BL_INLINE D256 vduphd128(const D256& x) noexcept { return vpermd128<1, 1>(x); }

BL_INLINE D256 vunpackld64(const D256& x, const D256& y) noexcept { return _mm256_unpacklo_pd(x, y); }
BL_INLINE D256 vunpackhd64(const D256& x, const D256& y) noexcept { return _mm256_unpackhi_pd(x, y); }

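// Same pattern as vsplatf32f256(): AVX2 uses the register form of
// VBROADCASTSD, while the AVX fallback combines an in-lane swizzle with a
// cross-lane duplication.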
#if defined(BL_TARGET_OPT_AVX2)
BL_INLINE D256 vsplatd64d256(const D128& x) noexcept { return _mm256_broadcastsd_pd(vcast<D128>(x)); }
BL_INLINE D256 vsplatd64d256(const D256& x) noexcept { return _mm256_broadcastsd_pd(vcast<D128>(x)); }
#else
BL_INLINE D256 vsplatd64d256(const D128& x) noexcept { return vdupld128(vswizd64<0, 0>(vcast<D128>(x))); }
BL_INLINE D256 vsplatd64d256(const D256& x) noexcept { return vdupld128(vswizd64<0, 0>(vcast<D128>(x))); }
#endif

BL_INLINE D256 vor(const D256& x, const D256& y) noexcept { return _mm256_or_pd(x, y); }
BL_INLINE D256 vxor(const D256& x, const D256& y) noexcept { return _mm256_xor_pd(x, y); }
BL_INLINE D256 vand(const D256& x, const D256& y) noexcept { return _mm256_and_pd(x, y); }
BL_INLINE D256 vandnot_a(const D256& x, const D256& y) noexcept { return _mm256_andnot_pd(x, y); }
BL_INLINE D256 vandnot_b(const D256& x, const D256& y) noexcept { return _mm256_andnot_pd(y, x); }
BL_INLINE D256 vblendmask(const D256& x, const D256& y, const D256& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); }

BL_INLINE D256 vaddsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vaddsd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vaddpd(const D256& x, const D256& y) noexcept { return _mm256_add_pd(x, y); }

BL_INLINE D256 vsubsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vsubsd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vsubpd(const D256& x, const D256& y) noexcept { return _mm256_sub_pd(x, y); }

BL_INLINE D256 vmulsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vmulsd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vmulpd(const D256& x, const D256& y) noexcept { return _mm256_mul_pd(x, y); }

BL_INLINE D256 vdivsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vdivsd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vdivpd(const D256& x, const D256& y) noexcept { return _mm256_div_pd(x, y); }

BL_INLINE D256 vminsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vminsd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vminpd(const D256& x, const D256& y) noexcept { return _mm256_min_pd(x, y); }

BL_INLINE D256 vmaxsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vmaxsd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vmaxpd(const D256& x, const D256& y) noexcept { return _mm256_max_pd(x, y); }

BL_INLINE D256 vcmpeqsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vcmpeqsd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vcmpeqpd(const D256& x, const D256& y) noexcept { return _mm256_cmp_pd(x, y, _CMP_EQ_OQ); }

BL_INLINE D256 vcmpnesd(const D256& x, const D256& y) noexcept { return vcast<D256>(vcmpnesd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vcmpnepd(const D256& x, const D256& y) noexcept { return _mm256_cmp_pd(x, y, _CMP_NEQ_OQ); }

BL_INLINE D256 vcmpgesd(const D256& x, const D256& y) noexcept { return vcast<D256>(vcmpgesd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vcmpgepd(const D256& x, const D256& y) noexcept { return _mm256_cmp_pd(x, y, _CMP_GE_OQ); }

BL_INLINE D256 vcmpgtsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vcmpgtsd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vcmpgtpd(const D256& x, const D256& y) noexcept { return _mm256_cmp_pd(x, y, _CMP_GT_OQ); }

BL_INLINE D256 vcmplesd(const D256& x, const D256& y) noexcept { return vcast<D256>(vcmplesd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vcmplepd(const D256& x, const D256& y) noexcept { return _mm256_cmp_pd(x, y, _CMP_LE_OQ); }

BL_INLINE D256 vcmpltsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vcmpltsd(vcast<D128>(x), vcast<D128>(y))); }
BL_INLINE D256 vcmpltpd(const D256& x, const D256& y) noexcept { return _mm256_cmp_pd(x, y, _CMP_LT_OQ); }

BL_INLINE D256 vsqrtsd(const D256& x) noexcept { return vcast<D256>(vsqrtsd(vcast<D128>(x))); }
BL_INLINE D256 vsqrtpd(const D256& x) noexcept { return _mm256_sqrt_pd(x); }

BL_INLINE D256 vloadd256_64(const void* p) noexcept { return vcast<D256>(vloadd128_64(p)); }
BL_INLINE D256 vloadd256_128a(const void* p) noexcept { return vcast<D256>(vloadd128a(p)); }
BL_INLINE D256 vloadd256_128u(const void* p) noexcept { return vcast<D256>(vloadd128u(p)); }
BL_INLINE D256 vloadd256a(const void* p) noexcept { return _mm256_load_pd(static_cast<const double*>(p)); }
BL_INLINE D256 vloadd256u(const void* p) noexcept { return _mm256_loadu_pd(static_cast<const double*>(p)); }

BL_INLINE D256 vloadd256_l64(const D256& x, const void* p) noexcept { return vcast<D256>(vloadd128_l64(vcast<D128>(x), p)); }
BL_INLINE D256 vloadd256_h64(const D256& x, const void* p) noexcept { return vcast<D256>(vloadd128_h64(vcast<D128>(x), p)); }

BL_INLINE D256 vbroadcastd256_64(const void* p) noexcept { return _mm256_broadcast_sd(static_cast<const double*>(p)); }
BL_INLINE D256 vbroadcastd256_128(const void* p) noexcept { return _mm256_broadcast_pd(static_cast<const __m128d*>(p)); }

BL_INLINE void vstored64(void* p, const D256& x) noexcept { vstored64(p, vcast<D128>(x)); }
BL_INLINE void vstoreld64(void* p, const D256& x) noexcept { vstoreld64(p, vcast<D128>(x)); }
BL_INLINE void vstorehd64(void* p, const D256& x) noexcept { vstorehd64(p, vcast<D128>(x)); }
BL_INLINE void vstored128a(void* p, const D256& x) noexcept { vstored128a(p, vcast<D128>(x)); }
BL_INLINE void vstored128u(void* p, const D256& x) noexcept { vstored128u(p, vcast<D128>(x)); }
BL_INLINE void vstored256a(void* p, const D256& x) noexcept { _mm256_store_pd(static_cast<double*>(p), x); }
BL_INLINE void vstored256u(void* p, const D256& x) noexcept { _mm256_storeu_pd(static_cast<double*>(p), x); }

BL_INLINE bool vhasmaskd64(const D256& x, int bits0_3) noexcept { return _mm256_movemask_pd(vcast<D256>(x)) == bits0_3; }
#endif

#endif

} // {anonymous}
} // {SIMD}

//! \}
//! \endcond

#endif // BLEND2D_BLSIMD_X86_P_H