1 | // [Blend2D] |
2 | // 2D Vector Graphics Powered by a JIT Compiler. |
3 | // |
4 | // [License] |
5 | // Zlib - See LICENSE.md file in the package. |
6 | |
7 | #ifndef BLEND2D_BLSIMD_X86_P_H |
8 | #define BLEND2D_BLSIMD_X86_P_H |
9 | |
10 | #include "./blsupport_p.h" |
11 | #include "./bltables_p.h" |
12 | |
13 | #if defined(_MSC_VER) |
14 | #include <intrin.h> |
15 | #endif |
16 | |
17 | #if defined(BL_TARGET_OPT_SSE) |
18 | #include <xmmintrin.h> |
19 | #endif |
20 | |
21 | #if defined(BL_TARGET_OPT_SSE2) |
22 | #include <emmintrin.h> |
23 | #endif |
24 | |
#if defined(BL_TARGET_OPT_SSE3)
26 | #include <pmmintrin.h> |
27 | #endif |
28 | |
29 | #if defined(BL_TARGET_OPT_SSSE3) |
30 | #include <tmmintrin.h> |
31 | #endif |
32 | |
33 | #if defined(BL_TARGET_OPT_SSE4_1) |
34 | #include <smmintrin.h> |
35 | #endif |
36 | |
37 | #if defined(BL_TARGET_OPT_SSE4_2) |
38 | #include <nmmintrin.h> |
39 | #endif |
40 | |
41 | #if defined(BL_TARGET_OPT_AVX) || defined(BL_TARGET_OPT_AVX2) |
42 | #include <immintrin.h> |
43 | #endif |
44 | |
45 | #if defined(BL_TARGET_OPT_NEON) |
46 | #include <arm_neon.h> |
47 | #endif |
48 | |
49 | //! \cond INTERNAL |
50 | //! \addtogroup blend2d_internal |
51 | //! \{ |
52 | |
//! SIMD namespace contains helper functions to access SIMD intrinsics. The
//! names of these functions correspond to the names of functions used by the
//! pipeline generator (BLPipe).
56 | namespace SIMD { |
57 | |
58 | // ============================================================================ |
59 | // [BLSIMD - Features] |
60 | // ============================================================================ |
61 | |
62 | #if defined(BL_TARGET_OPT_AVX2) |
63 | #define BL_TARGET_SIMD_I 256 |
64 | #define BL_TARGET_SIMD_F 256 |
65 | #define BL_TARGET_SIMD_D 256 |
66 | #elif defined(BL_TARGET_OPT_AVX) |
67 | #define BL_TARGET_SIMD_I 128 |
68 | #define BL_TARGET_SIMD_F 256 |
69 | #define BL_TARGET_SIMD_D 256 |
70 | #elif defined(BL_TARGET_OPT_SSE2) |
71 | #define BL_TARGET_SIMD_I 128 |
72 | #define BL_TARGET_SIMD_F 128 |
73 | #define BL_TARGET_SIMD_D 128 |
74 | #else |
75 | #define BL_TARGET_SIMD_I 0 |
76 | #define BL_TARGET_SIMD_F 0 |
77 | #define BL_TARGET_SIMD_D 0 |
78 | #endif |
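
// BL_TARGET_SIMD_{I|F|D} advertise the widest vector width (in bits) natively
// available for integer, float, and double operations, respectively. Plain
// AVX only widens FP operations; integer code stays 128-bit until AVX2. Code
// can be staged on these macros at compile time, for example:
//
//   #if BL_TARGET_SIMD_I >= 256
//     // Process 8 pixels per iteration using I256.
//   #else
//     // Process 4 pixels per iteration using I128.
//   #endif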
79 | |
80 | // ============================================================================ |
81 | // [BLSIMD - Types] |
82 | // ============================================================================ |
83 | |
84 | #if defined(BL_TARGET_OPT_SSE2) |
85 | typedef __m128i I128; |
86 | typedef __m128 F128; |
87 | typedef __m128d D128; |
88 | #endif |
89 | |
// 256-bit types (including integers) are accessible through AVX, as AVX also
// includes conversion instructions between integer and FP types.
92 | #if defined(BL_TARGET_OPT_AVX) |
93 | typedef __m256i I256; |
94 | typedef __m256 F256; |
95 | typedef __m256d D256; |
96 | #endif |
97 | |
// Must be in an anonymous namespace, as these helpers are compiled into
// multiple translation units with different optimization flags.
99 | namespace { |
100 | |
101 | // ============================================================================ |
102 | // [BLSIMD - Cast] |
103 | // ============================================================================ |
104 | |
105 | template<typename Out, typename In> |
106 | BL_INLINE const Out& v_const_as(const In* c) noexcept { |
107 | return *reinterpret_cast<const Out*>(c); |
108 | } |
109 | |
110 | template<typename DstT, typename SrcT> |
111 | BL_INLINE DstT vcast(const SrcT& x) noexcept { return x; } |
112 | |
113 | #if defined(BL_TARGET_OPT_SSE2) |
114 | template<> BL_INLINE F128 vcast(const I128& x) noexcept { return _mm_castsi128_ps(x); } |
115 | template<> BL_INLINE D128 vcast(const I128& x) noexcept { return _mm_castsi128_pd(x); } |
116 | template<> BL_INLINE I128 vcast(const F128& x) noexcept { return _mm_castps_si128(x); } |
117 | template<> BL_INLINE D128 vcast(const F128& x) noexcept { return _mm_castps_pd(x); } |
118 | template<> BL_INLINE I128 vcast(const D128& x) noexcept { return _mm_castpd_si128(x); } |
119 | template<> BL_INLINE F128 vcast(const D128& x) noexcept { return _mm_castpd_ps(x); } |
120 | #endif |
121 | |
122 | #if defined(BL_TARGET_OPT_AVX) |
123 | template<> BL_INLINE I128 vcast(const I256& x) noexcept { return _mm256_castsi256_si128(x); } |
124 | template<> BL_INLINE I256 vcast(const I128& x) noexcept { return _mm256_castsi128_si256(x); } |
125 | |
126 | template<> BL_INLINE F128 vcast(const F256& x) noexcept { return _mm256_castps256_ps128(x); } |
127 | template<> BL_INLINE F256 vcast(const F128& x) noexcept { return _mm256_castps128_ps256(x); } |
128 | |
129 | template<> BL_INLINE D128 vcast(const D256& x) noexcept { return _mm256_castpd256_pd128(x); } |
130 | template<> BL_INLINE D256 vcast(const D128& x) noexcept { return _mm256_castpd128_pd256(x); } |
131 | |
132 | template<> BL_INLINE D256 vcast(const F256& x) noexcept { return _mm256_castps_pd(x); } |
133 | template<> BL_INLINE F256 vcast(const D256& x) noexcept { return _mm256_castpd_ps(x); } |
134 | |
135 | template<> BL_INLINE F256 vcast(const I256& x) noexcept { return _mm256_castsi256_ps(x); } |
136 | template<> BL_INLINE I256 vcast(const F256& x) noexcept { return _mm256_castps_si256(x); } |
137 | |
138 | template<> BL_INLINE D256 vcast(const I256& x) noexcept { return _mm256_castsi256_pd(x); } |
139 | template<> BL_INLINE I256 vcast(const D256& x) noexcept { return _mm256_castpd_si256(x); } |
140 | #endif |
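
// All vcast() specializations are bitwise register reinterpretations and
// generate no instructions. Note that the 128-bit to 256-bit casts leave the
// upper half of the destination undefined.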
141 | |
142 | // ============================================================================ |
143 | // [BLSIMD - I128] |
144 | // ============================================================================ |
145 | |
146 | #if defined(BL_TARGET_OPT_SSE2) |
147 | BL_INLINE I128 vzeroi128() noexcept { return _mm_setzero_si128(); } |
148 | |
149 | BL_INLINE I128 vseti128i8(int8_t x) noexcept { return _mm_set1_epi8(x); } |
150 | BL_INLINE I128 vseti128i16(int16_t x) noexcept { return _mm_set1_epi16(x); } |
151 | BL_INLINE I128 vseti128i32(int32_t x) noexcept { return _mm_set1_epi32(x); } |
152 | |
153 | BL_INLINE I128 vseti128i32(int32_t x1, int32_t x0) noexcept { return _mm_set_epi32(x1, x0, x1, x0); } |
154 | BL_INLINE I128 vseti128i32(int32_t x3, int32_t x2, int32_t x1, int32_t x0) noexcept { return _mm_set_epi32(x3, x2, x1, x0); } |
155 | |
156 | BL_INLINE I128 vseti128i64(int64_t x) noexcept { |
157 | #if BL_TARGET_ARCH_BITS >= 64 |
158 | return _mm_set1_epi64x(x); |
159 | #else |
160 | return vseti128i32(int32_t(uint64_t(x) >> 32), int32_t(x & 0xFFFFFFFFu)); |
161 | #endif |
162 | } |
163 | |
164 | BL_INLINE I128 vseti128i64(int64_t x1, int64_t x0) noexcept { |
165 | return vseti128i32(int32_t(uint64_t(x1) >> 32), int32_t(x1 & 0xFFFFFFFFu), |
166 | int32_t(uint64_t(x0) >> 32), int32_t(x0 & 0xFFFFFFFFu)); |
167 | } |
168 | |
169 | BL_INLINE I128 vseti128u8(uint8_t x) noexcept { return vseti128i8(int8_t(x)); } |
170 | BL_INLINE I128 vseti128u16(uint16_t x) noexcept { return vseti128i16(int16_t(x)); } |
171 | BL_INLINE I128 vseti128u32(uint32_t x) noexcept { return vseti128i32(int32_t(x)); } |
172 | BL_INLINE I128 vseti128u32(uint32_t x1, uint32_t x0) noexcept { return vseti128i32(int32_t(x1), int32_t(x0), int32_t(x1), int32_t(x0)); } |
173 | BL_INLINE I128 vseti128u32(uint32_t x3, uint32_t x2, uint32_t x1, uint32_t x0) noexcept { return vseti128i32(int32_t(x3), int32_t(x2), int32_t(x1), int32_t(x0)); } |
174 | BL_INLINE I128 vseti128u64(uint64_t x) noexcept { return vseti128i64(int64_t(x)); } |
175 | BL_INLINE I128 vseti128u64(uint64_t x1, uint64_t x0) noexcept { return vseti128i64(int64_t(x1), int64_t(x0)); } |
176 | |
177 | BL_INLINE I128 vcvti32i128(int32_t x) noexcept { return _mm_cvtsi32_si128(int(x)); } |
178 | BL_INLINE I128 vcvtu32i128(uint32_t x) noexcept { return _mm_cvtsi32_si128(int(x)); } |
179 | |
180 | BL_INLINE int32_t vcvti128i32(const I128& x) noexcept { return int32_t(_mm_cvtsi128_si32(x)); } |
181 | BL_INLINE uint32_t vcvti128u32(const I128& x) noexcept { return uint32_t(_mm_cvtsi128_si32(x)); } |
182 | |
183 | BL_INLINE I128 vcvti64i128(int64_t x) noexcept { |
184 | #if BL_TARGET_ARCH_BITS >= 64 |
185 | return _mm_cvtsi64_si128(x); |
186 | #else |
187 | return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&x)); |
188 | #endif |
189 | } |
190 | |
191 | BL_INLINE int64_t vcvti128i64(const I128& x) noexcept { |
192 | #if BL_TARGET_ARCH_BITS >= 64 |
193 | return int64_t(_mm_cvtsi128_si64(x)); |
194 | #else |
195 | int64_t result; |
196 | _mm_storel_epi64(reinterpret_cast<__m128i*>(&result), x); |
197 | return result; |
198 | #endif |
199 | } |
200 | |
201 | BL_INLINE I128 vcvtu64i128(uint64_t x) noexcept { return vcvti64i128(int64_t(x)); } |
202 | BL_INLINE uint64_t vcvti128u64(const I128& x) noexcept { return uint64_t(vcvti128i64(x)); } |
203 | |
204 | template<uint8_t A, uint8_t B, uint8_t C, uint8_t D> |
205 | BL_INLINE I128 vswizli16(const I128& x) noexcept { return _mm_shufflelo_epi16(x, _MM_SHUFFLE(A, B, C, D)); } |
206 | template<uint8_t A, uint8_t B, uint8_t C, uint8_t D> |
207 | BL_INLINE I128 vswizhi16(const I128& x) noexcept { return _mm_shufflehi_epi16(x, _MM_SHUFFLE(A, B, C, D)); } |
208 | |
209 | template<uint8_t A, uint8_t B, uint8_t C, uint8_t D> |
210 | BL_INLINE I128 vswizi16(const I128& x) noexcept { return vswizhi16<A, B, C, D>(vswizli16<A, B, C, D>(x)); } |
211 | template<uint8_t A, uint8_t B, uint8_t C, uint8_t D> |
212 | BL_INLINE I128 vswizi32(const I128& x) noexcept { return _mm_shuffle_epi32(x, _MM_SHUFFLE(A, B, C, D)); } |
213 | template<int A, int B> |
214 | BL_INLINE I128 vswizi64(const I128& x) noexcept { return vswizi32<A*2 + 1, A*2, B*2 + 1, B*2>(x); } |
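
// vswizi64 expands each 64-bit lane selector A into the 32-bit selector pair
// (2*A+1, 2*A), so e.g. vswizi64<0, 1>(x) is a single `pshufd` that swaps
// both halves (see vswapi64 below).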
215 | |
216 | #if defined(BL_TARGET_OPT_SSSE3) |
217 | BL_INLINE I128 vpshufb(const I128& x, const I128& y) noexcept { return _mm_shuffle_epi8(x, y); } |
218 | |
219 | template<int N_BYTES> |
220 | BL_INLINE I128 vpalignr(const I128& x, const I128& y) noexcept { return _mm_alignr_epi8(x, y, N_BYTES); } |
221 | #endif |
222 | |
223 | BL_INLINE I128 vswapi64(const I128& x) noexcept { return vswizi64<0, 1>(x); } |
224 | BL_INLINE I128 vdupli64(const I128& x) noexcept { return vswizi64<0, 0>(x); } |
225 | BL_INLINE I128 vduphi64(const I128& x) noexcept { return vswizi64<1, 1>(x); } |
226 | |
227 | BL_INLINE I128 vmovli64u8u16(const I128& x) noexcept { |
228 | #if defined(BL_TARGET_OPT_SSE4_1) |
229 | return _mm_cvtepu8_epi16(x); |
230 | #else |
231 | return _mm_unpacklo_epi8(x, _mm_setzero_si128()); |
232 | #endif |
233 | } |
234 | |
235 | BL_INLINE I128 vmovli64u16u32(const I128& x) noexcept { |
236 | #if defined(BL_TARGET_OPT_SSE4_1) |
237 | return _mm_cvtepu16_epi32(x); |
238 | #else |
239 | return _mm_unpacklo_epi16(x, _mm_setzero_si128()); |
240 | #endif |
241 | } |
242 | |
243 | BL_INLINE I128 vmovli64u32u64(const I128& x) noexcept { |
244 | #if defined(BL_TARGET_OPT_SSE4_1) |
245 | return _mm_cvtepu32_epi64(x); |
246 | #else |
247 | return _mm_unpacklo_epi32(x, _mm_setzero_si128()); |
248 | #endif |
249 | } |
250 | |
251 | BL_INLINE I128 vmovhi64u8u16(const I128& x) noexcept { return _mm_unpackhi_epi8(x, _mm_setzero_si128()); } |
252 | BL_INLINE I128 vmovhi64u16u32(const I128& x) noexcept { return _mm_unpackhi_epi16(x, _mm_setzero_si128()); } |
253 | BL_INLINE I128 vmovhi64u32u64(const I128& x) noexcept { return _mm_unpackhi_epi32(x, _mm_setzero_si128()); } |
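
// The vmovl* helpers above zero-extend the low half of a vector, using SSE4.1
// `pmovzx` when available and interleaving with a zero register otherwise.
// The vmovh* variants always interleave, as `pmovzx` only reads the low lanes.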
254 | |
255 | BL_INLINE I128 vpacki16i8(const I128& x, const I128& y) noexcept { return _mm_packs_epi16(x, y); } |
256 | BL_INLINE I128 vpacki16u8(const I128& x, const I128& y) noexcept { return _mm_packus_epi16(x, y); } |
257 | BL_INLINE I128 vpacki32i16(const I128& x, const I128& y) noexcept { return _mm_packs_epi32(x, y); } |
258 | |
259 | BL_INLINE I128 vpacki16i8(const I128& x) noexcept { return vpacki16i8(x, x); } |
260 | BL_INLINE I128 vpacki16u8(const I128& x) noexcept { return vpacki16u8(x, x); } |
261 | BL_INLINE I128 vpacki32i16(const I128& x) noexcept { return vpacki32i16(x, x); } |
262 | |
263 | BL_INLINE I128 vpacki32u16(const I128& x, const I128& y) noexcept { |
264 | #if defined(BL_TARGET_OPT_SSE4_1) |
265 | return _mm_packus_epi32(x, y); |
266 | #else |
267 | I128 xShifted = _mm_srai_epi32(_mm_slli_epi32(x, 16), 16); |
268 | I128 yShifted = _mm_srai_epi32(_mm_slli_epi32(y, 16), 16); |
269 | return _mm_packs_epi32(xShifted, yShifted); |
270 | #endif |
271 | } |
272 | |
273 | BL_INLINE I128 vpacki32u16(const I128& x) noexcept { |
274 | #if defined(BL_TARGET_OPT_SSE4_1) |
275 | return vpacki32u16(x, x); |
276 | #else |
277 | I128 xShifted = _mm_srai_epi32(_mm_slli_epi32(x, 16), 16); |
278 | return _mm_packs_epi32(xShifted, xShifted); |
279 | #endif |
280 | } |
281 | |
282 | BL_INLINE I128 vpacki32i8(const I128& x) noexcept { return vpacki16i8(vpacki32i16(x)); } |
283 | BL_INLINE I128 vpacki32i8(const I128& x, const I128& y) noexcept { return vpacki16i8(vpacki32i16(x, y)); } |
284 | BL_INLINE I128 vpacki32i8(const I128& x, const I128& y, const I128& z, const I128& w) noexcept { return vpacki16i8(vpacki32i16(x, y), vpacki32i16(z, w)); } |
285 | |
286 | BL_INLINE I128 vpacki32u8(const I128& x) noexcept { return vpacki16u8(vpacki32i16(x)); } |
287 | BL_INLINE I128 vpacki32u8(const I128& x, const I128& y) noexcept { return vpacki16u8(vpacki32i16(x, y)); } |
288 | BL_INLINE I128 vpacki32u8(const I128& x, const I128& y, const I128& z, const I128& w) noexcept { return vpacki16u8(vpacki32i16(x, y), vpacki32i16(z, w)); } |
289 | |
// These assume that the high part of each input element is always zero, so
// the implementation can choose between packing with signed/unsigned
// saturation and byte swizzling, whichever is cheaper.
292 | BL_INLINE I128 vpackzzwb(const I128& x) noexcept { return vpacki16u8(x); } |
293 | BL_INLINE I128 vpackzzwb(const I128& x, const I128& y) noexcept { return vpacki16u8(x, y); } |
294 | |
295 | BL_INLINE I128 vpackzzdw(const I128& x) noexcept { |
296 | #if defined(BL_TARGET_OPT_SSE4_1) || !defined(BL_TARGET_OPT_SSSE3) |
297 | return vpacki32u16(x); |
298 | #else |
299 | return vpshufb(x, v_const_as<I128>(blCommonTable.i128_pshufb_u32_to_u16_lo)); |
300 | #endif |
301 | } |
302 | |
303 | BL_INLINE I128 vpackzzdw(const I128& x, const I128& y) noexcept { |
304 | #if defined(BL_TARGET_OPT_SSE4_1) || !defined(BL_TARGET_OPT_SSSE3) |
305 | return vpacki32u16(x, y); |
306 | #else |
307 | I128 xLo = vpshufb(x, v_const_as<I128>(blCommonTable.i128_pshufb_u32_to_u16_lo)); |
308 | I128 yLo = vpshufb(y, v_const_as<I128>(blCommonTable.i128_pshufb_u32_to_u16_lo)); |
309 | return _mm_unpacklo_epi64(xLo, yLo); |
310 | #endif |
311 | } |
312 | |
313 | BL_INLINE I128 vpackzzdb(const I128& x) noexcept { |
314 | #if defined(BL_TARGET_OPT_SSSE3) |
315 | return vpshufb(x, v_const_as<I128>(blCommonTable.i128_pshufb_u32_to_u8_lo)); |
316 | #else |
317 | return vpacki16u8(vpacki32i16(x)); |
318 | #endif |
319 | } |
320 | |
321 | BL_INLINE I128 vpackzzdb(const I128& x, const I128& y) noexcept { return vpacki16u8(vpacki32i16(x, y)); } |
322 | BL_INLINE I128 vpackzzdb(const I128& x, const I128& y, const I128& z, const I128& w) noexcept { return vpacki16u8(vpacki32i16(x, y), vpacki32i16(z, w)); } |
323 | |
324 | BL_INLINE I128 vunpackli8(const I128& x, const I128& y) noexcept { return _mm_unpacklo_epi8(x, y); } |
325 | BL_INLINE I128 vunpackhi8(const I128& x, const I128& y) noexcept { return _mm_unpackhi_epi8(x, y); } |
326 | |
327 | BL_INLINE I128 vunpackli16(const I128& x, const I128& y) noexcept { return _mm_unpacklo_epi16(x, y); } |
328 | BL_INLINE I128 vunpackhi16(const I128& x, const I128& y) noexcept { return _mm_unpackhi_epi16(x, y); } |
329 | |
330 | BL_INLINE I128 vunpackli32(const I128& x, const I128& y) noexcept { return _mm_unpacklo_epi32(x, y); } |
331 | BL_INLINE I128 vunpackhi32(const I128& x, const I128& y) noexcept { return _mm_unpackhi_epi32(x, y); } |
332 | |
333 | BL_INLINE I128 vunpackli64(const I128& x, const I128& y) noexcept { return _mm_unpacklo_epi64(x, y); } |
334 | BL_INLINE I128 vunpackhi64(const I128& x, const I128& y) noexcept { return _mm_unpackhi_epi64(x, y); } |
335 | |
336 | BL_INLINE I128 vor(const I128& x, const I128& y) noexcept { return _mm_or_si128(x, y); } |
337 | BL_INLINE I128 vxor(const I128& x, const I128& y) noexcept { return _mm_xor_si128(x, y); } |
338 | BL_INLINE I128 vand(const I128& x, const I128& y) noexcept { return _mm_and_si128(x, y); } |
339 | BL_INLINE I128 vandnot_a(const I128& x, const I128& y) noexcept { return _mm_andnot_si128(x, y); } |
340 | BL_INLINE I128 vandnot_b(const I128& x, const I128& y) noexcept { return _mm_andnot_si128(y, x); } |
341 | BL_INLINE I128 vblendmask(const I128& x, const I128& y, const I128& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); } |
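
// vblendmask() computes `(x & ~mask) | (y & mask)`, selecting bits of `y`
// where `mask` is set and bits of `x` elsewhere. The mask is typically all
// zeros or all ones per element, as produced by the vcmp functions.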
342 | |
//! Blends bits or bytes, taking advantage of `pblendvb` (SSE4.1) when available.
344 | BL_INLINE I128 vblendx(const I128& x, const I128& y, const I128& mask) noexcept { |
345 | #if defined(BL_TARGET_OPT_SSE4_1) |
346 | return _mm_blendv_epi8(x, y, mask); |
347 | #else |
348 | return vblendmask(x, y, mask); |
349 | #endif |
350 | } |
351 | |
352 | BL_INLINE I128 vaddi8(const I128& x, const I128& y) noexcept { return _mm_add_epi8(x, y); } |
353 | BL_INLINE I128 vaddi16(const I128& x, const I128& y) noexcept { return _mm_add_epi16(x, y); } |
354 | BL_INLINE I128 vaddi32(const I128& x, const I128& y) noexcept { return _mm_add_epi32(x, y); } |
355 | BL_INLINE I128 vaddi64(const I128& x, const I128& y) noexcept { return _mm_add_epi64(x, y); } |
356 | |
357 | BL_INLINE I128 vaddsi8(const I128& x, const I128& y) noexcept { return _mm_adds_epi8(x, y); } |
358 | BL_INLINE I128 vaddsu8(const I128& x, const I128& y) noexcept { return _mm_adds_epu8(x, y); } |
359 | BL_INLINE I128 vaddsi16(const I128& x, const I128& y) noexcept { return _mm_adds_epi16(x, y); } |
360 | BL_INLINE I128 vaddsu16(const I128& x, const I128& y) noexcept { return _mm_adds_epu16(x, y); } |
361 | |
362 | BL_INLINE I128 vsubi8(const I128& x, const I128& y) noexcept { return _mm_sub_epi8(x, y); } |
363 | BL_INLINE I128 vsubi16(const I128& x, const I128& y) noexcept { return _mm_sub_epi16(x, y); } |
364 | BL_INLINE I128 vsubi32(const I128& x, const I128& y) noexcept { return _mm_sub_epi32(x, y); } |
365 | BL_INLINE I128 vsubi64(const I128& x, const I128& y) noexcept { return _mm_sub_epi64(x, y); } |
366 | |
367 | BL_INLINE I128 vsubsi8(const I128& x, const I128& y) noexcept { return _mm_subs_epi8(x, y); } |
368 | BL_INLINE I128 vsubsu8(const I128& x, const I128& y) noexcept { return _mm_subs_epu8(x, y); } |
369 | BL_INLINE I128 vsubsi16(const I128& x, const I128& y) noexcept { return _mm_subs_epi16(x, y); } |
370 | BL_INLINE I128 vsubsu16(const I128& x, const I128& y) noexcept { return _mm_subs_epu16(x, y); } |
371 | |
372 | BL_INLINE I128 vmuli16(const I128& x, const I128& y) noexcept { return _mm_mullo_epi16(x, y); } |
373 | BL_INLINE I128 vmulu16(const I128& x, const I128& y) noexcept { return _mm_mullo_epi16(x, y); } |
374 | BL_INLINE I128 vmulhi16(const I128& x, const I128& y) noexcept { return _mm_mulhi_epi16(x, y); } |
375 | BL_INLINE I128 vmulhu16(const I128& x, const I128& y) noexcept { return _mm_mulhi_epu16(x, y); } |
376 | |
377 | template<uint8_t N_BITS> BL_INLINE I128 vslli16(const I128& x) noexcept { return _mm_slli_epi16(x, N_BITS); } |
378 | template<uint8_t N_BITS> BL_INLINE I128 vslli32(const I128& x) noexcept { return _mm_slli_epi32(x, N_BITS); } |
379 | template<uint8_t N_BITS> BL_INLINE I128 vslli64(const I128& x) noexcept { return _mm_slli_epi64(x, N_BITS); } |
380 | |
381 | template<uint8_t N_BITS> BL_INLINE I128 vsrli16(const I128& x) noexcept { return _mm_srli_epi16(x, N_BITS); } |
382 | template<uint8_t N_BITS> BL_INLINE I128 vsrli32(const I128& x) noexcept { return _mm_srli_epi32(x, N_BITS); } |
383 | template<uint8_t N_BITS> BL_INLINE I128 vsrli64(const I128& x) noexcept { return _mm_srli_epi64(x, N_BITS); } |
384 | |
385 | template<uint8_t N_BITS> BL_INLINE I128 vsrai16(const I128& x) noexcept { return _mm_srai_epi16(x, N_BITS); } |
386 | template<uint8_t N_BITS> BL_INLINE I128 vsrai32(const I128& x) noexcept { return _mm_srai_epi32(x, N_BITS); } |
387 | |
388 | template<uint8_t N_BYTES> BL_INLINE I128 vslli128b(const I128& x) noexcept { return _mm_slli_si128(x, N_BYTES); } |
389 | template<uint8_t N_BYTES> BL_INLINE I128 vsrli128b(const I128& x) noexcept { return _mm_srli_si128(x, N_BYTES); } |
390 | |
391 | #if defined(BL_TARGET_OPT_SSE4_1) |
392 | BL_INLINE I128 vmini8(const I128& x, const I128& y) noexcept { return _mm_min_epi8(x, y); } |
393 | BL_INLINE I128 vmaxi8(const I128& x, const I128& y) noexcept { return _mm_max_epi8(x, y); } |
394 | #else |
BL_INLINE I128 vmini8(const I128& x, const I128& y) noexcept { return vblendmask(x, y, _mm_cmpgt_epi8(x, y)); }
BL_INLINE I128 vmaxi8(const I128& x, const I128& y) noexcept { return vblendmask(y, x, _mm_cmpgt_epi8(x, y)); }
397 | #endif |
398 | |
399 | BL_INLINE I128 vminu8(const I128& x, const I128& y) noexcept { return _mm_min_epu8(x, y); } |
400 | BL_INLINE I128 vmaxu8(const I128& x, const I128& y) noexcept { return _mm_max_epu8(x, y); } |
401 | |
402 | BL_INLINE I128 vmini16(const I128& x, const I128& y) noexcept { return _mm_min_epi16(x, y); } |
403 | BL_INLINE I128 vmaxi16(const I128& x, const I128& y) noexcept { return _mm_max_epi16(x, y); } |
404 | |
405 | #if defined(BL_TARGET_OPT_SSE4_1) |
406 | BL_INLINE I128 vminu16(const I128& x, const I128& y) noexcept { return _mm_min_epu16(x, y); } |
407 | BL_INLINE I128 vmaxu16(const I128& x, const I128& y) noexcept { return _mm_max_epu16(x, y); } |
408 | #else |
409 | BL_INLINE I128 vminu16(const I128& x, const I128& y) noexcept { return _mm_sub_epi16(x, _mm_subs_epu16(x, y)); } |
BL_INLINE I128 vmaxu16(const I128& x, const I128& y) noexcept { return _mm_add_epi16(y, _mm_subs_epu16(x, y)); }
411 | #endif |
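
// The SSE2 fallbacks above exploit unsigned saturating subtraction:
// `subs_epu16(x, y)` yields `max(x - y, 0)`, hence `x - max(x - y, 0)` is
// `min(x, y)` and `y + max(x - y, 0)` is `max(x, y)`.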
412 | |
413 | #if defined(BL_TARGET_OPT_SSE4_1) |
414 | BL_INLINE I128 vmini32(const I128& x, const I128& y) noexcept { return _mm_min_epi32(x, y); } |
415 | BL_INLINE I128 vmaxi32(const I128& x, const I128& y) noexcept { return _mm_max_epi32(x, y); } |
416 | #else |
BL_INLINE I128 vmini32(const I128& x, const I128& y) noexcept { return vblendmask(x, y, _mm_cmpgt_epi32(x, y)); }
BL_INLINE I128 vmaxi32(const I128& x, const I128& y) noexcept { return vblendmask(y, x, _mm_cmpgt_epi32(x, y)); }
419 | #endif |
420 | |
421 | BL_INLINE I128 vcmpeqi8(const I128& x, const I128& y) noexcept { return _mm_cmpeq_epi8(x, y); } |
422 | BL_INLINE I128 vcmpgti8(const I128& x, const I128& y) noexcept { return _mm_cmpgt_epi8(x, y); } |
423 | |
424 | BL_INLINE I128 vcmpeqi16(const I128& x, const I128& y) noexcept { return _mm_cmpeq_epi16(x, y); } |
425 | BL_INLINE I128 vcmpgti16(const I128& x, const I128& y) noexcept { return _mm_cmpgt_epi16(x, y); } |
426 | |
427 | BL_INLINE I128 vcmpeqi32(const I128& x, const I128& y) noexcept { return _mm_cmpeq_epi32(x, y); } |
428 | BL_INLINE I128 vcmpgti32(const I128& x, const I128& y) noexcept { return _mm_cmpgt_epi32(x, y); } |
429 | |
430 | #if defined(BL_TARGET_OPT_SSSE3) |
431 | BL_INLINE I128 vabsi8(const I128& x) noexcept { return _mm_abs_epi8(x); } |
432 | BL_INLINE I128 vabsi16(const I128& x) noexcept { return _mm_abs_epi16(x); } |
433 | BL_INLINE I128 vabsi32(const I128& x) noexcept { return _mm_abs_epi32(x); } |
434 | #else |
435 | BL_INLINE I128 vabsi8(const I128& x) noexcept { return vminu8(vsubi8(vzeroi128(), x), x); } |
436 | BL_INLINE I128 vabsi16(const I128& x) noexcept { return vmaxi16(vsubi16(vzeroi128(), x), x); } |
437 | BL_INLINE I128 vabsi32(const I128& x) noexcept { I128 y = vsrai32<31>(x); return vsubi32(vxor(x, y), y); } |
438 | #endif |
439 | |
BL_INLINE I128 vloadi128_32(const void* p) noexcept { return _mm_cvtsi32_si128(int(*reinterpret_cast<const BLMisalignedUInt<uint32_t, 1>::T*>(p))); }
441 | BL_INLINE I128 vloadi128_64(const void* p) noexcept { return _mm_loadl_epi64(static_cast<const I128*>(p)); } |
442 | BL_INLINE I128 vloadi128a(const void* p) noexcept { return _mm_load_si128(static_cast<const I128*>(p)); } |
443 | BL_INLINE I128 vloadi128u(const void* p) noexcept { return _mm_loadu_si128(static_cast<const I128*>(p)); } |
444 | |
445 | BL_INLINE I128 vloadi128_l64(const I128& x, const void* p) noexcept { return vcast<I128>(_mm_loadl_pd(vcast<D128>(x), static_cast<const double*>(p))); } |
446 | BL_INLINE I128 vloadi128_h64(const I128& x, const void* p) noexcept { return vcast<I128>(_mm_loadh_pd(vcast<D128>(x), static_cast<const double*>(p))); } |
447 | |
448 | BL_INLINE void vstorei32(void* p, const I128& x) noexcept { static_cast<int*>(p)[0] = _mm_cvtsi128_si32(x); } |
449 | BL_INLINE void vstorei64(void* p, const I128& x) noexcept { _mm_storel_epi64(static_cast<I128*>(p), x); } |
450 | BL_INLINE void vstorei128a(void* p, const I128& x) noexcept { _mm_store_si128(static_cast<I128*>(p), x); } |
451 | BL_INLINE void vstorei128u(void* p, const I128& x) noexcept { _mm_storeu_si128(static_cast<I128*>(p), x); } |
452 | |
453 | BL_INLINE void vstoreli64(void* p, const I128& x) noexcept { _mm_storel_epi64(static_cast<I128*>(p), x); } |
454 | BL_INLINE void vstorehi64(void* p, const I128& x) noexcept { _mm_storeh_pd(static_cast<double*>(p), vcast<D128>(x)); } |
455 | |
456 | BL_INLINE bool vhasmaski8(const I128& x, int bits0_15) noexcept { return _mm_movemask_epi8(vcast<I128>(x)) == bits0_15; } |
457 | BL_INLINE bool vhasmaski8(const F128& x, int bits0_15) noexcept { return _mm_movemask_epi8(vcast<I128>(x)) == bits0_15; } |
458 | BL_INLINE bool vhasmaski8(const D128& x, int bits0_15) noexcept { return _mm_movemask_epi8(vcast<I128>(x)) == bits0_15; } |
459 | |
460 | BL_INLINE bool vhasmaski32(const I128& x, int bits0_3) noexcept { return _mm_movemask_ps(vcast<F128>(x)) == bits0_3; } |
461 | BL_INLINE bool vhasmaski64(const I128& x, int bits0_1) noexcept { return _mm_movemask_pd(vcast<D128>(x)) == bits0_1; } |
462 | |
463 | BL_INLINE I128 vdiv255u16(const I128& x) noexcept { |
464 | I128 y = vaddi16(x, v_const_as<I128>(blCommonTable.i128_0080008000800080)); |
465 | return vmulhu16(y, v_const_as<I128>(blCommonTable.i128_0101010101010101)); |
466 | } |
467 | #endif |
468 | |
469 | // ============================================================================ |
470 | // [BLSIMD - F128] |
471 | // ============================================================================ |
472 | |
473 | #if defined(BL_TARGET_OPT_SSE) |
474 | BL_INLINE F128 vzerof128() noexcept { return _mm_setzero_ps(); } |
475 | |
476 | BL_INLINE F128 vsetf128(float x) noexcept { return _mm_set1_ps(x); } |
477 | BL_INLINE F128 vsetf128(float x3, float x2, float x1, float x0) noexcept { return _mm_set_ps(x3, x2, x1, x0); } |
478 | |
479 | //! Cast a scalar `float` to `F128` vector type. |
480 | BL_INLINE F128 vcvtf32f128(float x) noexcept { |
481 | #if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) |
482 | // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70708 |
483 | F128 reg; |
484 | __asm__("" : "=x" (reg) : "0" (x)); |
485 | return reg; |
486 | #else |
487 | return _mm_set_ss(x); |
488 | #endif |
489 | } |
490 | BL_INLINE float vcvtf128f32(const F128& x) noexcept { return _mm_cvtss_f32(x); } |
491 | |
492 | BL_INLINE F128 vcvti32f128(int32_t x) noexcept { return _mm_cvtsi32_ss(vzerof128(), x); } |
493 | BL_INLINE int32_t vcvtf128i32(const F128& x) noexcept { return _mm_cvtss_si32(x); } |
494 | BL_INLINE int32_t vcvttf128i32(const F128& x) noexcept { return _mm_cvttss_si32(x); } |
495 | |
496 | #if BL_TARGET_ARCH_BITS >= 64 |
497 | BL_INLINE F128 vcvti64f128(int64_t x) noexcept { return _mm_cvtsi64_ss(vzerof128(), x); } |
498 | BL_INLINE int64_t vcvtf128i64(const F128& x) noexcept { return _mm_cvtss_si64(x); } |
499 | BL_INLINE int64_t vcvttf128i64(const F128& x) noexcept { return _mm_cvttss_si64(x); } |
500 | #endif |
501 | |
502 | template<int A, int B, int C, int D> |
503 | BL_INLINE F128 vshuff32(const F128& x, const F128& y) noexcept { return _mm_shuffle_ps(x, y, _MM_SHUFFLE(A, B, C, D)); } |
504 | |
505 | template<int A, int B, int C, int D> |
506 | BL_INLINE F128 vswizf32(const F128& x) noexcept { |
507 | #if defined(BL_TARGET_OPT_SSE2) && !defined(BL_TARGET_OPT_AVX) |
508 | return vcast<F128>(vswizi32<A, B, C, D>(vcast<I128>(x))); |
509 | #else |
510 | return vshuff32<A, B, C, D>(x, x); |
511 | #endif |
512 | } |
513 | |
514 | template<int A, int B> |
515 | BL_INLINE F128 vswizf64(const F128& x) noexcept { |
516 | #if defined(BL_TARGET_OPT_SSE2) && !defined(BL_TARGET_OPT_AVX) |
517 | return vcast<F128>(vswizi64<A, B>(vcast<I128>(x))); |
518 | #else |
519 | return vswizf32<A*2 + 1, A*2, B*2 + 1, B*2>(x); |
520 | #endif |
521 | } |
522 | |
523 | BL_INLINE F128 vduplf32(const F128& x) noexcept { return vswizf32<2, 2, 0, 0>(x); } |
524 | BL_INLINE F128 vduphf32(const F128& x) noexcept { return vswizf32<3, 3, 1, 1>(x); } |
525 | |
526 | BL_INLINE F128 vswapf64(const F128& x) noexcept { return vswizf64<0, 1>(x); } |
527 | BL_INLINE F128 vduplf64(const F128& x) noexcept { return vswizf64<0, 0>(x); } |
528 | BL_INLINE F128 vduphf64(const F128& x) noexcept { return vswizf64<1, 1>(x); } |
529 | |
530 | BL_INLINE F128 vunpacklf32(const F128& x, const F128& y) noexcept { return _mm_unpacklo_ps(x, y); } |
531 | BL_INLINE F128 vunpackhf32(const F128& x, const F128& y) noexcept { return _mm_unpackhi_ps(x, y); } |
532 | |
533 | BL_INLINE F128 vor(const F128& x, const F128& y) noexcept { return _mm_or_ps(x, y); } |
534 | BL_INLINE F128 vxor(const F128& x, const F128& y) noexcept { return _mm_xor_ps(x, y); } |
535 | BL_INLINE F128 vand(const F128& x, const F128& y) noexcept { return _mm_and_ps(x, y); } |
536 | BL_INLINE F128 vandnot_a(const F128& x, const F128& y) noexcept { return _mm_andnot_ps(x, y); } |
537 | BL_INLINE F128 vandnot_b(const F128& x, const F128& y) noexcept { return _mm_andnot_ps(y, x); } |
538 | BL_INLINE F128 vblendmask(const F128& x, const F128& y, const F128& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); } |
539 | |
540 | BL_INLINE F128 vaddss(const F128& x, const F128& y) noexcept { return _mm_add_ss(x, y); } |
541 | BL_INLINE F128 vaddps(const F128& x, const F128& y) noexcept { return _mm_add_ps(x, y); } |
542 | |
543 | BL_INLINE F128 vsubss(const F128& x, const F128& y) noexcept { return _mm_sub_ss(x, y); } |
544 | BL_INLINE F128 vsubps(const F128& x, const F128& y) noexcept { return _mm_sub_ps(x, y); } |
545 | |
546 | BL_INLINE F128 vmulss(const F128& x, const F128& y) noexcept { return _mm_mul_ss(x, y); } |
547 | BL_INLINE F128 vmulps(const F128& x, const F128& y) noexcept { return _mm_mul_ps(x, y); } |
548 | |
549 | BL_INLINE F128 vdivss(const F128& x, const F128& y) noexcept { return _mm_div_ss(x, y); } |
550 | BL_INLINE F128 vdivps(const F128& x, const F128& y) noexcept { return _mm_div_ps(x, y); } |
551 | |
552 | BL_INLINE F128 vminss(const F128& x, const F128& y) noexcept { return _mm_min_ss(x, y); } |
553 | BL_INLINE F128 vminps(const F128& x, const F128& y) noexcept { return _mm_min_ps(x, y); } |
554 | |
555 | BL_INLINE F128 vmaxss(const F128& x, const F128& y) noexcept { return _mm_max_ss(x, y); } |
556 | BL_INLINE F128 vmaxps(const F128& x, const F128& y) noexcept { return _mm_max_ps(x, y); } |
557 | |
558 | BL_INLINE F128 vcmpeqss(const F128& x, const F128& y) noexcept { return _mm_cmpeq_ss(x, y); } |
559 | BL_INLINE F128 vcmpeqps(const F128& x, const F128& y) noexcept { return _mm_cmpeq_ps(x, y); } |
560 | |
561 | BL_INLINE F128 vcmpness(const F128& x, const F128& y) noexcept { return _mm_cmpneq_ss(x, y); } |
562 | BL_INLINE F128 vcmpneps(const F128& x, const F128& y) noexcept { return _mm_cmpneq_ps(x, y); } |
563 | |
564 | BL_INLINE F128 vcmpgess(const F128& x, const F128& y) noexcept { return _mm_cmpge_ss(x, y); } |
565 | BL_INLINE F128 vcmpgeps(const F128& x, const F128& y) noexcept { return _mm_cmpge_ps(x, y); } |
566 | |
567 | BL_INLINE F128 vcmpgtss(const F128& x, const F128& y) noexcept { return _mm_cmpgt_ss(x, y); } |
568 | BL_INLINE F128 vcmpgtps(const F128& x, const F128& y) noexcept { return _mm_cmpgt_ps(x, y); } |
569 | |
570 | BL_INLINE F128 vcmpless(const F128& x, const F128& y) noexcept { return _mm_cmple_ss(x, y); } |
571 | BL_INLINE F128 vcmpleps(const F128& x, const F128& y) noexcept { return _mm_cmple_ps(x, y); } |
572 | |
573 | BL_INLINE F128 vcmpltss(const F128& x, const F128& y) noexcept { return _mm_cmplt_ss(x, y); } |
574 | BL_INLINE F128 vcmpltps(const F128& x, const F128& y) noexcept { return _mm_cmplt_ps(x, y); } |
575 | |
576 | BL_INLINE F128 vsqrtss(const F128& x) noexcept { return _mm_sqrt_ss(x); } |
577 | BL_INLINE F128 vsqrtps(const F128& x) noexcept { return _mm_sqrt_ps(x); } |
578 | |
579 | BL_INLINE F128 vloadf128_32(const void* p) noexcept { return _mm_load_ss(static_cast<const float*>(p)); } |
580 | BL_INLINE F128 vloadf128_64(const void* p) noexcept { return vcast<F128>(vloadi128_64(p)); } |
581 | |
582 | BL_INLINE F128 vloadf128a(const void* p) noexcept { return _mm_load_ps(static_cast<const float*>(p)); } |
583 | BL_INLINE F128 vloadf128u(const void* p) noexcept { return _mm_loadu_ps(static_cast<const float*>(p)); } |
584 | |
585 | BL_INLINE F128 vloadf128_l64(const F128& x, const void* p) noexcept { return _mm_loadl_pi(x, static_cast<const __m64*>(p)); } |
586 | BL_INLINE F128 vloadf128_h64(const F128& x, const void* p) noexcept { return _mm_loadh_pi(x, static_cast<const __m64*>(p)); } |
587 | |
588 | BL_INLINE void vstoref32(void* p, const F128& x) noexcept { _mm_store_ss(static_cast<float*>(p), x); } |
589 | BL_INLINE void vstoref64(void* p, const F128& x) noexcept { _mm_storel_pi(static_cast<__m64*>(p), x); } |
590 | BL_INLINE void vstorelf64(void* p, const F128& x) noexcept { _mm_storel_pi(static_cast<__m64*>(p), x); } |
591 | BL_INLINE void vstorehf64(void* p, const F128& x) noexcept { _mm_storeh_pi(static_cast<__m64*>(p), x); } |
592 | BL_INLINE void vstoref128a(void* p, const F128& x) noexcept { _mm_store_ps(static_cast<float*>(p), x); } |
593 | BL_INLINE void vstoref128u(void* p, const F128& x) noexcept { _mm_storeu_ps(static_cast<float*>(p), x); } |
594 | |
595 | BL_INLINE F128 vbroadcastf128_64(const void* p) noexcept { |
596 | #if defined(BL_TARGET_OPT_SSE3) |
597 | return vcast<F128>(_mm_loaddup_pd(static_cast<const double*>(p))); |
598 | #else |
599 | return vduplf64(vloadf128_64(p)); |
600 | #endif |
601 | } |
602 | |
603 | BL_INLINE bool vhasmaskf32(const F128& x, int bits0_3) noexcept { return _mm_movemask_ps(vcast<F128>(x)) == bits0_3; } |
BL_INLINE bool vhasmaskf64(const F128& x, int bits0_1) noexcept { return _mm_movemask_pd(vcast<D128>(x)) == bits0_1; }
#endif

606 | // ============================================================================ |
607 | // [BLSIMD - D128] |
608 | // ============================================================================ |
609 | |
610 | #if defined(BL_TARGET_OPT_SSE2) |
611 | BL_INLINE D128 vzerod128() noexcept { return _mm_setzero_pd(); } |
612 | |
613 | BL_INLINE D128 vsetd128(double x) noexcept { return _mm_set1_pd(x); } |
614 | BL_INLINE D128 vsetd128(double x1, double x0) noexcept { return _mm_set_pd(x1, x0); } |
615 | |
616 | //! Cast a scalar `double` to `D128` vector type. |
617 | BL_INLINE D128 vcvtd64d128(double x) noexcept { |
618 | #if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) |
619 | // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70708 |
620 | D128 reg; |
621 | __asm__("" : "=x" (reg) : "0" (x)); |
622 | return reg; |
623 | #else |
624 | return _mm_set_sd(x); |
625 | #endif |
626 | } |
627 | BL_INLINE double vcvtd128d64(const D128& x) noexcept { return _mm_cvtsd_f64(x); } |
628 | |
629 | BL_INLINE D128 vcvti32d128(int32_t x) noexcept { return _mm_cvtsi32_sd(vzerod128(), x); } |
630 | BL_INLINE int32_t vcvtd128i32(const D128& x) noexcept { return _mm_cvtsd_si32(x); } |
631 | BL_INLINE int32_t vcvttd128i32(const D128& x) noexcept { return _mm_cvttsd_si32(x); } |
632 | |
633 | #if BL_TARGET_ARCH_BITS >= 64 |
634 | BL_INLINE D128 vcvti64d128(int64_t x) noexcept { return _mm_cvtsi64_sd(vzerod128(), x); } |
635 | BL_INLINE int64_t vcvtd128i64(const D128& x) noexcept { return _mm_cvtsd_si64(x); } |
636 | BL_INLINE int64_t vcvttd128i64(const D128& x) noexcept { return _mm_cvttsd_si64(x); } |
637 | #endif |
638 | |
639 | BL_INLINE D128 vcvtf128d128(const F128& x) noexcept { return _mm_cvtps_pd(x); } |
640 | BL_INLINE F128 vcvtd128f128(const D128& x) noexcept { return _mm_cvtpd_ps(x); } |
641 | |
642 | BL_INLINE F128 vcvti128f128(const I128& x) noexcept { return _mm_cvtepi32_ps(x); } |
643 | BL_INLINE D128 vcvti128d128(const I128& x) noexcept { return _mm_cvtepi32_pd(x); } |
644 | |
645 | BL_INLINE I128 vcvtf128i128(const F128& x) noexcept { return _mm_cvtps_epi32(x); } |
646 | BL_INLINE I128 vcvttf128i128(const F128& x) noexcept { return _mm_cvttps_epi32(x); } |
647 | |
648 | BL_INLINE I128 vcvtd128i128(const D128& x) noexcept { return _mm_cvtpd_epi32(x); } |
649 | BL_INLINE I128 vcvttd128i128(const D128& x) noexcept { return _mm_cvttpd_epi32(x); } |
650 | |
651 | template<int A, int B> |
652 | BL_INLINE D128 vshufd64(const D128& x, const D128& y) noexcept { return _mm_shuffle_pd(x, y, (A << 1) | B); } |
653 | |
654 | template<int A, int B> |
655 | BL_INLINE D128 vswizd64(const D128& x) noexcept { |
656 | #if !defined(BL_TARGET_OPT_AVX) |
657 | return vcast<D128>(vswizi64<A, B>(vcast<I128>(x))); |
658 | #else |
659 | return vshufd64<A, B>(x, x); |
660 | #endif |
661 | } |
662 | |
663 | BL_INLINE D128 vswapd64(const D128& x) noexcept { return vswizd64<0, 1>(x); } |
664 | BL_INLINE D128 vdupld64(const D128& x) noexcept { return vswizd64<0, 0>(x); } |
665 | BL_INLINE D128 vduphd64(const D128& x) noexcept { return vswizd64<1, 1>(x); } |
666 | |
667 | BL_INLINE D128 vunpackld64(const D128& x, const D128& y) noexcept { return _mm_unpacklo_pd(x, y); } |
668 | BL_INLINE D128 vunpackhd64(const D128& x, const D128& y) noexcept { return _mm_unpackhi_pd(x, y); } |
669 | |
670 | BL_INLINE D128 vor(const D128& x, const D128& y) noexcept { return _mm_or_pd(x, y); } |
671 | BL_INLINE D128 vxor(const D128& x, const D128& y) noexcept { return _mm_xor_pd(x, y); } |
672 | BL_INLINE D128 vand(const D128& x, const D128& y) noexcept { return _mm_and_pd(x, y); } |
673 | BL_INLINE D128 vandnot_a(const D128& x, const D128& y) noexcept { return _mm_andnot_pd(x, y); } |
674 | BL_INLINE D128 vandnot_b(const D128& x, const D128& y) noexcept { return _mm_andnot_pd(y, x); } |
675 | BL_INLINE D128 vblendmask(const D128& x, const D128& y, const D128& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); } |
676 | |
677 | BL_INLINE D128 vaddsd(const D128& x, const D128& y) noexcept { return _mm_add_sd(x, y); } |
678 | BL_INLINE D128 vaddpd(const D128& x, const D128& y) noexcept { return _mm_add_pd(x, y); } |
679 | |
680 | BL_INLINE D128 vsubsd(const D128& x, const D128& y) noexcept { return _mm_sub_sd(x, y); } |
681 | BL_INLINE D128 vsubpd(const D128& x, const D128& y) noexcept { return _mm_sub_pd(x, y); } |
682 | |
683 | BL_INLINE D128 vmulsd(const D128& x, const D128& y) noexcept { return _mm_mul_sd(x, y); } |
684 | BL_INLINE D128 vmulpd(const D128& x, const D128& y) noexcept { return _mm_mul_pd(x, y); } |
685 | |
686 | BL_INLINE D128 vdivsd(const D128& x, const D128& y) noexcept { return _mm_div_sd(x, y); } |
687 | BL_INLINE D128 vdivpd(const D128& x, const D128& y) noexcept { return _mm_div_pd(x, y); } |
688 | |
689 | BL_INLINE D128 vminsd(const D128& x, const D128& y) noexcept { return _mm_min_sd(x, y); } |
690 | BL_INLINE D128 vminpd(const D128& x, const D128& y) noexcept { return _mm_min_pd(x, y); } |
691 | |
692 | BL_INLINE D128 vmaxsd(const D128& x, const D128& y) noexcept { return _mm_max_sd(x, y); } |
693 | BL_INLINE D128 vmaxpd(const D128& x, const D128& y) noexcept { return _mm_max_pd(x, y); } |
694 | |
695 | BL_INLINE D128 vcmpeqsd(const D128& x, const D128& y) noexcept { return _mm_cmpeq_sd(x, y); } |
696 | BL_INLINE D128 vcmpeqpd(const D128& x, const D128& y) noexcept { return _mm_cmpeq_pd(x, y); } |
697 | |
698 | BL_INLINE D128 vcmpnesd(const D128& x, const D128& y) noexcept { return _mm_cmpneq_sd(x, y); } |
699 | BL_INLINE D128 vcmpnepd(const D128& x, const D128& y) noexcept { return _mm_cmpneq_pd(x, y); } |
700 | |
701 | BL_INLINE D128 vcmpgesd(const D128& x, const D128& y) noexcept { return _mm_cmpge_sd(x, y); } |
702 | BL_INLINE D128 vcmpgepd(const D128& x, const D128& y) noexcept { return _mm_cmpge_pd(x, y); } |
703 | |
704 | BL_INLINE D128 vcmpgtsd(const D128& x, const D128& y) noexcept { return _mm_cmpgt_sd(x, y); } |
705 | BL_INLINE D128 vcmpgtpd(const D128& x, const D128& y) noexcept { return _mm_cmpgt_pd(x, y); } |
706 | |
707 | BL_INLINE D128 vcmplesd(const D128& x, const D128& y) noexcept { return _mm_cmple_sd(x, y); } |
708 | BL_INLINE D128 vcmplepd(const D128& x, const D128& y) noexcept { return _mm_cmple_pd(x, y); } |
709 | |
710 | BL_INLINE D128 vcmpltsd(const D128& x, const D128& y) noexcept { return _mm_cmplt_sd(x, y); } |
711 | BL_INLINE D128 vcmpltpd(const D128& x, const D128& y) noexcept { return _mm_cmplt_pd(x, y); } |
712 | |
713 | BL_INLINE D128 vsqrtsd(const D128& x) noexcept { return _mm_sqrt_sd(x, x); } |
714 | BL_INLINE D128 vsqrtpd(const D128& x) noexcept { return _mm_sqrt_pd(x); } |
715 | |
716 | BL_INLINE D128 vloadd128_64(const void* p) noexcept { return _mm_load_sd(static_cast<const double*>(p)); } |
717 | BL_INLINE D128 vloadd128a(const void* p) noexcept { return _mm_load_pd(static_cast<const double*>(p)); } |
718 | BL_INLINE D128 vloadd128u(const void* p) noexcept { return _mm_loadu_pd(static_cast<const double*>(p)); } |
719 | |
720 | BL_INLINE D128 vloadd128_l64(const D128& x, const void* p) noexcept { return _mm_loadl_pd(x, static_cast<const double*>(p)); } |
721 | BL_INLINE D128 vloadd128_h64(const D128& x, const void* p) noexcept { return _mm_loadh_pd(x, static_cast<const double*>(p)); } |
722 | |
723 | BL_INLINE D128 vbroadcastd128_64(const void* p) noexcept { |
724 | #if defined(BL_TARGET_OPT_SSE3) |
725 | return _mm_loaddup_pd(static_cast<const double*>(p)); |
726 | #else |
727 | return vdupld64(vloadd128_64(p)); |
728 | #endif |
729 | } |
730 | |
731 | BL_INLINE void vstored64(void* p, const D128& x) noexcept { _mm_store_sd(static_cast<double*>(p), x); } |
732 | BL_INLINE void vstoreld64(void* p, const D128& x) noexcept { _mm_storel_pd(static_cast<double*>(p), x); } |
733 | BL_INLINE void vstorehd64(void* p, const D128& x) noexcept { _mm_storeh_pd(static_cast<double*>(p), x); } |
734 | BL_INLINE void vstored128a(void* p, const D128& x) noexcept { _mm_store_pd(static_cast<double*>(p), x); } |
735 | BL_INLINE void vstored128u(void* p, const D128& x) noexcept { _mm_storeu_pd(static_cast<double*>(p), x); } |
736 | |
737 | BL_INLINE bool vhasmaskd64(const D128& x, int bits0_1) noexcept { return _mm_movemask_pd(vcast<D128>(x)) == bits0_1; } |
738 | #endif |
739 | |
740 | // ============================================================================ |
// [BLSIMD - I256]
742 | // ============================================================================ |
743 | |
744 | #if defined(BL_TARGET_OPT_AVX) |
745 | BL_INLINE I256 vzeroi256() noexcept { return _mm256_setzero_si256(); } |
746 | |
747 | BL_INLINE F256 vcvti256f256(const I256& x) noexcept { return _mm256_cvtepi32_ps(x); } |
748 | BL_INLINE D256 vcvti128d256(const I128& x) noexcept { return _mm256_cvtepi32_pd(vcast<I128>(x)); } |
749 | BL_INLINE D256 vcvti256d256(const I256& x) noexcept { return _mm256_cvtepi32_pd(vcast<I128>(x)); } |
750 | #endif |
751 | |
752 | #if defined(BL_TARGET_OPT_AVX2) |
753 | BL_INLINE I256 vseti256i8(int8_t x) noexcept { return _mm256_set1_epi8(x); } |
754 | BL_INLINE I256 vseti256i16(int16_t x) noexcept { return _mm256_set1_epi16(x); } |
755 | |
756 | BL_INLINE I256 vseti256i32(int32_t x) noexcept { return _mm256_set1_epi32(x); } |
757 | BL_INLINE I256 vseti256i32(int32_t x1, int32_t x0) noexcept { return _mm256_set_epi32(x1, x0, x1, x0, x1, x0, x1, x0); } |
758 | BL_INLINE I256 vseti256i32(int32_t x3, int32_t x2, int32_t x1, int32_t x0) noexcept { return _mm256_set_epi32(x3, x2, x1, x0, x3, x2, x1, x0); } |
759 | BL_INLINE I256 vseti256i32(int32_t x7, int32_t x6, int32_t x5, int32_t x4, int32_t x3, int32_t x2, int32_t x1, int32_t x0) noexcept { return _mm256_set_epi32(x7, x6, x5, x4, x3, x2, x1, x0); } |
760 | |
761 | BL_INLINE I256 vseti256i64(int64_t x) noexcept { |
762 | #if BL_TARGET_ARCH_BITS >= 64 |
763 | return _mm256_set1_epi64x(x); |
764 | #else |
765 | return vseti256i32(int32_t(uint64_t(x) >> 32), int32_t(x & 0xFFFFFFFFu)); |
766 | #endif |
767 | } |
768 | |
769 | BL_INLINE I256 vseti256i64(int64_t x1, int64_t x0) noexcept { |
770 | return vseti256i32(int32_t(uint64_t(x1) >> 32), int32_t(x1 & 0xFFFFFFFFu), |
771 | int32_t(uint64_t(x0) >> 32), int32_t(x0 & 0xFFFFFFFFu), |
772 | int32_t(uint64_t(x1) >> 32), int32_t(x1 & 0xFFFFFFFFu), |
773 | int32_t(uint64_t(x0) >> 32), int32_t(x0 & 0xFFFFFFFFu)); |
774 | } |
775 | |
776 | BL_INLINE I256 vseti256i64(int64_t x3, int64_t x2, int64_t x1, int64_t x0) noexcept { |
777 | return vseti256i32(int32_t(uint64_t(x3) >> 32), int32_t(x3 & 0xFFFFFFFFu), |
778 | int32_t(uint64_t(x2) >> 32), int32_t(x2 & 0xFFFFFFFFu), |
779 | int32_t(uint64_t(x1) >> 32), int32_t(x1 & 0xFFFFFFFFu), |
780 | int32_t(uint64_t(x0) >> 32), int32_t(x0 & 0xFFFFFFFFu)); |
781 | } |
782 | |
783 | BL_INLINE I256 vseti256u8(uint8_t x) noexcept { return vseti256i8(int8_t(x)); } |
784 | BL_INLINE I256 vseti256u16(uint16_t x) noexcept { return vseti256i16(int16_t(x)); } |
785 | BL_INLINE I256 vseti256u32(uint32_t x) noexcept { return vseti256i32(int32_t(x)); } |
786 | BL_INLINE I256 vseti256u64(uint64_t x) noexcept { return vseti256i64(int64_t(x)); } |
787 | |
788 | BL_INLINE I256 vseti256u32(uint32_t x1, uint32_t x0) noexcept { |
789 | return vseti256i32(int32_t(x1), int32_t(x0), int32_t(x1), int32_t(x0), |
790 | int32_t(x1), int32_t(x0), int32_t(x1), int32_t(x0)); |
791 | } |
792 | |
793 | BL_INLINE I256 vseti256u32(uint32_t x3, uint32_t x2, uint32_t x1, uint32_t x0) noexcept { |
794 | return vseti256i32(int32_t(x3), int32_t(x2), int32_t(x1), int32_t(x0), |
795 | int32_t(x3), int32_t(x2), int32_t(x1), int32_t(x0)); |
796 | } |
797 | |
798 | BL_INLINE I256 vseti256u32(uint32_t x7, uint32_t x6, uint32_t x5, uint32_t x4, uint32_t x3, uint32_t x2, uint32_t x1, uint32_t x0) noexcept { |
799 | return vseti256i32(int32_t(x7), int32_t(x6), int32_t(x5), int32_t(x4), |
800 | int32_t(x3), int32_t(x2), int32_t(x1), int32_t(x0)); |
801 | } |
802 | |
803 | BL_INLINE I256 vseti256u64(uint64_t x1, uint64_t x0) noexcept { |
804 | return vseti256i64(int64_t(x1), int64_t(x0)); |
805 | } |
806 | |
807 | BL_INLINE I256 vseti256u64(uint64_t x3, uint64_t x2, uint64_t x1, uint64_t x0) noexcept { |
808 | return vseti256i64(int64_t(x3), int64_t(x2), int64_t(x1), int64_t(x0)); |
809 | } |
810 | |
811 | BL_INLINE I256 vcvti32i256(int32_t x) noexcept { return vcast<I256>(vcvti32i128(x)); } |
812 | BL_INLINE I256 vcvtu32i256(uint32_t x) noexcept { return vcast<I256>(vcvtu32i128(x)); } |
813 | |
814 | BL_INLINE int32_t vcvti256i32(const I256& x) noexcept { return vcvti128i32(vcast<I128>(x)); } |
815 | BL_INLINE uint32_t vcvti256u32(const I256& x) noexcept { return vcvti128u32(vcast<I128>(x)); } |
816 | |
817 | BL_INLINE I256 vcvti64i256(int64_t x) noexcept { return vcast<I256>(vcvti64i128(x)); } |
818 | BL_INLINE I256 vcvtu64i256(uint64_t x) noexcept { return vcast<I256>(vcvtu64i128(x)); } |
819 | |
820 | BL_INLINE int64_t vcvti256i64(const I256& x) noexcept { return vcvti128i64(vcast<I128>(x)); } |
821 | BL_INLINE uint64_t vcvti256u64(const I256& x) noexcept { return vcvti128u64(vcast<I128>(x)); } |
822 | |
823 | template<int A, int B> |
824 | BL_INLINE I256 vpermi128(const I256& x, const I256& y) noexcept { return _mm256_permute2x128_si256(x, y, ((A & 0xF) << 4) + (B & 0xF)); } |
825 | template<int A, int B> |
826 | BL_INLINE I256 vpermi128(const I256& x) noexcept { return vpermi128<A, B>(x, x); } |
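
// In vpermi128<A, B>, A selects the destination's high 128-bit half and B the
// low one: selectors 0-1 pick the halves of `x`, 2-3 the halves of `y`, and
// setting bit 3 (value 8) zeroes the half entirely.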
827 | |
828 | template<uint8_t A, uint8_t B, uint8_t C, uint8_t D> |
829 | BL_INLINE I256 vswizli16(const I256& x) noexcept { return _mm256_shufflelo_epi16(x, _MM_SHUFFLE(A, B, C, D)); } |
830 | template<uint8_t A, uint8_t B, uint8_t C, uint8_t D> |
831 | BL_INLINE I256 vswizhi16(const I256& x) noexcept { return _mm256_shufflehi_epi16(x, _MM_SHUFFLE(A, B, C, D)); } |
832 | |
833 | template<uint8_t A, uint8_t B, uint8_t C, uint8_t D> |
834 | BL_INLINE I256 vswizi16(const I256& x) noexcept { return vswizhi16<A, B, C, D>(vswizli16<A, B, C, D>(x)); } |
835 | template<uint8_t A, uint8_t B, uint8_t C, uint8_t D> |
836 | BL_INLINE I256 vswizi32(const I256& x) noexcept { return _mm256_shuffle_epi32(x, _MM_SHUFFLE(A, B, C, D)); } |
837 | template<int A, int B> |
838 | BL_INLINE I256 vswizi64(const I256& x) noexcept { return vswizi32<A*2 + 1, A*2, B*2 + 1, B*2>(x); } |
839 | |
840 | BL_INLINE I256 vpshufb(const I256& x, const I256& y) noexcept { return _mm256_shuffle_epi8(x, y); } |
841 | |
842 | template<int N_BYTES> |
843 | BL_INLINE I256 vpalignr(const I256& x, const I256& y) noexcept { return _mm256_alignr_epi8(x, y, N_BYTES); } |
844 | |
845 | BL_INLINE I256 vsplati8i256(const I128& x) noexcept { return _mm256_broadcastb_epi8(vcast<I128>(x)); } |
846 | BL_INLINE I256 vsplati8i256(const I256& x) noexcept { return _mm256_broadcastb_epi8(vcast<I128>(x)); } |
847 | |
848 | BL_INLINE I256 vsplati16i256(const I128& x) noexcept { return _mm256_broadcastw_epi16(vcast<I128>(x)); } |
849 | BL_INLINE I256 vsplati16i256(const I256& x) noexcept { return _mm256_broadcastw_epi16(vcast<I128>(x)); } |
850 | |
851 | BL_INLINE I256 vsplati32i256(const I128& x) noexcept { return _mm256_broadcastd_epi32(vcast<I128>(x)); } |
852 | BL_INLINE I256 vsplati32i256(const I256& x) noexcept { return _mm256_broadcastd_epi32(vcast<I128>(x)); } |
853 | |
854 | BL_INLINE I256 vsplati64i256(const I128& x) noexcept { return _mm256_broadcastq_epi64(vcast<I128>(x)); } |
855 | BL_INLINE I256 vsplati64i256(const I256& x) noexcept { return _mm256_broadcastq_epi64(vcast<I128>(x)); } |
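
// The `vpbroadcast*` instructions read only the lowest element of their
// source, so the I256 overloads above simply re-type the register via vcast.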
856 | |
857 | BL_INLINE I256 vswapi64(const I256& x) noexcept { return vswizi64<0, 1>(x); } |
858 | BL_INLINE I256 vdupli64(const I256& x) noexcept { return vswizi64<0, 0>(x); } |
859 | BL_INLINE I256 vduphi64(const I256& x) noexcept { return vswizi64<1, 1>(x); } |
860 | |
861 | BL_INLINE I256 vswapi128(const I256& x) noexcept { return vpermi128<0, 1>(x); } |
862 | BL_INLINE I256 vdupli128(const I128& x) noexcept { return vpermi128<0, 0>(vcast<I256>(x)); } |
863 | BL_INLINE I256 vdupli128(const I256& x) noexcept { return vpermi128<0, 0>(x); } |
864 | BL_INLINE I256 vduphi128(const I256& x) noexcept { return vpermi128<1, 1>(x); } |
865 | |
866 | BL_INLINE I256 vmovli128u8u16(const I128& x) noexcept { return _mm256_cvtepu8_epi16(x); } |
867 | BL_INLINE I256 vmovli128u8u32(const I128& x) noexcept { return _mm256_cvtepu8_epi32(x); } |
868 | BL_INLINE I256 vmovli128u8u64(const I128& x) noexcept { return _mm256_cvtepu8_epi64(x); } |
869 | BL_INLINE I256 vmovli128u16u32(const I128& x) noexcept { return _mm256_cvtepu16_epi32(x); } |
870 | BL_INLINE I256 vmovli128u16u64(const I128& x) noexcept { return _mm256_cvtepu16_epi64(x); } |
871 | BL_INLINE I256 vmovli128u32u64(const I128& x) noexcept { return _mm256_cvtepu32_epi64(x); } |
872 | |
873 | BL_INLINE I256 vpacki16i8(const I256& x, const I256& y) noexcept { return _mm256_packs_epi16(x, y); } |
874 | BL_INLINE I256 vpacki16u8(const I256& x, const I256& y) noexcept { return _mm256_packus_epi16(x, y); } |
875 | BL_INLINE I256 vpacki32i16(const I256& x, const I256& y) noexcept { return _mm256_packs_epi32(x, y); } |
876 | BL_INLINE I256 vpacki32u16(const I256& x, const I256& y) noexcept { return _mm256_packus_epi32(x, y); } |
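
// Note that AVX2 pack, unpack, and byte-shuffle instructions operate on each
// 128-bit lane independently, so these are not a simple widening of the
// 128-bit versions; callers must account for the in-lane element order.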
877 | |
878 | BL_INLINE I256 vpacki16i8(const I256& x) noexcept { return vpacki16i8(x, x); } |
879 | BL_INLINE I256 vpacki16u8(const I256& x) noexcept { return vpacki16u8(x, x); } |
880 | BL_INLINE I256 vpacki32i16(const I256& x) noexcept { return vpacki32i16(x, x); } |
881 | BL_INLINE I256 vpacki32u16(const I256& x) noexcept { return vpacki32u16(x, x); } |
882 | |
883 | BL_INLINE I256 vpacki32i8(const I256& x) noexcept { return vpacki16i8(vpacki32i16(x)); } |
884 | BL_INLINE I256 vpacki32i8(const I256& x, const I256& y) noexcept { return vpacki16i8(vpacki32i16(x, y)); } |
885 | BL_INLINE I256 vpacki32i8(const I256& x, const I256& y, const I256& z, const I256& w) noexcept { return vpacki16i8(vpacki32i16(x, y), vpacki32i16(z, w)); } |
886 | |
887 | BL_INLINE I256 vpacki32u8(const I256& x) noexcept { return vpacki16u8(vpacki32i16(x)); } |
888 | BL_INLINE I256 vpacki32u8(const I256& x, const I256& y) noexcept { return vpacki16u8(vpacki32i16(x, y)); } |
889 | BL_INLINE I256 vpacki32u8(const I256& x, const I256& y, const I256& z, const I256& w) noexcept { return vpacki16u8(vpacki32i16(x, y), vpacki32i16(z, w)); } |
890 | |
891 | BL_INLINE I256 vpackzzdb(const I256& x, const I256& y) noexcept { return vpacki16u8(vpacki32i16(x, y)); } |
892 | BL_INLINE I256 vpackzzdb(const I256& x, const I256& y, const I256& z, const I256& w) noexcept { return vpacki16u8(vpacki32i16(x, y), vpacki32i16(z, w)); } |
893 | |
894 | BL_INLINE I256 vunpackli8(const I256& x, const I256& y) noexcept { return _mm256_unpacklo_epi8(x, y); } |
895 | BL_INLINE I256 vunpackhi8(const I256& x, const I256& y) noexcept { return _mm256_unpackhi_epi8(x, y); } |
896 | |
897 | BL_INLINE I256 vunpackli16(const I256& x, const I256& y) noexcept { return _mm256_unpacklo_epi16(x, y); } |
898 | BL_INLINE I256 vunpackhi16(const I256& x, const I256& y) noexcept { return _mm256_unpackhi_epi16(x, y); } |
899 | |
900 | BL_INLINE I256 vunpackli32(const I256& x, const I256& y) noexcept { return _mm256_unpacklo_epi32(x, y); } |
901 | BL_INLINE I256 vunpackhi32(const I256& x, const I256& y) noexcept { return _mm256_unpackhi_epi32(x, y); } |
902 | |
903 | BL_INLINE I256 vunpackli64(const I256& x, const I256& y) noexcept { return _mm256_unpacklo_epi64(x, y); } |
904 | BL_INLINE I256 vunpackhi64(const I256& x, const I256& y) noexcept { return _mm256_unpackhi_epi64(x, y); } |
905 | |
906 | BL_INLINE I256 vor(const I256& x, const I256& y) noexcept { return _mm256_or_si256(x, y); } |
907 | BL_INLINE I256 vxor(const I256& x, const I256& y) noexcept { return _mm256_xor_si256(x, y); } |
908 | BL_INLINE I256 vand(const I256& x, const I256& y) noexcept { return _mm256_and_si256(x, y); } |
909 | BL_INLINE I256 vandnot_a(const I256& x, const I256& y) noexcept { return _mm256_andnot_si256(x, y); } |
910 | BL_INLINE I256 vandnot_b(const I256& x, const I256& y) noexcept { return _mm256_andnot_si256(y, x); } |
911 | |
912 | BL_INLINE I256 vblendmask(const I256& x, const I256& y, const I256& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); } |
913 | BL_INLINE I256 vblendx(const I256& x, const I256& y, const I256& mask) noexcept { return _mm256_blendv_epi8(x, y, mask); } |
914 | |
915 | BL_INLINE I256 vaddi8(const I256& x, const I256& y) noexcept { return _mm256_add_epi8(x, y); } |
916 | BL_INLINE I256 vaddi16(const I256& x, const I256& y) noexcept { return _mm256_add_epi16(x, y); } |
917 | BL_INLINE I256 vaddi32(const I256& x, const I256& y) noexcept { return _mm256_add_epi32(x, y); } |
918 | BL_INLINE I256 vaddi64(const I256& x, const I256& y) noexcept { return _mm256_add_epi64(x, y); } |
919 | |
920 | BL_INLINE I256 vaddsi8(const I256& x, const I256& y) noexcept { return _mm256_adds_epi8(x, y); } |
921 | BL_INLINE I256 vaddsu8(const I256& x, const I256& y) noexcept { return _mm256_adds_epu8(x, y); } |
922 | BL_INLINE I256 vaddsi16(const I256& x, const I256& y) noexcept { return _mm256_adds_epi16(x, y); } |
923 | BL_INLINE I256 vaddsu16(const I256& x, const I256& y) noexcept { return _mm256_adds_epu16(x, y); } |
924 | |
925 | BL_INLINE I256 vsubi8(const I256& x, const I256& y) noexcept { return _mm256_sub_epi8(x, y); } |
926 | BL_INLINE I256 vsubi16(const I256& x, const I256& y) noexcept { return _mm256_sub_epi16(x, y); } |
927 | BL_INLINE I256 vsubi32(const I256& x, const I256& y) noexcept { return _mm256_sub_epi32(x, y); } |
928 | BL_INLINE I256 vsubi64(const I256& x, const I256& y) noexcept { return _mm256_sub_epi64(x, y); } |
929 | |
930 | BL_INLINE I256 vsubsi8(const I256& x, const I256& y) noexcept { return _mm256_subs_epi8(x, y); } |
931 | BL_INLINE I256 vsubsu8(const I256& x, const I256& y) noexcept { return _mm256_subs_epu8(x, y); } |
932 | BL_INLINE I256 vsubsi16(const I256& x, const I256& y) noexcept { return _mm256_subs_epi16(x, y); } |
933 | BL_INLINE I256 vsubsu16(const I256& x, const I256& y) noexcept { return _mm256_subs_epu16(x, y); } |
934 | |
935 | BL_INLINE I256 vmuli16(const I256& x, const I256& y) noexcept { return _mm256_mullo_epi16(x, y); } |
936 | BL_INLINE I256 vmulu16(const I256& x, const I256& y) noexcept { return _mm256_mullo_epi16(x, y); } |
937 | BL_INLINE I256 vmulhi16(const I256& x, const I256& y) noexcept { return _mm256_mulhi_epi16(x, y); } |
938 | BL_INLINE I256 vmulhu16(const I256& x, const I256& y) noexcept { return _mm256_mulhi_epu16(x, y); } |
939 | |
940 | template<uint8_t N_BITS> BL_INLINE I256 vslli16(const I256& x) noexcept { return _mm256_slli_epi16(x, N_BITS); } |
941 | template<uint8_t N_BITS> BL_INLINE I256 vslli32(const I256& x) noexcept { return _mm256_slli_epi32(x, N_BITS); } |
942 | template<uint8_t N_BITS> BL_INLINE I256 vslli64(const I256& x) noexcept { return _mm256_slli_epi64(x, N_BITS); } |
943 | |
944 | template<uint8_t N_BITS> BL_INLINE I256 vsrli16(const I256& x) noexcept { return _mm256_srli_epi16(x, N_BITS); } |
945 | template<uint8_t N_BITS> BL_INLINE I256 vsrli32(const I256& x) noexcept { return _mm256_srli_epi32(x, N_BITS); } |
946 | template<uint8_t N_BITS> BL_INLINE I256 vsrli64(const I256& x) noexcept { return _mm256_srli_epi64(x, N_BITS); } |
947 | |
948 | template<uint8_t N_BITS> BL_INLINE I256 vsrai16(const I256& x) noexcept { return _mm256_srai_epi16(x, N_BITS); } |
949 | template<uint8_t N_BITS> BL_INLINE I256 vsrai32(const I256& x) noexcept { return _mm256_srai_epi32(x, N_BITS); } |
950 | |
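// Byte-granularity shifts operate independently within each 128-bit lane
// (VPSLLDQ/VPSRLDQ semantics), hence the `128b` suffix.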
951 | template<uint8_t N_BYTES> BL_INLINE I256 vslli128b(const I256& x) noexcept { return _mm256_slli_si256(x, N_BYTES); } |
952 | template<uint8_t N_BYTES> BL_INLINE I256 vsrli128b(const I256& x) noexcept { return _mm256_srli_si256(x, N_BYTES); } |
953 | |
954 | BL_INLINE I256 vmini8(const I256& x, const I256& y) noexcept { return _mm256_min_epi8(x, y); } |
955 | BL_INLINE I256 vmaxi8(const I256& x, const I256& y) noexcept { return _mm256_max_epi8(x, y); } |
956 | BL_INLINE I256 vminu8(const I256& x, const I256& y) noexcept { return _mm256_min_epu8(x, y); } |
957 | BL_INLINE I256 vmaxu8(const I256& x, const I256& y) noexcept { return _mm256_max_epu8(x, y); } |
958 | |
959 | BL_INLINE I256 vmini16(const I256& x, const I256& y) noexcept { return _mm256_min_epi16(x, y); } |
960 | BL_INLINE I256 vmaxi16(const I256& x, const I256& y) noexcept { return _mm256_max_epi16(x, y); } |
961 | BL_INLINE I256 vminu16(const I256& x, const I256& y) noexcept { return _mm256_min_epu16(x, y); } |
962 | BL_INLINE I256 vmaxu16(const I256& x, const I256& y) noexcept { return _mm256_max_epu16(x, y); } |
963 | |
964 | BL_INLINE I256 vmini32(const I256& x, const I256& y) noexcept { return _mm256_min_epi32(x, y); } |
965 | BL_INLINE I256 vmaxi32(const I256& x, const I256& y) noexcept { return _mm256_max_epi32(x, y); } |
966 | BL_INLINE I256 vminu32(const I256& x, const I256& y) noexcept { return _mm256_min_epu32(x, y); } |
967 | BL_INLINE I256 vmaxu32(const I256& x, const I256& y) noexcept { return _mm256_max_epu32(x, y); } |
968 | |
969 | BL_INLINE I256 vcmpeqi8(const I256& x, const I256& y) noexcept { return _mm256_cmpeq_epi8(x, y); } |
970 | BL_INLINE I256 vcmpgti8(const I256& x, const I256& y) noexcept { return _mm256_cmpgt_epi8(x, y); } |
971 | |
972 | BL_INLINE I256 vcmpeqi16(const I256& x, const I256& y) noexcept { return _mm256_cmpeq_epi16(x, y); } |
973 | BL_INLINE I256 vcmpgti16(const I256& x, const I256& y) noexcept { return _mm256_cmpgt_epi16(x, y); } |
974 | |
975 | BL_INLINE I256 vcmpeqi32(const I256& x, const I256& y) noexcept { return _mm256_cmpeq_epi32(x, y); } |
976 | BL_INLINE I256 vcmpgti32(const I256& x, const I256& y) noexcept { return _mm256_cmpgt_epi32(x, y); } |
977 | |
978 | BL_INLINE I256 vloadi256_32(const void* p) noexcept { return vcast<I256>(vloadi128_32(p)); } |
979 | BL_INLINE I256 vloadi256_64(const void* p) noexcept { return vcast<I256>(vloadi128_64(p)); } |
980 | BL_INLINE I256 vloadi256_128a(const void* p) noexcept { return vcast<I256>(vloadi128a(p)); } |
981 | BL_INLINE I256 vloadi256_128u(const void* p) noexcept { return vcast<I256>(vloadi128u(p)); } |
982 | BL_INLINE I256 vloadi256a(const void* p) noexcept { return _mm256_load_si256(static_cast<const I256*>(p)); } |
983 | BL_INLINE I256 vloadi256u(const void* p) noexcept { return _mm256_loadu_si256(static_cast<const I256*>(p)); } |
984 | |
985 | BL_INLINE I256 vloadi256_l64(const I256& x, const void* p) noexcept { return vcast<I256>(vloadi128_l64(vcast<I128>(x), p)); } |
986 | BL_INLINE I256 vloadi256_h64(const I256& x, const void* p) noexcept { return vcast<I256>(vloadi128_h64(vcast<I128>(x), p)); } |
987 | |
988 | BL_INLINE void vstorei32(void* p, const I256& x) noexcept { vstorei32(p, vcast<I128>(x)); } |
989 | BL_INLINE void vstorei64(void* p, const I256& x) noexcept { vstorei64(p, vcast<I128>(x)); } |
990 | BL_INLINE void vstorei128a(void* p, const I256& x) noexcept { vstorei128a(p, vcast<I128>(x)); } |
991 | BL_INLINE void vstorei128u(void* p, const I256& x) noexcept { vstorei128u(p, vcast<I128>(x)); } |
992 | BL_INLINE void vstorei256a(void* p, const I256& x) noexcept { _mm256_store_si256(static_cast<I256*>(p), x); } |
993 | BL_INLINE void vstorei256u(void* p, const I256& x) noexcept { _mm256_storeu_si256(static_cast<I256*>(p), x); } |
994 | |
995 | BL_INLINE void vstoreli64(void* p, const I256& x) noexcept { vstoreli64(p, vcast<I128>(x)); } |
996 | BL_INLINE void vstorehi64(void* p, const I256& x) noexcept { vstorehi64(p, vcast<I128>(x)); } |
997 | |
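// The vhasmask*() helpers gather the per-element sign bits via MOVMSK and
// compare the result against the expected bit pattern.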
998 | BL_INLINE bool vhasmaski8(const I256& x, int bits0_31) noexcept { return _mm256_movemask_epi8(vcast<I256>(x)) == bits0_31; } |
999 | BL_INLINE bool vhasmaski8(const F256& x, int bits0_31) noexcept { return _mm256_movemask_epi8(vcast<I256>(x)) == bits0_31; } |
1000 | BL_INLINE bool vhasmaski8(const D256& x, int bits0_31) noexcept { return _mm256_movemask_epi8(vcast<I256>(x)) == bits0_31; } |
1001 | |
1002 | BL_INLINE bool vhasmaski32(const I256& x, int bits0_7) noexcept { return _mm256_movemask_ps(vcast<F256>(x)) == bits0_7; } |
1003 | BL_INLINE bool vhasmaski64(const I256& x, int bits0_3) noexcept { return _mm256_movemask_pd(vcast<D256>(x)) == bits0_3; } |
1004 | |
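// Divides each u16 lane by 255 with rounding, computed as
// `((x + 128) * 257) >> 16` - the constants below are 128 and 0x0101 (257)
// per lane, and vmulhu16() provides the final shift by 16. The result is
// exact for products of two 8-bit values.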
1005 | BL_INLINE I256 vdiv255u16(const I256& x) noexcept { |
1006 | I256 y = vaddi16(x, v_const_as<I256>(blCommonTable.i256_0080008000800080)); |
1007 | return vmulhu16(y, v_const_as<I256>(blCommonTable.i256_0101010101010101)); |
1008 | } |
1009 | #endif |
1010 | |
1011 | // ============================================================================ |
1012 | // [BLSIMD::F256] |
1013 | // ============================================================================ |
1014 | |
1015 | #if defined(BL_TARGET_OPT_AVX) |
1016 | BL_INLINE F256 vzerof256() noexcept { return _mm256_setzero_ps(); } |
1017 | |
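// Like _mm256_set_ps(), arguments are ordered from the most significant
// element to the least significant one (`x0` lands in lane 0); the shorter
// overloads replicate the given pattern across all eight lanes.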
1018 | BL_INLINE F256 vsetf256(float x) noexcept { return _mm256_set1_ps(x); } |
1019 | BL_INLINE F256 vsetf256(float x1, float x0) noexcept { return _mm256_set_ps(x1, x0, x1, x0, x1, x0, x1, x0); } |
1020 | BL_INLINE F256 vsetf256(float x3, float x2, float x1, float x0) noexcept { return _mm256_set_ps(x3, x2, x1, x0, x3, x2, x1, x0); } |
1021 | BL_INLINE F256 vsetf256(float x7, float x6, float x5, float x4, float x3, float x2, float x1, float x0) noexcept { return _mm256_set_ps(x7, x6, x5, x4, x3, x2, x1, x0); } |
1022 | |
1023 | BL_INLINE F256 vcvtf32f256(float x) noexcept { return vcast<F256>(vcvtf32f128(x)); } |
1024 | BL_INLINE float vcvtf256f32(const F256& x) noexcept { return vcvtf128f32(vcast<F128>(x)); } |
1025 | |
1026 | BL_INLINE F256 vcvti32f256(int32_t x) noexcept { return vcast<F256>(vcvti32f128(x)); } |
1027 | BL_INLINE int32_t vcvtf256i32(const F256& x) noexcept { return vcvtf128i32(vcast<F128>(x)); } |
1028 | BL_INLINE int32_t vcvttf256i32(const F256& x) noexcept { return vcvttf128i32(vcast<F128>(x)); } |
1029 | |
1030 | #if BL_TARGET_ARCH_BITS >= 64 |
1031 | BL_INLINE F256 vcvti64f256(int64_t x) noexcept { return vcast<F256>(vcvti64f128(x)); } |
1032 | BL_INLINE int64_t vcvtf256i64(const F256& x) noexcept { return vcvtf128i64(vcast<F128>(x)); } |
1033 | BL_INLINE int64_t vcvttf256i64(const F256& x) noexcept { return vcvttf128i64(vcast<F128>(x)); } |
1034 | #endif |
1035 | |
1036 | BL_INLINE I256 vcvtf256i256(const F256& x) noexcept { return _mm256_cvtps_epi32(x); } |
1037 | BL_INLINE I256 vcvttf256i256(const F256& x) noexcept { return _mm256_cvttps_epi32(x); } |
1038 | |
1039 | BL_INLINE D256 vcvtf128d256(const F128& x) noexcept { return _mm256_cvtps_pd(vcast<F128>(x)); } |
1040 | BL_INLINE D256 vcvtf256d256(const F256& x) noexcept { return _mm256_cvtps_pd(vcast<F128>(x)); } |
1041 | |
1042 | template<int A, int B, int C, int D> |
1043 | BL_INLINE F256 vshuff32(const F256& x, const F256& y) noexcept { return _mm256_shuffle_ps(x, y, _MM_SHUFFLE(A, B, C, D)); } |
1044 | template<int A, int B, int C, int D> |
1045 | BL_INLINE F256 vswizf32(const F256& x) noexcept { return vshuff32<A, B, C, D>(x, x); } |
1046 | |
1047 | template<int A, int B> |
1048 | BL_INLINE F256 vswizf64(const F256& x) noexcept { return vshuff32<A*2 + 1, A*2, B*2 + 1, B*2>(x, x); } |
1049 | |
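// A and B select the high and low 128-bit lanes of the result, respectively:
// values 0-1 pick a lane of `x`, values 2-3 a lane of `y`, and setting
// bit 3 (0x8) zeroes the lane.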
1050 | template<int A, int B> |
1051 | BL_INLINE F256 vpermf128(const F256& x, const F256& y) noexcept { return _mm256_permute2f128_ps(x, y, ((A & 0xF) << 4) + (B & 0xF)); } |
1052 | template<int A, int B> |
1053 | BL_INLINE F256 vpermf128(const F256& x) noexcept { return vpermf128<A, B>(x, x); } |
1054 | |
1055 | BL_INLINE F256 vduplf32(const F256& x) noexcept { return vswizf32<2, 2, 0, 0>(x); } |
1056 | BL_INLINE F256 vduphf32(const F256& x) noexcept { return vswizf32<3, 3, 1, 1>(x); } |
1057 | |
1058 | BL_INLINE F256 vswapf64(const F256& x) noexcept { return vswizf64<0, 1>(x); } |
1059 | BL_INLINE F256 vduplf64(const F256& x) noexcept { return vswizf64<0, 0>(x); } |
1060 | BL_INLINE F256 vduphf64(const F256& x) noexcept { return vswizf64<1, 1>(x); } |
1061 | |
1062 | BL_INLINE F256 vswapf128(const F256& x) noexcept { return vpermf128<0, 1>(x); } |
1063 | BL_INLINE F256 vduplf128(const F128& x) noexcept { return vpermf128<0, 0>(vcast<F256>(x)); } |
1064 | BL_INLINE F256 vduplf128(const F256& x) noexcept { return vpermf128<0, 0>(x); } |
1065 | BL_INLINE F256 vduphf128(const F256& x) noexcept { return vpermf128<1, 1>(x); } |
1066 | |
1067 | BL_INLINE F256 vunpacklf32(const F256& x, const F256& y) noexcept { return _mm256_unpacklo_ps(x, y); } |
1068 | BL_INLINE F256 vunpackhf32(const F256& x, const F256& y) noexcept { return _mm256_unpackhi_ps(x, y); } |
1069 | |
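// With AVX2 the splat maps directly to VBROADCASTSS; the AVX fallback
// broadcasts element 0 across one 128-bit lane and then duplicates that
// lane into both halves.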
1070 | #if defined(BL_TARGET_OPT_AVX2) |
1071 | BL_INLINE F256 vsplatf32f256(const F128& x) noexcept { return _mm256_broadcastss_ps(vcast<F128>(x)); } |
1072 | BL_INLINE F256 vsplatf32f256(const F256& x) noexcept { return _mm256_broadcastss_ps(vcast<F128>(x)); } |
1073 | #else |
1074 | BL_INLINE F256 vsplatf32f256(const F128& x) noexcept { return vduplf128(vswizf32<0, 0, 0, 0>(vcast<F128>(x))); } |
1075 | BL_INLINE F256 vsplatf32f256(const F256& x) noexcept { return vduplf128(vswizf32<0, 0, 0, 0>(vcast<F128>(x))); } |
1076 | #endif |
1077 | |
1078 | BL_INLINE F256 vor(const F256& x, const F256& y) noexcept { return _mm256_or_ps(x, y); } |
1079 | BL_INLINE F256 vxor(const F256& x, const F256& y) noexcept { return _mm256_xor_ps(x, y); } |
1080 | BL_INLINE F256 vand(const F256& x, const F256& y) noexcept { return _mm256_and_ps(x, y); } |
1081 | BL_INLINE F256 vandnot_a(const F256& x, const F256& y) noexcept { return _mm256_andnot_ps(x, y); } |
1082 | BL_INLINE F256 vandnot_b(const F256& x, const F256& y) noexcept { return _mm256_andnot_ps(y, x); } |
1083 | BL_INLINE F256 vblendmask(const F256& x, const F256& y, const F256& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); } |
1084 | |
1085 | BL_INLINE F256 vaddss(const F256& x, const F256& y) noexcept { return vcast<F256>(vaddss(vcast<F128>(x), vcast<F128>(y))); } |
1086 | BL_INLINE F256 vaddps(const F256& x, const F256& y) noexcept { return _mm256_add_ps(x, y); } |
1087 | |
1088 | BL_INLINE F256 vsubss(const F256& x, const F256& y) noexcept { return vcast<F256>(vsubss(vcast<F128>(x), vcast<F128>(y))); } |
1089 | BL_INLINE F256 vsubps(const F256& x, const F256& y) noexcept { return _mm256_sub_ps(x, y); } |
1090 | |
1091 | BL_INLINE F256 vmulss(const F256& x, const F256& y) noexcept { return vcast<F256>(vmulss(vcast<F128>(x), vcast<F128>(y))); } |
1092 | BL_INLINE F256 vmulps(const F256& x, const F256& y) noexcept { return _mm256_mul_ps(x, y); } |
1093 | |
1094 | BL_INLINE F256 vdivss(const F256& x, const F256& y) noexcept { return vcast<F256>(vdivss(vcast<F128>(x), vcast<F128>(y))); } |
1095 | BL_INLINE F256 vdivps(const F256& x, const F256& y) noexcept { return _mm256_div_ps(x, y); } |
1096 | |
1097 | BL_INLINE F256 vminss(const F256& x, const F256& y) noexcept { return vcast<F256>(vminss(vcast<F128>(x), vcast<F128>(y))); } |
1098 | BL_INLINE F256 vminps(const F256& x, const F256& y) noexcept { return _mm256_min_ps(x, y); } |
1099 | |
1100 | BL_INLINE F256 vmaxss(const F256& x, const F256& y) noexcept { return vcast<F256>(vmaxss(vcast<F128>(x), vcast<F128>(y))); } |
1101 | BL_INLINE F256 vmaxps(const F256& x, const F256& y) noexcept { return _mm256_max_ps(x, y); } |
1102 | |
1103 | BL_INLINE F256 vcmpeqss(const F256& x, const F256& y) noexcept { return vcast<F256>(vcmpeqss(vcast<F128>(x), vcast<F128>(y))); } |
1104 | BL_INLINE F256 vcmpeqps(const F256& x, const F256& y) noexcept { return _mm256_cmp_ps(x, y, _CMP_EQ_OQ); } |
1105 | |
1106 | BL_INLINE F256 vcmpness(const F256& x, const F256& y) noexcept { return vcast<F256>(vcmpness(vcast<F128>(x), vcast<F128>(y))); } |
1107 | BL_INLINE F256 vcmpneps(const F256& x, const F256& y) noexcept { return _mm256_cmp_ps(x, y, _CMP_NEQ_OQ); } |
1108 | |
1109 | BL_INLINE F256 vcmpgess(const F256& x, const F256& y) noexcept { return vcast<F256>(vcmpgess(vcast<F128>(x), vcast<F128>(y))); } |
1110 | BL_INLINE F256 vcmpgeps(const F256& x, const F256& y) noexcept { return _mm256_cmp_ps(x, y, _CMP_GE_OQ); } |
1111 | |
1112 | BL_INLINE F256 vcmpgtss(const F256& x, const F256& y) noexcept { return vcast<F256>(vcmpgtss(vcast<F128>(x), vcast<F128>(y))); } |
1113 | BL_INLINE F256 vcmpgtps(const F256& x, const F256& y) noexcept { return _mm256_cmp_ps(x, y, _CMP_GT_OQ); } |
1114 | |
1115 | BL_INLINE F256 vcmpless(const F256& x, const F256& y) noexcept { return vcast<F256>(vcmpless(vcast<F128>(x), vcast<F128>(y))); } |
1116 | BL_INLINE F256 vcmpleps(const F256& x, const F256& y) noexcept { return _mm256_cmp_ps(x, y, _CMP_LE_OQ); } |
1117 | |
1118 | BL_INLINE F256 vcmpltss(const F256& x, const F256& y) noexcept { return vcast<F256>(vcmpltss(vcast<F128>(x), vcast<F128>(y))); } |
1119 | BL_INLINE F256 vcmpltps(const F256& x, const F256& y) noexcept { return _mm256_cmp_ps(x, y, _CMP_LT_OQ); } |
1120 | |
1121 | BL_INLINE F256 vsqrtss(const F256& x) noexcept { return vcast<F256>(vsqrtss(vcast<F128>(x))); } |
1122 | BL_INLINE F256 vsqrtps(const F256& x) noexcept { return _mm256_sqrt_ps(x); } |
1123 | |
1124 | BL_INLINE F256 vloadf256_32(const void* p) noexcept { return vcast<F256>(vloadf128_32(p)); } |
1125 | BL_INLINE F256 vloadf256_64(const void* p) noexcept { return vcast<F256>(vloadf128_64(p)); } |
1126 | BL_INLINE F256 vloadf256_128a(const void* p) noexcept { return vcast<F256>(vloadf128a(p)); } |
1127 | BL_INLINE F256 vloadf256_128u(const void* p) noexcept { return vcast<F256>(vloadf128u(p)); } |
1128 | BL_INLINE F256 vloadf256a(const void* p) noexcept { return _mm256_load_ps(static_cast<const float*>(p)); } |
1129 | BL_INLINE F256 vloadf256u(const void* p) noexcept { return _mm256_loadu_ps(static_cast<const float*>(p)); } |
1130 | |
1131 | BL_INLINE F256 vloadf256_l64(const F256& x, const void* p) noexcept { return vcast<F256>(vloadf128_l64(vcast<F128>(x), p)); } |
1132 | BL_INLINE F256 vloadf256_h64(const F256& x, const void* p) noexcept { return vcast<F256>(vloadf128_h64(vcast<F128>(x), p)); } |
1133 | |
1134 | BL_INLINE F128 vbroadcastf128_32(const void* p) noexcept { return vcast<F128>(_mm_broadcast_ss(static_cast<const float*>(p))); } |
1135 | BL_INLINE F256 vbroadcastf256_32(const void* p) noexcept { return vcast<F256>(_mm256_broadcast_ss(static_cast<const float*>(p))); } |
1136 | BL_INLINE F256 vbroadcastf256_64(const void* p) noexcept { return vcast<F256>(_mm256_broadcast_sd(static_cast<const double*>(p))); } |
1137 | BL_INLINE F256 vbroadcastf256_128(const void* p) noexcept { return vcast<F256>(_mm256_broadcast_ps(static_cast<const __m128*>(p))); } |
1138 | |
1139 | BL_INLINE void vstoref32(void* p, const F256& x) noexcept { vstoref32(p, vcast<F128>(x)); } |
1140 | BL_INLINE void vstoref64(void* p, const F256& x) noexcept { vstoref64(p, vcast<F128>(x)); } |
1141 | BL_INLINE void vstorelf64(void* p, const F256& x) noexcept { vstorelf64(p, vcast<F128>(x)); } |
1142 | BL_INLINE void vstorehf64(void* p, const F256& x) noexcept { vstorehf64(p, vcast<F128>(x)); } |
1143 | BL_INLINE void vstoref128a(void* p, const F256& x) noexcept { vstoref128a(p, vcast<F128>(x)); } |
1144 | BL_INLINE void vstoref128u(void* p, const F256& x) noexcept { vstoref128u(p, vcast<F128>(x)); } |
1145 | BL_INLINE void vstoref256a(void* p, const F256& x) noexcept { _mm256_store_ps(static_cast<float*>(p), x); } |
1146 | BL_INLINE void vstoref256u(void* p, const F256& x) noexcept { _mm256_storeu_ps(static_cast<float*>(p), x); } |
1147 | |
1148 | BL_INLINE bool vhasmaskf32(const F256& x, int bits0_7) noexcept { return _mm256_movemask_ps(vcast<F256>(x)) == bits0_7; } |
1149 | BL_INLINE bool vhasmaskf64(const F256& x, int bits0_3) noexcept { return _mm256_movemask_pd(vcast<D256>(x)) == bits0_3; } |
1150 | #endif |
1151 | |
1152 | // ============================================================================ |
1153 | // [BLSIMD::D256] |
1154 | // ============================================================================ |
1155 | |
1156 | #if defined(BL_TARGET_OPT_AVX) |
1157 | BL_INLINE D256 vzerod256() noexcept { return _mm256_setzero_pd(); } |
1158 | BL_INLINE D256 vsetd256(double x) noexcept { return _mm256_set1_pd(x); } |
1159 | BL_INLINE D256 vsetd256(double x1, double x0) noexcept { return _mm256_set_pd(x1, x0, x1, x0); } |
1160 | BL_INLINE D256 vsetd256(double x3, double x2, double x1, double x0) noexcept { return _mm256_set_pd(x3, x2, x1, x0); } |
1161 | |
1162 | BL_INLINE D256 vcvtd64d256(double x) noexcept { return vcast<D256>(vcvtd64d128(x)); } |
1163 | BL_INLINE double vcvtd256d64(const D256& x) noexcept { return vcvtd128d64(vcast<D128>(x)); } |
1164 | |
1165 | BL_INLINE D256 vcvti32d256(int32_t x) noexcept { return vcast<D256>(vcvti32d128(x)); } |
1166 | BL_INLINE int32_t vcvtd256i32(const D256& x) noexcept { return vcvtd128i32(vcast<D128>(x)); } |
1167 | BL_INLINE int32_t vcvttd256i32(const D256& x) noexcept { return vcvttd128i32(vcast<D128>(x)); } |
1168 | |
1169 | #if BL_TARGET_ARCH_BITS >= 64 |
1170 | BL_INLINE D256 vcvti64d256(int64_t x) noexcept { return vcast<D256>(vcvti64d128(x)); } |
1171 | BL_INLINE int64_t vcvtd256i64(const D256& x) noexcept { return vcvtd128i64(vcast<D128>(x)); } |
1172 | BL_INLINE int64_t vcvttd256i64(const D256& x) noexcept { return vcvttd128i64(vcast<D128>(x)); } |
1173 | #endif |
1174 | |
1175 | BL_INLINE I128 vcvtd256i128(const D256& x) noexcept { return vcast<I128>(_mm256_cvtpd_epi32(x)); } |
1176 | BL_INLINE I256 vcvtd256i256(const D256& x) noexcept { return vcast<I256>(_mm256_cvtpd_epi32(x)); } |
1177 | |
1178 | BL_INLINE I128 vcvttd256i128(const D256& x) noexcept { return vcast<I128>(_mm256_cvttpd_epi32(x)); } |
1179 | BL_INLINE I256 vcvttd256i256(const D256& x) noexcept { return vcast<I256>(_mm256_cvttpd_epi32(x)); } |
1180 | |
1181 | BL_INLINE F128 vcvtd256f128(const D256& x) noexcept { return vcast<F128>(_mm256_cvtpd_ps(x)); } |
1182 | BL_INLINE F256 vcvtd256f256(const D256& x) noexcept { return vcast<F256>(_mm256_cvtpd_ps(x)); } |
1183 | |
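// vshufd64() replicates the 128-bit <A, B> selection into both lanes of the
// shuffle immediate.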
1184 | template<int A, int B> |
1185 | BL_INLINE D256 vshufd64(const D256& x, const D256& y) noexcept { return _mm256_shuffle_pd(x, y, (A << 3) | (B << 2) | (A << 1) | B); } |
1186 | template<int A, int B> |
1187 | BL_INLINE D256 vswizd64(const D256& x) noexcept { return vshufd64<A, B>(x, x); } |
1188 | |
1189 | template<int A, int B> |
1190 | BL_INLINE D256 vpermd128(const D256& x, const D256& y) noexcept { return _mm256_permute2f128_pd(x, y, ((A & 0xF) << 4) + (B & 0xF)); } |
1191 | template<int A, int B> |
1192 | BL_INLINE D256 vpermd128(const D256& x) noexcept { return vpermd128<A, B>(x, x); } |
1193 | |
1194 | BL_INLINE D256 vswapd64(const D256& x) noexcept { return vswizd64<0, 1>(x); } |
1195 | BL_INLINE D256 vdupld64(const D256& x) noexcept { return vswizd64<0, 0>(x); } |
1196 | BL_INLINE D256 vduphd64(const D256& x) noexcept { return vswizd64<1, 1>(x); } |
1197 | |
1198 | BL_INLINE D256 vswapd128(const D256& x) noexcept { return vpermd128<0, 1>(x); } |
1199 | BL_INLINE D256 vdupld128(const D128& x) noexcept { return vpermd128<0, 0>(vcast<D256>(x)); } |
1200 | BL_INLINE D256 vdupld128(const D256& x) noexcept { return vpermd128<0, 0>(x); } |
1201 | BL_INLINE D256 vduphd128(const D256& x) noexcept { return vpermd128<1, 1>(x); } |
1202 | |
1203 | BL_INLINE D256 vunpackld64(const D256& x, const D256& y) noexcept { return _mm256_unpacklo_pd(x, y); } |
1204 | BL_INLINE D256 vunpackhd64(const D256& x, const D256& y) noexcept { return _mm256_unpackhi_pd(x, y); } |
1205 | |
1206 | #if defined(BL_TARGET_OPT_AVX2) |
1207 | BL_INLINE D256 vsplatd64d256(const D128& x) noexcept { return _mm256_broadcastsd_pd(vcast<D128>(x)); } |
1208 | BL_INLINE D256 vsplatd64d256(const D256& x) noexcept { return _mm256_broadcastsd_pd(vcast<D128>(x)); } |
1209 | #else |
1210 | BL_INLINE D256 vsplatd64d256(const D128& x) noexcept { return vdupld128(vswizd64<0, 0>(vcast<D128>(x))); } |
1211 | BL_INLINE D256 vsplatd64d256(const D256& x) noexcept { return vdupld128(vswizd64<0, 0>(vcast<D128>(x))); } |
1212 | #endif |
1213 | |
1214 | BL_INLINE D256 vor(const D256& x, const D256& y) noexcept { return _mm256_or_pd(x, y); } |
1215 | BL_INLINE D256 vxor(const D256& x, const D256& y) noexcept { return _mm256_xor_pd(x, y); } |
1216 | BL_INLINE D256 vand(const D256& x, const D256& y) noexcept { return _mm256_and_pd(x, y); } |
1217 | BL_INLINE D256 vandnot_a(const D256& x, const D256& y) noexcept { return _mm256_andnot_pd(x, y); } |
1218 | BL_INLINE D256 vandnot_b(const D256& x, const D256& y) noexcept { return _mm256_andnot_pd(y, x); } |
1219 | BL_INLINE D256 vblendmask(const D256& x, const D256& y, const D256& mask) noexcept { return vor(vandnot_a(mask, x), vand(y, mask)); } |
1220 | |
1221 | BL_INLINE D256 vaddsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vaddsd(vcast<D128>(x), vcast<D128>(y))); } |
1222 | BL_INLINE D256 vaddpd(const D256& x, const D256& y) noexcept { return _mm256_add_pd(x, y); } |
1223 | |
1224 | BL_INLINE D256 vsubsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vsubsd(vcast<D128>(x), vcast<D128>(y))); } |
1225 | BL_INLINE D256 vsubpd(const D256& x, const D256& y) noexcept { return _mm256_sub_pd(x, y); } |
1226 | |
1227 | BL_INLINE D256 vmulsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vmulsd(vcast<D128>(x), vcast<D128>(y))); } |
1228 | BL_INLINE D256 vmulpd(const D256& x, const D256& y) noexcept { return _mm256_mul_pd(x, y); } |
1229 | |
1230 | BL_INLINE D256 vdivsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vdivsd(vcast<D128>(x), vcast<D128>(y))); } |
1231 | BL_INLINE D256 vdivpd(const D256& x, const D256& y) noexcept { return _mm256_div_pd(x, y); } |
1232 | |
1233 | BL_INLINE D256 vminsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vminsd(vcast<D128>(x), vcast<D128>(y))); } |
1234 | BL_INLINE D256 vminpd(const D256& x, const D256& y) noexcept { return _mm256_min_pd(x, y); } |
1235 | |
1236 | BL_INLINE D256 vmaxsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vmaxsd(vcast<D128>(x), vcast<D128>(y))); } |
1237 | BL_INLINE D256 vmaxpd(const D256& x, const D256& y) noexcept { return _mm256_max_pd(x, y); } |
1238 | |
1239 | BL_INLINE D256 vcmpeqsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vcmpeqsd(vcast<D128>(x), vcast<D128>(y))); } |
1240 | BL_INLINE D256 vcmpeqpd(const D256& x, const D256& y) noexcept { return _mm256_cmp_pd(x, y, _CMP_EQ_OQ); } |
1241 | |
1242 | BL_INLINE D256 vcmpnesd(const D256& x, const D256& y) noexcept { return vcast<D256>(vcmpnesd(vcast<D128>(x), vcast<D128>(y))); } |
1243 | BL_INLINE D256 vcmpnepd(const D256& x, const D256& y) noexcept { return _mm256_cmp_pd(x, y, _CMP_NEQ_OQ); } |
1244 | |
1245 | BL_INLINE D256 vcmpgesd(const D256& x, const D256& y) noexcept { return vcast<D256>(vcmpgesd(vcast<D128>(x), vcast<D128>(y))); } |
1246 | BL_INLINE D256 vcmpgepd(const D256& x, const D256& y) noexcept { return _mm256_cmp_pd(x, y, _CMP_GE_OQ); } |
1247 | |
1248 | BL_INLINE D256 vcmpgtsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vcmpgtsd(vcast<D128>(x), vcast<D128>(y))); } |
1249 | BL_INLINE D256 vcmpgtpd(const D256& x, const D256& y) noexcept { return _mm256_cmp_pd(x, y, _CMP_GT_OQ); } |
1250 | |
1251 | BL_INLINE D256 vcmplesd(const D256& x, const D256& y) noexcept { return vcast<D256>(vcmplesd(vcast<D128>(x), vcast<D128>(y))); } |
1252 | BL_INLINE D256 vcmplepd(const D256& x, const D256& y) noexcept { return _mm256_cmp_pd(x, y, _CMP_LE_OQ); } |
1253 | |
1254 | BL_INLINE D256 vcmpltsd(const D256& x, const D256& y) noexcept { return vcast<D256>(vcmpltsd(vcast<D128>(x), vcast<D128>(y))); } |
BL_INLINE D256 vcmpltpd(const D256& x, const D256& y) noexcept { return _mm256_cmp_pd(x, y, _CMP_LT_OQ); }
1256 | |
1257 | BL_INLINE D256 vsqrtsd(const D256& x) noexcept { return vcast<D256>(vsqrtsd(vcast<D128>(x))); } |
1258 | BL_INLINE D256 vsqrtpd(const D256& x) noexcept { return _mm256_sqrt_pd(x); } |
1259 | |
1260 | BL_INLINE D256 vloadd256_64(const void* p) noexcept { return vcast<D256>(vloadd128_64(p)); } |
1261 | BL_INLINE D256 vloadd256_128a(const void* p) noexcept { return vcast<D256>(vloadd128a(p)); } |
1262 | BL_INLINE D256 vloadd256_128u(const void* p) noexcept { return vcast<D256>(vloadd128u(p)); } |
1263 | BL_INLINE D256 vloadd256a(const void* p) noexcept { return _mm256_load_pd(static_cast<const double*>(p)); } |
1264 | BL_INLINE D256 vloadd256u(const void* p) noexcept { return _mm256_loadu_pd(static_cast<const double*>(p)); } |
1265 | |
1266 | BL_INLINE D256 vloadd256_l64(const D256& x, const void* p) noexcept { return vcast<D256>(vloadd128_l64(vcast<D128>(x), p)); } |
1267 | BL_INLINE D256 vloadd256_h64(const D256& x, const void* p) noexcept { return vcast<D256>(vloadd128_h64(vcast<D128>(x), p)); } |
1268 | |
1269 | BL_INLINE D256 vbroadcastd256_64(const void* p) noexcept { return _mm256_broadcast_sd(static_cast<const double*>(p)); } |
1270 | BL_INLINE D256 vbroadcastd256_128(const void* p) noexcept { return _mm256_broadcast_pd(static_cast<const __m128d*>(p)); } |
1271 | |
1272 | BL_INLINE void vstored64(void* p, const D256& x) noexcept { vstored64(p, vcast<D128>(x)); } |
1273 | BL_INLINE void vstoreld64(void* p, const D256& x) noexcept { vstoreld64(p, vcast<D128>(x)); } |
1274 | BL_INLINE void vstorehd64(void* p, const D256& x) noexcept { vstorehd64(p, vcast<D128>(x)); } |
1275 | BL_INLINE void vstored128a(void* p, const D256& x) noexcept { vstored128a(p, vcast<D128>(x)); } |
1276 | BL_INLINE void vstored128u(void* p, const D256& x) noexcept { vstored128u(p, vcast<D128>(x)); } |
1277 | BL_INLINE void vstored256a(void* p, const D256& x) noexcept { _mm256_store_pd(static_cast<double*>(p), x); } |
1278 | BL_INLINE void vstored256u(void* p, const D256& x) noexcept { _mm256_storeu_pd(static_cast<double*>(p), x); } |
1279 | |
1280 | BL_INLINE bool vhasmaskd64(const D256& x, int bits0_3) noexcept { return _mm256_movemask_pd(vcast<D256>(x)) == bits0_3; } |
1281 | #endif |
1282 | |
1283 | #endif |
1284 | |
1285 | } // {anonymous} |
1286 | } // {SIMD} |
1287 | |
1288 | //! \} |
1289 | //! \endcond |
1290 | |
1291 | #endif // BLEND2D_BLSIMD_X86_P_H |
1292 | |