/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_CMP_EQ_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_CMP_EQ_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/make_shuffle_bytes_mask.h>
#include <simdpp/core/bit_and.h>
#include <simdpp/core/bit_or.h>
#include <simdpp/core/i_shift_r.h>
#include <simdpp/core/i_shift_l.h>
#include <simdpp/core/transpose.h>
#include <simdpp/detail/null/compare.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {


static SIMDPP_INL
mask_int8x16 i_cmp_eq(const uint8x16& a, const uint8x16& b)
{
#if SIMDPP_USE_NULL
    return detail::null::cmp_eq(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmpeq_epi8_mask(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    return _mm_cmpeq_epi8(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vceqq_u8(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpeq(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_ceq_b((v16i8) a.native(), (v16i8) b.native());
#endif
}

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_int8<16> i_cmp_eq(const mask_int8<16>& a, const mask_int8<16>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int8x32 i_cmp_eq(const uint8x32& a, const uint8x32& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpeq_epi8_mask(a.native(), b.native());
#else
    return _mm256_cmpeq_epi8(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_int8<32> i_cmp_eq(const mask_int8<32>& a, const mask_int8<32>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL mask_int8<64> i_cmp_eq(const uint8<64>& a, const uint8<64>& b)
{
    return _mm512_cmpeq_epi8_mask(a.native(), b.native());
}

SIMDPP_INL mask_int8<64> i_cmp_eq(const mask_int8<64>& a, const mask_int8<64>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int16x8 i_cmp_eq(const uint16x8& a, const uint16x8& b)
{
#if SIMDPP_USE_NULL
    return detail::null::cmp_eq(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmpeq_epi16_mask(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    return _mm_cmpeq_epi16(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vceqq_u16(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpeq(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_ceq_h((v8i16) a.native(), (v8i16) b.native());
#endif
}

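// Mask-mask overloads for 16-bit elements: assumed to follow the same
// kxnor pattern as the 8-bit and 32-bit sections in this file.
#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_int16<8> i_cmp_eq(const mask_int16<8>& a, const mask_int16<8>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif
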
#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int16x16 i_cmp_eq(const uint16x16& a, const uint16x16& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpeq_epi16_mask(a.native(), b.native());
#else
    return _mm256_cmpeq_epi16(a.native(), b.native());
#endif
}
#endif

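// As above: assumed kxnor-based mask-mask overload, here for mask_int16<16>.
#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_int16<16> i_cmp_eq(const mask_int16<16>& a, const mask_int16<16>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif
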
#if SIMDPP_USE_AVX512BW
SIMDPP_INL mask_int16<32> i_cmp_eq(const uint16<32>& a, const uint16<32>& b)
{
    return _mm512_cmpeq_epi16_mask(a.native(), b.native());
}
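
// Assumed mask-mask counterpart, mirroring the int8 AVX512BW overload above.
SIMDPP_INL mask_int16<32> i_cmp_eq(const mask_int16<32>& a, const mask_int16<32>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}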
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int32x4 i_cmp_eq(const uint32x4& a, const uint32x4& b)
{
#if SIMDPP_USE_NULL
    return detail::null::cmp_eq(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmpeq_epi32_mask(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    return _mm_cmpeq_epi32(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vceqq_u32(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpeq(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_ceq_w((v4i32) a.native(), (v4i32) b.native());
#endif
}

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_int32<4> i_cmp_eq(const mask_int32<4>& a, const mask_int32<4>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int32x8 i_cmp_eq(const uint32x8& a, const uint32x8& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpeq_epi32_mask(a.native(), b.native());
#else
    return _mm256_cmpeq_epi32(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_int32<8> i_cmp_eq(const mask_int32<8>& a, const mask_int32<8>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_int32<16> i_cmp_eq(const uint32<16>& a, const uint32<16>& b)
{
    return _mm512_cmpeq_epi32_mask(a.native(), b.native());
}

static SIMDPP_INL
mask_int32<16> i_cmp_eq(const mask_int32<16>& a, const mask_int32<16>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int64x2 i_cmp_eq(const uint64x2& a, const uint64x2& b)
{
#if SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comeq_epi64(a.native(), b.native());
#elif SIMDPP_USE_AVX512VL
    return _mm_cmpeq_epi64_mask(a.native(), b.native());
#elif SIMDPP_USE_SSE4_1
    return _mm_cmpeq_epi64(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    uint64x2 r32, r32s;
    r32 = i_cmp_eq(uint32x4(a), uint32x4(b));
    // swap the 32-bit halves
    r32s = bit_or(shift_l<32>(r32), shift_r<32>(r32));
    // combine the results: each 32-bit half is ANDed with its neighbouring half
    r32 = bit_and(r32, r32s);
    return r32;
#elif SIMDPP_USE_NEON64
    return vceqq_u64(a.native(), b.native());
#elif SIMDPP_USE_NEON32
    uint32x4 r32, r32s;
    r32 = i_cmp_eq(uint32x4(a), uint32x4(b));
    r32s = r32;
    // swap the 32-bit halves
    transpose2(r32, r32s);
    // combine the results: each 32-bit half is ANDed with its neighbouring half
    r32 = bit_and(r32, r32s);
    return uint64x2(r32);
#elif SIMDPP_USE_VSX_207
    return (__vector uint64_t) vec_cmpeq(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_ceq_d((v2i64) a.native(), (v2i64) b.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return detail::null::cmp_eq(a, b);
#endif
}

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_int64<2> i_cmp_eq(const mask_int64<2>& a, const mask_int64<2>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int64x4 i_cmp_eq(const uint64x4& a, const uint64x4& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpeq_epi64_mask(a.native(), b.native());
#else
    return _mm256_cmpeq_epi64(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_int64<4> i_cmp_eq(const mask_int64<4>& a, const mask_int64<4>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_int64<8> i_cmp_eq(const uint64<8>& a, const uint64<8>& b)
{
    return _mm512_cmpeq_epi64_mask(a.native(), b.native());
}

static SIMDPP_INL
mask_int64<8> i_cmp_eq(const mask_int64<8>& a, const mask_int64<8>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_float32x4 i_cmp_eq(const float32x4& a, const float32x4& b)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    return detail::null::cmp_eq(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmp_ps_mask(a.native(), b.native(), _CMP_EQ_OQ);
#elif SIMDPP_USE_AVX
    return _mm_cmp_ps(a.native(), b.native(), _CMP_EQ_OQ);
#elif SIMDPP_USE_SSE2
    return _mm_cmpeq_ps(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vreinterpretq_f32_u32(vceqq_f32(a.native(), b.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpeq(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v4f32) __msa_fceq_w(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_float32<4> i_cmp_eq(const mask_float32<4>& a, const mask_float32<4>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif


#if SIMDPP_USE_AVX
static SIMDPP_INL
mask_float32x8 i_cmp_eq(const float32x8& a, const float32x8& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmp_ps_mask(a.native(), b.native(), _CMP_EQ_OQ);
#else
    return _mm256_cmp_ps(a.native(), b.native(), _CMP_EQ_OQ);
#endif
}
#endif

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_float32<8> i_cmp_eq(const mask_float32<8>& a, const mask_float32<8>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_float32<16> i_cmp_eq(const float32<16>& a, const float32<16>& b)
{
    return _mm512_cmp_ps_mask(a.native(), b.native(), _CMP_EQ_OQ);
}

static SIMDPP_INL
mask_float32<16> i_cmp_eq(const mask_float32<16>& a, const mask_float32<16>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_float64x2 i_cmp_eq(const float64x2& a, const float64x2& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm_cmp_pd_mask(a.native(), b.native(), _CMP_EQ_OQ);
#elif SIMDPP_USE_AVX
    return _mm_cmp_pd(a.native(), b.native(), _CMP_EQ_OQ);
#elif SIMDPP_USE_SSE2
    return _mm_cmpeq_pd(a.native(), b.native());
#elif SIMDPP_USE_NEON64
    return vreinterpretq_f64_u64(vceqq_f64(a.native(), b.native()));
#elif SIMDPP_USE_VSX_206
    return (__vector double) vec_cmpeq(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v2f64) __msa_fceq_d(a.native(), b.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    return detail::null::cmp_eq(a, b);
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, b);
#endif
}

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_float64<2> i_cmp_eq(const mask_float64<2>& a, const mask_float64<2>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif


#if SIMDPP_USE_AVX
static SIMDPP_INL
mask_float64x4 i_cmp_eq(const float64x4& a, const float64x4& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmp_pd_mask(a.native(), b.native(), _CMP_EQ_OQ);
#else
    return _mm256_cmp_pd(a.native(), b.native(), _CMP_EQ_OQ);
#endif
}
#endif

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_float64<4> i_cmp_eq(const mask_float64<4>& a, const mask_float64<4>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_float64<8> i_cmp_eq(const float64<8>& a, const float64<8>& b)
{
    return _mm512_cmp_pd_mask(a.native(), b.native(), _CMP_EQ_OQ);
}

static SIMDPP_INL
mask_float64<8> i_cmp_eq(const mask_float64<8>& a, const mask_float64<8>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

// Generic case: vectors wider than the native width are processed one
// native-width sub-vector at a time.
template<class V> SIMDPP_INL
typename V::mask_vector_type i_cmp_eq(const V& a, const V& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(typename V::mask_vector_type, i_cmp_eq, a, b);
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif