/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_CMP_EQ_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_CMP_EQ_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/make_shuffle_bytes_mask.h>
#include <simdpp/core/bit_and.h>
#include <simdpp/core/bit_or.h>
#include <simdpp/core/i_shift_r.h>
#include <simdpp/core/i_shift_l.h>
#include <simdpp/core/transpose.h>
#include <simdpp/detail/null/compare.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {


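// Each i_cmp_eq overload below selects a native implementation for the
// instruction set being compiled for: the NULL backend falls back to the
// scalar loops in detail::null, x86 targets use the SSE/AVX compare
// intrinsics, and NEON, Altivec/VSX and MSA use their respective equivalents.
// Under AVX512VL the comparison result is a register mask (__mmask*) rather
// than a full vector, which is why separate overloads taking mask arguments
// are provided as well.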
static SIMDPP_INL
mask_int8x16 i_cmp_eq(const uint8x16& a, const uint8x16& b)
{
#if SIMDPP_USE_NULL
    return detail::null::cmp_eq(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmpeq_epi8_mask(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    return _mm_cmpeq_epi8(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vceqq_u8(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpeq(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_ceq_b((v16i8) a.native(), (v16i8) b.native());
#endif
}

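// With AVX512VL the mask types wrap k-registers, where each bit corresponds
// to one lane. Two such masks compare equal in a lane exactly when the
// corresponding bits are equal, so mask-vs-mask comparison reduces to a
// bitwise XNOR of the k-registers, which is what the _mm512_kxnor overloads
// below compute.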
#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_int8<16> i_cmp_eq(const mask_int8<16>& a, const mask_int8<16>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int8x32 i_cmp_eq(const uint8x32& a, const uint8x32& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpeq_epi8_mask(a.native(), b.native());
#else
    return _mm256_cmpeq_epi8(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_int8<32> i_cmp_eq(const mask_int8<32>& a, const mask_int8<32>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL mask_int8<64> i_cmp_eq(const uint8<64>& a, const uint8<64>& b)
{
    return _mm512_cmpeq_epi8_mask(a.native(), b.native());
}

SIMDPP_INL mask_int8<64> i_cmp_eq(const mask_int8<64>& a, const mask_int8<64>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int16x8 i_cmp_eq(const uint16x8& a, const uint16x8& b)
{
#if SIMDPP_USE_NULL
    return detail::null::cmp_eq(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmpeq_epi16_mask(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    return _mm_cmpeq_epi16(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vceqq_u16(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpeq(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_ceq_h((v8i16) a.native(), (v8i16) b.native());
#endif
}

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_int16<8> i_cmp_eq(const mask_int16<8>& a, const mask_int16<8>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int16x16 i_cmp_eq(const uint16x16& a, const uint16x16& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpeq_epi16_mask(a.native(), b.native());
#else
    return _mm256_cmpeq_epi16(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_int16<16> i_cmp_eq(const mask_int16<16>& a, const mask_int16<16>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL mask_int16<32> i_cmp_eq(const uint16<32>& a, const uint16<32>& b)
{
    return _mm512_cmpeq_epi16_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int32x4 i_cmp_eq(const uint32x4& a, const uint32x4& b)
{
#if SIMDPP_USE_NULL
    return detail::null::cmp_eq(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmpeq_epi32_mask(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    return _mm_cmpeq_epi32(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vceqq_u32(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpeq(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_ceq_w((v4i32) a.native(), (v4i32) b.native());
#endif
}

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_int32<4> i_cmp_eq(const mask_int32<4>& a, const mask_int32<4>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int32x8 i_cmp_eq(const uint32x8& a, const uint32x8& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpeq_epi32_mask(a.native(), b.native());
#else
    return _mm256_cmpeq_epi32(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_int32<8> i_cmp_eq(const mask_int32<8>& a, const mask_int32<8>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_int32<16> i_cmp_eq(const uint32<16>& a, const uint32<16>& b)
{
    return _mm512_cmpeq_epi32_mask(a.native(), b.native());
}

static SIMDPP_INL
mask_int32<16> i_cmp_eq(const mask_int32<16>& a, const mask_int32<16>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int64x2 i_cmp_eq(const uint64x2& a, const uint64x2& b)
{
#if SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comeq_epi64(a.native(), b.native());
#elif SIMDPP_USE_AVX512VL
    return _mm_cmpeq_epi64_mask(a.native(), b.native());
#elif SIMDPP_USE_SSE4_1
    return _mm_cmpeq_epi64(a.native(), b.native());
#elif SIMDPP_USE_SSE2
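    // SSE2 has no 64-bit integer compare; emulate it with the 32-bit compare.
    // For a 64-bit element whose halves compare as [lo_eq, hi_eq], swapping
    // the halves and ANDing yields lo_eq & hi_eq in both 32-bit lanes, i.e.
    // all-ones exactly when the whole 64-bit element is equal.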
    uint64x2 r32, r32s;
    r32 = i_cmp_eq(uint32x4(a), uint32x4(b));
    // swap the 32-bit halves within each 64-bit element
    r32s = bit_or(shift_l<32>(r32), shift_r<32>(r32));
    // combine the results: a 64-bit element is equal only if both of its
    // 32-bit halves are equal
    r32 = bit_and(r32, r32s);
    return r32;
#elif SIMDPP_USE_NEON64
    return vceqq_u64(a.native(), b.native());
#elif SIMDPP_USE_NEON32
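    // No 64-bit compare on 32-bit NEON either; use the same two-32-bit-compares
    // approach as the SSE2 branch above.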
    uint32x4 r32, r32s;
    r32 = i_cmp_eq(uint32x4(a), uint32x4(b));
    r32s = r32;
    // transpose2 of the vector with its copy places the low-half results into
    // one vector and the high-half results into the other
    transpose2(r32, r32s);
    // combine the results: a 64-bit element is equal only if both of its
    // 32-bit halves are equal
    r32 = bit_and(r32, r32s);
    return uint64x2(r32);
#elif SIMDPP_USE_VSX_207
    return (__vector uint64_t) vec_cmpeq(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_ceq_d((v2i64) a.native(), (v2i64) b.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return detail::null::cmp_eq(a, b);
#endif
}

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_int64<2> i_cmp_eq(const mask_int64<2>& a, const mask_int64<2>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int64x4 i_cmp_eq(const uint64x4& a, const uint64x4& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpeq_epi64_mask(a.native(), b.native());
#else
    return _mm256_cmpeq_epi64(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_int64<4> i_cmp_eq(const mask_int64<4>& a, const mask_int64<4>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_int64<8> i_cmp_eq(const uint64<8>& a, const uint64<8>& b)
{
    return _mm512_cmpeq_epi64_mask(a.native(), b.native());
}

static SIMDPP_INL
mask_int64<8> i_cmp_eq(const mask_int64<8>& a, const mask_int64<8>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_float32x4 i_cmp_eq(const float32x4& a, const float32x4& b)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    return detail::null::cmp_eq(a, b);
#elif SIMDPP_USE_AVX512VL
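    // _CMP_EQ_OQ selects ordered, non-signaling (quiet) equality: any lane
    // containing NaN compares as not-equal and quiet NaNs do not raise
    // floating-point exceptions.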
    return _mm_cmp_ps_mask(a.native(), b.native(), _CMP_EQ_OQ);
#elif SIMDPP_USE_AVX
    return _mm_cmp_ps(a.native(), b.native(), _CMP_EQ_OQ);
#elif SIMDPP_USE_SSE2
    return _mm_cmpeq_ps(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vreinterpretq_f32_u32(vceqq_f32(a.native(), b.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpeq(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v4f32) __msa_fceq_w(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_float32<4> i_cmp_eq(const mask_float32<4>& a, const mask_float32<4>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif


#if SIMDPP_USE_AVX
static SIMDPP_INL
mask_float32x8 i_cmp_eq(const float32x8& a, const float32x8& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmp_ps_mask(a.native(), b.native(), _CMP_EQ_OQ);
#else
    return _mm256_cmp_ps(a.native(), b.native(), _CMP_EQ_OQ);
#endif
}
#endif

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_float32<8> i_cmp_eq(const mask_float32<8>& a, const mask_float32<8>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_float32<16> i_cmp_eq(const float32<16>& a, const float32<16>& b)
{
    return _mm512_cmp_ps_mask(a.native(), b.native(), _CMP_EQ_OQ);
}

static SIMDPP_INL
mask_float32<16> i_cmp_eq(const mask_float32<16>& a, const mask_float32<16>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_float64x2 i_cmp_eq(const float64x2& a, const float64x2& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm_cmp_pd_mask(a.native(), b.native(), _CMP_EQ_OQ);
#elif SIMDPP_USE_AVX
    return _mm_cmp_pd(a.native(), b.native(), _CMP_EQ_OQ);
#elif SIMDPP_USE_SSE2
    return _mm_cmpeq_pd(a.native(), b.native());
#elif SIMDPP_USE_NEON64
    return vreinterpretq_f64_u64(vceqq_f64(a.native(), b.native()));
#elif SIMDPP_USE_VSX_206
    return (__vector double) vec_cmpeq(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v2f64) __msa_fceq_d(a.native(), b.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    return detail::null::cmp_eq(a, b);
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, b);
#endif
}

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_float64<2> i_cmp_eq(const mask_float64<2>& a, const mask_float64<2>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif


#if SIMDPP_USE_AVX
static SIMDPP_INL
mask_float64x4 i_cmp_eq(const float64x4& a, const float64x4& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmp_pd_mask(a.native(), b.native(), _CMP_EQ_OQ);
#else
    return _mm256_cmp_pd(a.native(), b.native(), _CMP_EQ_OQ);
#endif
}
#endif

#if SIMDPP_USE_AVX512VL
static SIMDPP_INL
mask_float64<4> i_cmp_eq(const mask_float64<4>& a, const mask_float64<4>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_float64<8> i_cmp_eq(const float64<8>& a, const float64<8>& b)
{
    return _mm512_cmp_pd_mask(a.native(), b.native(), _CMP_EQ_OQ);
}

static SIMDPP_INL
mask_float64<8> i_cmp_eq(const mask_float64<8>& a, const mask_float64<8>& b)
{
    return _mm512_kxnor(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

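// Generic fallback for vector types wider than the largest natively supported
// width: SIMDPP_VEC_ARRAY_IMPL2 (from detail/vector_array_macros.h) applies
// i_cmp_eq to each native-width sub-vector of a and b and assembles the
// per-chunk results into the mask type of V.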
template<class V> SIMDPP_INL
typename V::mask_vector_type i_cmp_eq(const V& a, const V& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(typename V::mask_vector_type, i_cmp_eq, a, b);
}
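
// Usage sketch (hedged): callers normally reach these overloads through the
// public simdpp::cmp_eq wrapper rather than calling detail::insn::i_cmp_eq
// directly. Assuming that wrapper and the make_uint helper, usage looks
// roughly like:
//
//     uint32<4> a = make_uint(1, 2, 3, 4);
//     uint32<4> b = make_uint(1, 0, 3, 0);
//     mask_int32<4> m = cmp_eq(a, b); // lanes 0 and 2 set, lanes 1 and 3 clear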

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif
