/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_CMP_GT_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_CMP_GT_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/make_shuffle_bytes_mask.h>
#include <simdpp/core/make_uint.h>
#include <simdpp/detail/not_implemented.h>
#include <simdpp/core/bit_xor.h>
#include <simdpp/detail/null/compare.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

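// Each i_cmp_gt overload below selects, at compile time, the native comparison
// for the enabled instruction set and returns the matching mask vector type
// (a mask register on AVX-512 targets, a full-width element mask elsewhere).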
static SIMDPP_INL
mask_int8x16 i_cmp_gt(const int8x16& a, const int8x16& b)
{
#if SIMDPP_USE_NULL
    return detail::null::cmp_gt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmpgt_epi8_mask(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    return _mm_cmpgt_epi8(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcgtq_s8(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpgt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_clt_s_b(b.native(), a.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int8x32 i_cmp_gt(const int8x32& a, const int8x32& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpgt_epi8_mask(a.native(), b.native());
#else
    return _mm256_cmpgt_epi8(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL mask_int8<64> i_cmp_gt(const int8<64>& a, const int8<64>& b)
{
    return _mm512_cmpgt_epi8_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

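// Plain SSE2/AVX2 have no unsigned integer compare instructions. In the
// unsigned overloads below the sign bit of both operands is flipped
// (XOR with 0x80, 0x8000, ...), which maps unsigned ordering onto signed
// ordering, and the signed compare is then used; the existing "// sub"
// comments refer to this bias adjustment.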
static SIMDPP_INL
mask_int8x16 i_cmp_gt(const uint8x16& ca, const uint8x16& cb)
{
    uint8<16> a = ca, b = cb;
#if SIMDPP_USE_NULL
    return detail::null::cmp_gt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmpgt_epu8_mask(a.native(), b.native());
#elif SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comgt_epu8(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    a = bit_xor(a, 0x80); // sub
    b = bit_xor(b, 0x80); // sub
    return _mm_cmpgt_epi8(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcgtq_u8(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpgt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_clt_u_b(b.native(), a.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int8x32 i_cmp_gt(const uint8x32& ca, const uint8x32& cb)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpgt_epu8_mask(ca.native(), cb.native());
#else
    uint8<32> a = ca, b = cb;
    a = bit_xor(a, 0x80); // sub
    b = bit_xor(b, 0x80); // sub
    return _mm256_cmpgt_epi8(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL mask_int8<64> i_cmp_gt(const uint8<64>& a, const uint8<64>& b)
{
    return _mm512_cmpgt_epu8_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int16x8 i_cmp_gt(const int16x8& a, const int16x8& b)
{
#if SIMDPP_USE_NULL
    return detail::null::cmp_gt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmpgt_epi16_mask(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    return _mm_cmpgt_epi16(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcgtq_s16(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpgt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_clt_s_h(b.native(), a.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int16x16 i_cmp_gt(const int16x16& a, const int16x16& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpgt_epi16_mask(a.native(), b.native());
#else
    return _mm256_cmpgt_epi16(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL mask_int16<32> i_cmp_gt(const int16<32>& a, const int16<32>& b)
{
    return _mm512_cmpgt_epi16_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int16x8 i_cmp_gt(const uint16x8& ca, const uint16x8& cb)
{
    uint16<8> a = ca, b = cb;
#if SIMDPP_USE_NULL
    return detail::null::cmp_gt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmpgt_epu16_mask(a.native(), b.native());
#elif SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comgt_epu16(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    a = bit_xor(a, 0x8000); // sub
    b = bit_xor(b, 0x8000); // sub
    return _mm_cmpgt_epi16(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcgtq_u16(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpgt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_clt_u_h(b.native(), a.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int16x16 i_cmp_gt(const uint16x16& ca, const uint16x16& cb)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpgt_epu16_mask(ca.native(), cb.native());
#else
    uint16<16> a = ca, b = cb;
    a = bit_xor(a, 0x8000); // sub
    b = bit_xor(b, 0x8000); // sub
    return _mm256_cmpgt_epi16(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL mask_int16<32> i_cmp_gt(const uint16<32>& a, const uint16<32>& b)
{
    return _mm512_cmpgt_epu16_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int32x4 i_cmp_gt(const int32x4& a, const int32x4& b)
{
#if SIMDPP_USE_NULL
    return detail::null::cmp_gt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmpgt_epi32_mask(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    return _mm_cmpgt_epi32(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcgtq_s32(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpgt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_clt_s_w(b.native(), a.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int32x8 i_cmp_gt(const int32x8& a, const int32x8& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpgt_epi32_mask(a.native(), b.native());
#else
    return _mm256_cmpgt_epi32(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_int32<16> i_cmp_gt(const int32<16>& a, const int32<16>& b)
{
    return _mm512_cmpgt_epi32_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int32x4 i_cmp_gt(const uint32x4& ca, const uint32x4& cb)
{
    uint32<4> a = ca, b = cb;
#if SIMDPP_USE_NULL
    return detail::null::cmp_gt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmpgt_epu32_mask(a.native(), b.native());
#elif SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comgt_epu32(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    a = bit_xor(a, 0x80000000); // sub
    b = bit_xor(b, 0x80000000); // sub
    return _mm_cmpgt_epi32(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcgtq_u32(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpgt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_clt_u_w(b.native(), a.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int32x8 i_cmp_gt(const uint32x8& ca, const uint32x8& cb)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpgt_epu32_mask(ca.native(), cb.native());
#else
    uint32<8> a = ca, b = cb;
    a = bit_xor(a, 0x80000000); // sub
    b = bit_xor(b, 0x80000000); // sub
    return _mm256_cmpgt_epi32(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_int32<16> i_cmp_gt(const uint32<16>& a, const uint32<16>& b)
{
    // FIXME: BUG: GCC does not have _mm512_cmpgt_epu32_mask
    return _mm512_cmp_epu32_mask(a.native(), b.native(), _MM_CMPINT_NLE);
}
#endif

// -----------------------------------------------------------------------------

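// 64-bit integer greater-than has a native instruction only on the newer
// targets handled below (AVX-512VL, XOP, AVX2, AArch64 NEON, VSX 2.07, MSA).
// On NULL and plain ALTIVEC builds the scalar null implementation is used;
// remaining targets report the operation as not implemented.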
static SIMDPP_INL
mask_int64x2 i_cmp_gt(const int64x2& a, const int64x2& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm_cmpgt_epi64_mask(a.native(), b.native());
#elif SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comgt_epi64(a.native(), b.native());
#elif SIMDPP_USE_AVX2
    return _mm_cmpgt_epi64(a.native(), b.native());
#elif SIMDPP_USE_NEON64
    return vcgtq_s64(a.native(), b.native());
#elif SIMDPP_USE_VSX_207
    return (__vector uint64_t) vec_cmpgt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_clt_s_d(b.native(), a.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return detail::null::cmp_gt(a, b);
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int64x4 i_cmp_gt(const int64x4& a, const int64x4& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpgt_epi64_mask(a.native(), b.native());
#else
    return _mm256_cmpgt_epi64(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_int64<8> i_cmp_gt(const int64<8>& a, const int64<8>& b)
{
    // GCC does not have _mm512_cmpgt_epi64_mask
    return _mm512_cmp_epi64_mask(a.native(), b.native(), _MM_CMPINT_NLE);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int64x2 i_cmp_gt(const uint64x2& a, const uint64x2& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm_cmpgt_epu64_mask(a.native(), b.native());
#elif SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comgt_epu64(a.native(), b.native());
#elif SIMDPP_USE_AVX2
    uint64<2> ca = bit_xor(a, 0x8000000000000000); // sub
    uint64<2> cb = bit_xor(b, 0x8000000000000000); // sub
    return _mm_cmpgt_epi64(ca.native(), cb.native());
#elif SIMDPP_USE_NEON64
    return vcgtq_u64(a.native(), b.native());
#elif SIMDPP_USE_VSX_207
    return (__vector uint64_t) vec_cmpgt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_clt_u_d(b.native(), a.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return detail::null::cmp_gt(a, b);
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int64x4 i_cmp_gt(const uint64x4& ca, const uint64x4& cb)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpgt_epu64_mask(ca.native(), cb.native());
#else
    uint64<4> a = ca, b = cb;
    a = bit_xor(a, 0x8000000000000000); // sub
    b = bit_xor(b, 0x8000000000000000); // sub
    return _mm256_cmpgt_epi64(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_int64<8> i_cmp_gt(const uint64<8>& a, const uint64<8>& b)
{
    return _mm512_cmp_epu64_mask(a.native(), b.native(), _MM_CMPINT_NLE);
}
#endif

// -----------------------------------------------------------------------------

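// The floating-point overloads perform an ordered comparison: if either
// operand is NaN the result is false on every path. The AVX and AVX-512
// paths spell this out as _CMP_GT_OQ (greater-than, ordered, quiet), which
// also does not raise an invalid-operation exception for quiet NaNs.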
static SIMDPP_INL
mask_float32x4 i_cmp_gt(const float32x4& a, const float32x4& b)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    return detail::null::cmp_gt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmp_ps_mask(a.native(), b.native(), _CMP_GT_OQ);
#elif SIMDPP_USE_AVX
    return _mm_cmp_ps(a.native(), b.native(), _CMP_GT_OQ);
#elif SIMDPP_USE_SSE2
    return _mm_cmpgt_ps(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vreinterpretq_f32_u32(vcgtq_f32(a.native(), b.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpgt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v4f32) __msa_fclt_w(b.native(), a.native());
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
mask_float32x8 i_cmp_gt(const float32x8& a, const float32x8& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmp_ps_mask(a.native(), b.native(), _CMP_GT_OQ);
#else
    return _mm256_cmp_ps(a.native(), b.native(), _CMP_GT_OQ);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_float32<16> i_cmp_gt(const float32<16>& a, const float32<16>& b)
{
    return _mm512_cmp_ps_mask(a.native(), b.native(), _CMP_GT_OQ);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_float64x2 i_cmp_gt(const float64x2& a, const float64x2& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm_cmp_pd_mask(a.native(), b.native(), _CMP_GT_OQ);
#elif SIMDPP_USE_AVX
    return _mm_cmp_pd(a.native(), b.native(), _CMP_GT_OQ);
#elif SIMDPP_USE_SSE2
    return _mm_cmpgt_pd(a.native(), b.native());
#elif SIMDPP_USE_NEON64
    return vreinterpretq_f64_u64(vcgtq_f64(a.native(), b.native()));
#elif SIMDPP_USE_VSX_206
    return (__vector double) vec_cmpgt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v2f64) __msa_fclt_d(b.native(), a.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    return detail::null::cmp_gt(a, b);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
mask_float64x4 i_cmp_gt(const float64x4& a, const float64x4& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmp_pd_mask(a.native(), b.native(), _CMP_GT_OQ);
#else
    return _mm256_cmp_pd(a.native(), b.native(), _CMP_GT_OQ);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_float64<8> i_cmp_gt(const float64<8>& a, const float64<8>& b)
{
    return _mm512_cmp_pd_mask(a.native(), b.native(), _CMP_GT_OQ);
}
#endif

// -----------------------------------------------------------------------------

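// Generic fallback for vectors wider than the native register width:
// SIMDPP_VEC_ARRAY_IMPL2 applies i_cmp_gt to each native-width sub-vector of
// a and b and collects the per-sub-vector results into the returned mask.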
template<class V> SIMDPP_INL
typename V::mask_vector_type i_cmp_gt(const V& a, const V& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(typename V::mask_vector_type, i_cmp_gt, a, b);
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif