/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_CMP_GT_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_CMP_GT_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/make_shuffle_bytes_mask.h>
#include <simdpp/core/make_uint.h>
#include <simdpp/detail/not_implemented.h>
#include <simdpp/core/bit_xor.h>
#include <simdpp/detail/null/compare.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

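// Per-instruction-set implementations of the element-wise "greater than"
// comparison (cmp_gt). Each overload returns a mask vector: on AVX-512
// targets this wraps a __mmask register, on other targets it is a
// full-width vector with all bits set in the elements where the comparison
// holds.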

static SIMDPP_INL
mask_int8x16 i_cmp_gt(const int8x16& a, const int8x16& b)
{
#if SIMDPP_USE_NULL
    return detail::null::cmp_gt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmpgt_epi8_mask(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    return _mm_cmpgt_epi8(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcgtq_s8(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpgt(a.native(), b.native());
#elif SIMDPP_USE_MSA
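    // MSA only provides "less than" comparisons, so the operands are
    // swapped: a > b is computed as b < a.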
    return (v16u8) __msa_clt_s_b(b.native(), a.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int8x32 i_cmp_gt(const int8x32& a, const int8x32& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpgt_epi8_mask(a.native(), b.native());
#else
    return _mm256_cmpgt_epi8(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL mask_int8<64> i_cmp_gt(const int8<64>& a, const int8<64>& b)
{
    return _mm512_cmpgt_epi8_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

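// SSE2 and AVX2 have no unsigned integer "greater than" instructions. The
// unsigned overloads below XOR both operands with the sign bit (0x80,
// 0x8000, ...), which maps unsigned order onto signed order, and then use
// the signed comparison.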
static SIMDPP_INL
mask_int8x16 i_cmp_gt(const uint8x16& ca, const uint8x16& cb)
{
    uint8<16> a = ca, b = cb;
#if SIMDPP_USE_NULL
    return detail::null::cmp_gt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmpgt_epu8_mask(a.native(), b.native());
#elif SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comgt_epu8(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    a = bit_xor(a, 0x80); // flip sign bit
    b = bit_xor(b, 0x80); // flip sign bit
    return _mm_cmpgt_epi8(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcgtq_u8(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpgt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_clt_u_b(b.native(), a.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int8x32 i_cmp_gt(const uint8x32& ca, const uint8x32& cb)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpgt_epu8_mask(ca.native(), cb.native());
#else
    uint8<32> a = ca, b = cb;
    a = bit_xor(a, 0x80); // flip sign bit
    b = bit_xor(b, 0x80); // flip sign bit
    return _mm256_cmpgt_epi8(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL mask_int8<64> i_cmp_gt(const uint8<64>& a, const uint8<64>& b)
{
    return _mm512_cmpgt_epu8_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int16x8 i_cmp_gt(const int16x8& a, const int16x8& b)
{
#if SIMDPP_USE_NULL
    return detail::null::cmp_gt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmpgt_epi16_mask(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    return _mm_cmpgt_epi16(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcgtq_s16(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpgt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_clt_s_h(b.native(), a.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int16x16 i_cmp_gt(const int16x16& a, const int16x16& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpgt_epi16_mask(a.native(), b.native());
#else
    return _mm256_cmpgt_epi16(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL mask_int16<32> i_cmp_gt(const int16<32>& a, const int16<32>& b)
{
    return _mm512_cmpgt_epi16_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int16x8 i_cmp_gt(const uint16x8& ca, const uint16x8& cb)
{
    uint16<8> a = ca, b = cb;
#if SIMDPP_USE_NULL
    return detail::null::cmp_gt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmpgt_epu16_mask(a.native(), b.native());
#elif SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comgt_epu16(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    a = bit_xor(a, 0x8000); // flip sign bit
    b = bit_xor(b, 0x8000); // flip sign bit
    return _mm_cmpgt_epi16(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcgtq_u16(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpgt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_clt_u_h(b.native(), a.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int16x16 i_cmp_gt(const uint16x16& ca, const uint16x16& cb)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpgt_epu16_mask(ca.native(), cb.native());
#else
    uint16<16> a = ca, b = cb;
    a = bit_xor(a, 0x8000); // flip sign bit
    b = bit_xor(b, 0x8000); // flip sign bit
    return _mm256_cmpgt_epi16(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL mask_int16<32> i_cmp_gt(const uint16<32>& a, const uint16<32>& b)
{
    return _mm512_cmpgt_epu16_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int32x4 i_cmp_gt(const int32x4& a, const int32x4& b)
{
#if SIMDPP_USE_NULL
    return detail::null::cmp_gt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmpgt_epi32_mask(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    return _mm_cmpgt_epi32(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcgtq_s32(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpgt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_clt_s_w(b.native(), a.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int32x8 i_cmp_gt(const int32x8& a, const int32x8& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpgt_epi32_mask(a.native(), b.native());
#else
    return _mm256_cmpgt_epi32(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_int32<16> i_cmp_gt(const int32<16>& a, const int32<16>& b)
{
    return _mm512_cmpgt_epi32_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int32x4 i_cmp_gt(const uint32x4& ca, const uint32x4& cb)
{
    uint32<4> a = ca, b = cb;
#if SIMDPP_USE_NULL
    return detail::null::cmp_gt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmpgt_epu32_mask(a.native(), b.native());
#elif SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comgt_epu32(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    a = bit_xor(a, 0x80000000); // flip sign bit
    b = bit_xor(b, 0x80000000); // flip sign bit
    return _mm_cmpgt_epi32(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcgtq_u32(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpgt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_clt_u_w(b.native(), a.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int32x8 i_cmp_gt(const uint32x8& ca, const uint32x8& cb)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpgt_epu32_mask(ca.native(), cb.native());
#else
    uint32<8> a = ca, b = cb;
    a = bit_xor(a, 0x80000000); // flip sign bit
    b = bit_xor(b, 0x80000000); // flip sign bit
    return _mm256_cmpgt_epi32(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_int32<16> i_cmp_gt(const uint32<16>& a, const uint32<16>& b)
{
    // FIXME: GCC does not provide _mm512_cmpgt_epu32_mask; use the generic
    // compare with _MM_CMPINT_NLE (not less-or-equal), which is equivalent
    // to greater-than.
    return _mm512_cmp_epu32_mask(a.native(), b.native(), _MM_CMPINT_NLE);
}
#endif

// -----------------------------------------------------------------------------

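// 64-bit integer comparisons: the native path on x86 requires XOP or AVX2;
// other SSE levels and 32-bit NEON report the operation as not implemented,
// while pre-VSX Altivec falls back to the scalar null implementation.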
static SIMDPP_INL
mask_int64x2 i_cmp_gt(const int64x2& a, const int64x2& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm_cmpgt_epi64_mask(a.native(), b.native());
#elif SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comgt_epi64(a.native(), b.native());
#elif SIMDPP_USE_AVX2
    return _mm_cmpgt_epi64(a.native(), b.native());
#elif SIMDPP_USE_NEON64
    return vcgtq_s64(a.native(), b.native());
#elif SIMDPP_USE_VSX_207
    return (__vector uint64_t) vec_cmpgt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_clt_s_d(b.native(), a.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return detail::null::cmp_gt(a, b);
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int64x4 i_cmp_gt(const int64x4& a, const int64x4& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpgt_epi64_mask(a.native(), b.native());
#else
    return _mm256_cmpgt_epi64(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_int64<8> i_cmp_gt(const int64<8>& a, const int64<8>& b)
{
    // GCC does not provide _mm512_cmpgt_epi64_mask; _MM_CMPINT_NLE
    // (not less-or-equal) is equivalent to greater-than.
    return _mm512_cmp_epi64_mask(a.native(), b.native(), _MM_CMPINT_NLE);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int64x2 i_cmp_gt(const uint64x2& a, const uint64x2& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm_cmpgt_epu64_mask(a.native(), b.native());
#elif SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comgt_epu64(a.native(), b.native());
#elif SIMDPP_USE_AVX2
    uint64<2> ca = bit_xor(a, 0x8000000000000000); // flip sign bit
    uint64<2> cb = bit_xor(b, 0x8000000000000000); // flip sign bit
    return _mm_cmpgt_epi64(ca.native(), cb.native());
#elif SIMDPP_USE_NEON64
    return vcgtq_u64(a.native(), b.native());
#elif SIMDPP_USE_VSX_207
    return (__vector uint64_t) vec_cmpgt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_clt_u_d(b.native(), a.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return detail::null::cmp_gt(a, b);
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int64x4 i_cmp_gt(const uint64x4& ca, const uint64x4& cb)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmpgt_epu64_mask(ca.native(), cb.native());
#else
    uint64<4> a = ca, b = cb;
    a = bit_xor(a, 0x8000000000000000); // flip sign bit
    b = bit_xor(b, 0x8000000000000000); // flip sign bit
    return _mm256_cmpgt_epi64(a.native(), b.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_int64<8> i_cmp_gt(const uint64<8>& a, const uint64<8>& b)
{
    return _mm512_cmp_epu64_mask(a.native(), b.native(), _MM_CMPINT_NLE);
}
#endif

// -----------------------------------------------------------------------------

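// Floating-point comparisons on AVX and newer use the _CMP_GT_OQ predicate:
// ordered, non-signaling "greater than", so any comparison where either
// operand is NaN yields false.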
static SIMDPP_INL
mask_float32x4 i_cmp_gt(const float32x4& a, const float32x4& b)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    return detail::null::cmp_gt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmp_ps_mask(a.native(), b.native(), _CMP_GT_OQ);
#elif SIMDPP_USE_AVX
    return _mm_cmp_ps(a.native(), b.native(), _CMP_GT_OQ);
#elif SIMDPP_USE_SSE2
    return _mm_cmpgt_ps(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vreinterpretq_f32_u32(vcgtq_f32(a.native(), b.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_cmpgt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v4f32) __msa_fclt_w(b.native(), a.native());
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
mask_float32x8 i_cmp_gt(const float32x8& a, const float32x8& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmp_ps_mask(a.native(), b.native(), _CMP_GT_OQ);
#else
    return _mm256_cmp_ps(a.native(), b.native(), _CMP_GT_OQ);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_float32<16> i_cmp_gt(const float32<16>& a, const float32<16>& b)
{
    return _mm512_cmp_ps_mask(a.native(), b.native(), _CMP_GT_OQ);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_float64x2 i_cmp_gt(const float64x2& a, const float64x2& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm_cmp_pd_mask(a.native(), b.native(), _CMP_GT_OQ);
#elif SIMDPP_USE_AVX
    return _mm_cmp_pd(a.native(), b.native(), _CMP_GT_OQ);
#elif SIMDPP_USE_SSE2
    return _mm_cmpgt_pd(a.native(), b.native());
#elif SIMDPP_USE_NEON64
    return vreinterpretq_f64_u64(vcgtq_f64(a.native(), b.native()));
#elif SIMDPP_USE_VSX_206
    return (__vector double) vec_cmpgt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v2f64) __msa_fclt_d(b.native(), a.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    return detail::null::cmp_gt(a, b);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
mask_float64x4 i_cmp_gt(const float64x4& a, const float64x4& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmp_pd_mask(a.native(), b.native(), _CMP_GT_OQ);
#else
    return _mm256_cmp_pd(a.native(), b.native(), _CMP_GT_OQ);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_float64<8> i_cmp_gt(const float64<8>& a, const float64<8>& b)
{
    return _mm512_cmp_pd_mask(a.native(), b.native(), _CMP_GT_OQ);
}
#endif

// -----------------------------------------------------------------------------

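// Generic fallback for vectors wider than the largest natively supported
// width: the comparison is applied to each native-sized sub-vector in turn.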
template<class V> SIMDPP_INL
typename V::mask_vector_type i_cmp_gt(const V& a, const V& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(typename V::mask_vector_type, i_cmp_gt, a, b);
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif