/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_CMP_LT_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_CMP_LT_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/bit_xor.h>
#include <simdpp/detail/null/compare.h>
#include <simdpp/detail/not_implemented.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

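// Each i_cmp_lt overload below dispatches to the best native implementation
// for the instruction sets enabled at compile time. Unsigned comparisons on
// x86 are emulated where needed by biasing both operands into the signed
// range. These overloads back the public cmp_lt() wrapper; a minimal usage
// sketch (assuming the usual simdpp public API):
//
//   int32<4> a = make_int(1, 5, 3, 7);
//   int32<4> b = make_int(2, 4, 3, 8);
//   mask_int32<4> lt = cmp_lt(a, b); // lanes: true, false, false, true
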
static SIMDPP_INL
mask_int8x16 i_cmp_lt(const int8x16& a, const int8x16& b)
{
#if SIMDPP_USE_NULL
    return detail::null::cmp_lt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmplt_epi8_mask(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    return _mm_cmplt_epi8(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcltq_s8(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmplt(a.native(), b.native());
#elif SIMDPP_USE_MSA
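    // __msa_clt_s_b yields lanes of 0 or -1 as a signed vector; cast to the
    // unsigned element type expected by the mask wrapper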
    return (v16u8) __msa_clt_s_b(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int8x32 i_cmp_lt(const int8x32& a, const int8x32& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmplt_epi8_mask(a.native(), b.native());
#else
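    // AVX2 provides only equal/greater-than integer comparisons, so a < b
    // is computed as b > a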
    return _mm256_cmpgt_epi8(b.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL mask_int8<64> i_cmp_lt(const int8<64>& a, const int8<64>& b)
{
    return _mm512_cmplt_epi8_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int8x16 i_cmp_lt(const uint8x16& ca, const uint8x16& cb)
{
    uint8<16> a = ca, b = cb;
#if SIMDPP_USE_NULL
    return detail::null::cmp_lt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmplt_epu8_mask(a.native(), b.native());
#elif SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
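    // SIMDPP_WORKAROUND_XOP_COM appears to disable the XOP comparison
    // intrinsics on compilers where they are known to be problematic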
    return _mm_comlt_epu8(a.native(), b.native());
#elif SIMDPP_USE_SSE2
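    // SSE2 has no unsigned comparisons. XOR-ing both operands with the sign
    // bit maps unsigned order onto signed order, so a signed compare of the
    // biased values yields the unsigned result.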
    a = bit_xor(a, 0x80);
    b = bit_xor(b, 0x80);
    return _mm_cmplt_epi8(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcltq_u8(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmplt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_clt_u_b(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int8x32 i_cmp_lt(const uint8x32& ca, const uint8x32& cb)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmplt_epu8_mask(ca.native(), cb.native());
#else
    uint8<32> a = ca, b = cb;
    a = bit_xor(a, 0x80); // flip sign bits, as in the 128-bit variant
    b = bit_xor(b, 0x80);
    return _mm256_cmpgt_epi8(b.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL mask_int8<64> i_cmp_lt(const uint8<64>& a, const uint8<64>& b)
{
    return _mm512_cmplt_epu8_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int16x8 i_cmp_lt(const int16x8& a, const int16x8& b)
{
#if SIMDPP_USE_NULL
    return detail::null::cmp_lt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmplt_epi16_mask(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    return _mm_cmplt_epi16(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcltq_s16(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmplt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_clt_s_h(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int16x16 i_cmp_lt(const int16x16& a, const int16x16& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmplt_epi16_mask(a.native(), b.native());
#else
    return _mm256_cmpgt_epi16(b.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL mask_int16<32> i_cmp_lt(const int16<32>& a, const int16<32>& b)
{
    return _mm512_cmplt_epi16_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int16x8 i_cmp_lt(const uint16x8& ca, const uint16x8& cb)
{
    uint16<8> a = ca, b = cb;
#if SIMDPP_USE_NULL
    return detail::null::cmp_lt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmplt_epu16_mask(a.native(), b.native());
#elif SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comlt_epu16(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    a = bit_xor(a, 0x8000); // flip sign bits, as in the uint8 variant
    b = bit_xor(b, 0x8000);
    return _mm_cmplt_epi16(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcltq_u16(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmplt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_clt_u_h(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int16x16 i_cmp_lt(const uint16x16& ca, const uint16x16& cb)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmplt_epu16_mask(ca.native(), cb.native());
#else
    uint16<16> a = ca, b = cb;
    a = bit_xor(a, 0x8000); // flip sign bits
    b = bit_xor(b, 0x8000);
    return _mm256_cmpgt_epi16(b.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL mask_int16<32> i_cmp_lt(const uint16<32>& a, const uint16<32>& b)
{
    return _mm512_cmplt_epu16_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int32x4 i_cmp_lt(const int32x4& a, const int32x4& b)
{
#if SIMDPP_USE_NULL
    return detail::null::cmp_lt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmplt_epi32_mask(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    return _mm_cmplt_epi32(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcltq_s32(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmplt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_clt_s_w(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int32x8 i_cmp_lt(const int32x8& a, const int32x8& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmplt_epi32_mask(a.native(), b.native());
#else
    return _mm256_cmpgt_epi32(b.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_int32<16> i_cmp_lt(const int32<16>& a, const int32<16>& b)
{
    return _mm512_cmplt_epi32_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int32x4 i_cmp_lt(const uint32x4& ca, const uint32x4& cb)
{
    uint32<4> a = ca, b = cb;
#if SIMDPP_USE_NULL
    return detail::null::cmp_lt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmplt_epu32_mask(a.native(), b.native());
#elif SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comlt_epu32(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    a = bit_xor(a, 0x80000000); // flip sign bits
    b = bit_xor(b, 0x80000000);
    return _mm_cmplt_epi32(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcltq_u32(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmplt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_clt_u_w(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int32x8 i_cmp_lt(const uint32x8& ca, const uint32x8& cb)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmplt_epu32_mask(ca.native(), cb.native());
#else
    uint32<8> a = ca, b = cb;
    a = bit_xor(a, 0x80000000); // flip sign bits
    b = bit_xor(b, 0x80000000);
    return _mm256_cmpgt_epi32(b.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_int32<16> i_cmp_lt(const uint32<16>& a, const uint32<16>& b)
{
    return _mm512_cmplt_epu32_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int64x2 i_cmp_lt(const int64x2& a, const int64x2& b)
{
#if SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comlt_epi64(a.native(), b.native());
#elif SIMDPP_USE_AVX512VL
    return _mm_cmplt_epi64_mask(a.native(), b.native());
#elif SIMDPP_USE_AVX2
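    // only the signed greater-than intrinsic exists; swap the arguments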
    return _mm_cmpgt_epi64(b.native(), a.native());
#elif SIMDPP_USE_NEON64
    return vcltq_s64(a.native(), b.native());
#elif SIMDPP_USE_VSX_207
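    // vec_cmplt yields a vector of booleans; cast to the raw unsigned type
    // underlying the mask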
    return (__vector uint64_t) vec_cmplt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_clt_s_d(a.native(), b.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
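    // ALTIVEC gains 64-bit integer compares only with VSX 2.07; fall back
    // to the scalar implementation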
    return detail::null::cmp_lt(a, b);
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int64x4 i_cmp_lt(const int64x4& a, const int64x4& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmplt_epi64_mask(a.native(), b.native());
#else
    return _mm256_cmpgt_epi64(b.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_int64<8> i_cmp_lt(const int64<8>& a, const int64<8>& b)
{
    return _mm512_cmplt_epi64_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int64x2 i_cmp_lt(const uint64x2& a, const uint64x2& b)
{
#if SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comlt_epu64(a.native(), b.native());
#elif SIMDPP_USE_AVX512VL
    return _mm_cmplt_epu64_mask(a.native(), b.native());
#elif SIMDPP_USE_AVX2
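    // no unsigned 64-bit compare; bias both operands into the signed range,
    // then compare with swapped arguments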
    uint64<2> ca = bit_xor(a, 0x8000000000000000); // flip sign bits
    uint64<2> cb = bit_xor(b, 0x8000000000000000);
    return _mm_cmpgt_epi64(cb.native(), ca.native());
#elif SIMDPP_USE_NEON64
    return vcltq_u64(a.native(), b.native());
#elif SIMDPP_USE_VSX_207
    return (__vector uint64_t) vec_cmplt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_clt_u_d(a.native(), b.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return detail::null::cmp_lt(a, b);
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int64x4 i_cmp_lt(const uint64x4& ca, const uint64x4& cb)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmplt_epu64_mask(ca.native(), cb.native());
#else
    uint64<4> a = ca, b = cb;
    a = bit_xor(a, 0x8000000000000000); // flip sign bits
    b = bit_xor(b, 0x8000000000000000);
    return _mm256_cmpgt_epi64(b.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_int64<8> i_cmp_lt(const uint64<8>& a, const uint64<8>& b)
{
    return _mm512_cmplt_epu64_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_float32x4 i_cmp_lt(const float32x4& a, const float32x4& b)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    return detail::null::cmp_lt(a, b);
#elif SIMDPP_USE_AVX512VL
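    // _CMP_LT_OQ: ordered, quiet -- the result is false if either operand
    // is NaN, and quiet NaNs do not raise a floating-point exception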
    return _mm_cmp_ps_mask(a.native(), b.native(), _CMP_LT_OQ);
#elif SIMDPP_USE_AVX
    return _mm_cmp_ps(a.native(), b.native(), _CMP_LT_OQ);
#elif SIMDPP_USE_SSE2
    return _mm_cmplt_ps(a.native(), b.native());
#elif SIMDPP_USE_NEON
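    // vcltq_f32 produces an integer mask (uint32x4_t); reinterpret it as
    // float lanes for the mask wrapper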
    return vreinterpretq_f32_u32(vcltq_f32(a.native(), b.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_cmplt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v4f32) __msa_fclt_w(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
mask_float32x8 i_cmp_lt(const float32x8& a, const float32x8& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmp_ps_mask(a.native(), b.native(), _CMP_LT_OQ);
#else
    return _mm256_cmp_ps(a.native(), b.native(), _CMP_LT_OQ);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_float32<16> i_cmp_lt(const float32<16>& a, const float32<16>& b)
{
    return _mm512_cmp_ps_mask(a.native(), b.native(), _CMP_LT_OQ);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_float64x2 i_cmp_lt(const float64x2& a, const float64x2& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm_cmp_pd_mask(a.native(), b.native(), _CMP_LT_OQ);
#elif SIMDPP_USE_AVX
    return _mm_cmp_pd(a.native(), b.native(), _CMP_LT_OQ);
#elif SIMDPP_USE_SSE2
    return _mm_cmplt_pd(a.native(), b.native());
#elif SIMDPP_USE_NEON64
    return vreinterpretq_f64_u64(vcltq_f64(a.native(), b.native()));
#elif SIMDPP_USE_VSX_206
    return (__vector double) vec_cmplt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v2f64) __msa_fclt_d(a.native(), b.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
    return detail::null::cmp_lt(a, b);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
mask_float64x4 i_cmp_lt(const float64x4& a, const float64x4& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmp_pd_mask(a.native(), b.native(), _CMP_LT_OQ);
#else
    return _mm256_cmp_pd(a.native(), b.native(), _CMP_LT_OQ);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_float64<8> i_cmp_lt(const float64<8>& a, const float64<8>& b)
{
    return _mm512_cmp_pd_mask(a.native(), b.native(), _CMP_LT_OQ);
}
#endif

// -----------------------------------------------------------------------------

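// Generic fallback for vectors wider than the native width: apply i_cmp_lt
// to each native-sized sub-vector.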
template<class V> SIMDPP_INL
typename V::mask_vector_type i_cmp_lt(const V& a, const V& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(typename V::mask_vector_type, i_cmp_lt, a, b);
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif