/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_CMP_LT_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_CMP_LT_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/bit_xor.h>
#include <simdpp/detail/null/compare.h>
#include <simdpp/detail/not_implemented.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

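// Note on the overloads below: each i_cmp_lt() computes an element-wise
// "a < b" and returns a mask vector. On AVX-512 code paths the native result
// is a k-mask register, while the other ISAs produce a vector of all-ones /
// all-zeros elements; the mask_int* / mask_float* wrapper types hide that
// difference from callers.
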
static SIMDPP_INL
mask_int8x16 i_cmp_lt(const int8x16& a, const int8x16& b)
{
#if SIMDPP_USE_NULL
    return detail::null::cmp_lt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmplt_epi8_mask(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    return _mm_cmplt_epi8(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcltq_s8(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmplt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_clt_s_b(a.native(), b.native());
#endif
}

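// AVX2 has no packed "less-than" integer compare, only "greater-than", so the
// 256-bit overloads evaluate a < b as b > a with the operands swapped. When
// AVX-512VL is available, the dedicated *_cmplt_*_mask intrinsics are used
// instead and the result is a mask register.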
#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int8x32 i_cmp_lt(const int8x32& a, const int8x32& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmplt_epi8_mask(a.native(), b.native());
#else
    return _mm256_cmpgt_epi8(b.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL mask_int8<64> i_cmp_lt(const int8<64>& a, const int8<64>& b)
{
    return _mm512_cmplt_epi8_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

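// SSE2 and AVX2 only provide signed integer compares. The unsigned overloads
// below therefore XOR both operands with the sign bit of the element type
// (0x80, 0x8000, ...), which is equivalent to subtracting the bias and maps
// unsigned order onto signed order. For 8-bit elements, for example,
// 1u < 255u holds, and after the XOR the operands become -127 < 127, which
// still holds.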
static SIMDPP_INL
mask_int8x16 i_cmp_lt(const uint8x16& ca, const uint8x16& cb)
{
    uint8<16> a = ca, b = cb;
#if SIMDPP_USE_NULL
    return detail::null::cmp_lt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmplt_epu8_mask(a.native(), b.native());
#elif SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comlt_epu8(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    a = bit_xor(a, 0x80); // subtract the bias (flip the sign bit) so that
    b = bit_xor(b, 0x80); // unsigned order matches signed order
    return _mm_cmplt_epi8(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcltq_u8(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmplt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_clt_u_b(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int8x32 i_cmp_lt(const uint8x32& ca, const uint8x32& cb)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmplt_epu8_mask(ca.native(), cb.native());
#else
    uint8<32> a = ca, b = cb;
    a = bit_xor(a, 0x80); // subtract the bias (flip the sign bit) so that
    b = bit_xor(b, 0x80); // unsigned order matches signed order
    return _mm256_cmpgt_epi8(b.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL mask_int8<64> i_cmp_lt(const uint8<64>& a, const uint8<64>& b)
{
    return _mm512_cmplt_epu8_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int16x8 i_cmp_lt(const int16x8& a, const int16x8& b)
{
#if SIMDPP_USE_NULL
    return detail::null::cmp_lt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmplt_epi16_mask(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    return _mm_cmplt_epi16(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcltq_s16(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmplt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_clt_s_h(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int16x16 i_cmp_lt(const int16x16& a, const int16x16& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmplt_epi16_mask(a.native(), b.native());
#else
    return _mm256_cmpgt_epi16(b.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL mask_int16<32> i_cmp_lt(const int16<32>& a, const int16<32>& b)
{
    return _mm512_cmplt_epi16_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int16x8 i_cmp_lt(const uint16x8& ca, const uint16x8& cb)
{
    uint16<8> a = ca, b = cb;
#if SIMDPP_USE_NULL
    return detail::null::cmp_lt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmplt_epu16_mask(a.native(), b.native());
#elif SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comlt_epu16(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    uint16x8 bias = make_uint(0x8000);
    a = bit_xor(a, bias); // subtract the bias (flip the sign bit) so that
    b = bit_xor(b, bias); // unsigned order matches signed order
    return _mm_cmplt_epi16(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcltq_u16(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmplt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_clt_u_h(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int16x16 i_cmp_lt(const uint16x16& ca, const uint16x16& cb)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmplt_epu16_mask(ca.native(), cb.native());
#else
    uint16<16> a = ca, b = cb;
    a = bit_xor(a, 0x8000); // subtract the bias (flip the sign bit) so that
    b = bit_xor(b, 0x8000); // unsigned order matches signed order
    return _mm256_cmpgt_epi16(b.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL mask_int16<32> i_cmp_lt(const uint16<32>& a, const uint16<32>& b)
{
    return _mm512_cmplt_epu16_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int32x4 i_cmp_lt(const int32x4& a, const int32x4& b)
{
#if SIMDPP_USE_NULL
    return detail::null::cmp_lt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmplt_epi32_mask(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    return _mm_cmplt_epi32(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcltq_s32(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmplt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_clt_s_w(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int32x8 i_cmp_lt(const int32x8& a, const int32x8& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmplt_epi32_mask(a.native(), b.native());
#else
    return _mm256_cmpgt_epi32(b.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_int32<16> i_cmp_lt(const int32<16>& a, const int32<16>& b)
{
    return _mm512_cmpgt_epi32_mask(b.native(), a.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int32x4 i_cmp_lt(const uint32x4& ca, const uint32x4& cb)
{
    uint32<4> a = ca, b = cb;
#if SIMDPP_USE_NULL
    return detail::null::cmp_lt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmplt_epu32_mask(a.native(), b.native());
#elif SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comlt_epu32(a.native(), b.native());
#elif SIMDPP_USE_SSE2
    a = bit_xor(a, 0x80000000); // subtract the bias (flip the sign bit) so that
    b = bit_xor(b, 0x80000000); // unsigned order matches signed order
    return _mm_cmplt_epi32(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vcltq_u32(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_cmplt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_clt_u_w(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int32x8 i_cmp_lt(const uint32x8& ca, const uint32x8& cb)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmplt_epu32_mask(ca.native(), cb.native());
#else
    uint32<8> a = ca, b = cb;
    a = bit_xor(a, 0x80000000); // subtract the bias (flip the sign bit) so that
    b = bit_xor(b, 0x80000000); // unsigned order matches signed order
    return _mm256_cmpgt_epi32(b.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_int32<16> i_cmp_lt(const uint32<16>& a, const uint32<16>& b)
{
    return _mm512_cmplt_epu32_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

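// 64-bit element compares have fewer native options: the paths below rely on
// XOP, AVX2 (via a swapped cmpgt), AVX-512, AArch64 NEON, POWER VSX 2.07 or
// MSA. Scalar (null) and plain ALTIVEC builds fall back to the element-wise
// null implementation; remaining configurations expand to
// SIMDPP_NOT_IMPLEMENTED2.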
static SIMDPP_INL
mask_int64x2 i_cmp_lt(const int64x2& a, const int64x2& b)
{
#if SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comlt_epi64(a.native(), b.native());
#elif SIMDPP_USE_AVX512VL
    return _mm_cmplt_epi64_mask(a.native(), b.native());
#elif SIMDPP_USE_AVX2
    return _mm_cmpgt_epi64(b.native(), a.native());
#elif SIMDPP_USE_NEON64
    return vcltq_s64(a.native(), b.native());
#elif SIMDPP_USE_VSX_207
    return (__vector uint64_t) vec_cmplt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_clt_s_d(a.native(), b.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return detail::null::cmp_lt(a, b);
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int64x4 i_cmp_lt(const int64x4& a, const int64x4& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmplt_epi64_mask(a.native(), b.native());
#else
    return _mm256_cmpgt_epi64(b.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_int64<8> i_cmp_lt(const int64<8>& a, const int64<8>& b)
{
    return _mm512_cmplt_epi64_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_int64x2 i_cmp_lt(const uint64x2& a, const uint64x2& b)
{
#if SIMDPP_USE_XOP && !SIMDPP_WORKAROUND_XOP_COM
    return _mm_comlt_epu64(a.native(), b.native());
#elif SIMDPP_USE_AVX512VL
    return _mm_cmplt_epu64_mask(a.native(), b.native());
#elif SIMDPP_USE_AVX2
    uint64<2> ca = bit_xor(a, 0x8000000000000000); // subtract the bias (flip the sign bit)
    uint64<2> cb = bit_xor(b, 0x8000000000000000); // so that unsigned order matches signed order
    return _mm_cmpgt_epi64(cb.native(), ca.native());
#elif SIMDPP_USE_NEON64
    return vcltq_u64(a.native(), b.native());
#elif SIMDPP_USE_VSX_207
    return (__vector uint64_t) vec_cmplt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_clt_u_d(a.native(), b.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return detail::null::cmp_lt(a, b);
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
mask_int64x4 i_cmp_lt(const uint64x4& ca, const uint64x4& cb)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmplt_epu64_mask(ca.native(), cb.native());
#else
    uint64<4> a = ca, b = cb;
    a = bit_xor(a, 0x8000000000000000); // subtract the bias (flip the sign bit) so that
    b = bit_xor(b, 0x8000000000000000); // unsigned order matches signed order
    return _mm256_cmpgt_epi64(b.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_int64<8> i_cmp_lt(const uint64<8>& a, const uint64<8>& b)
{
    return _mm512_cmplt_epu64_mask(a.native(), b.native());
}
#endif

// -----------------------------------------------------------------------------

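// Floating-point paths use an ordered "less-than" predicate: both the legacy
// cmplt instructions and the _CMP_LT_OQ predicate yield false whenever either
// operand is NaN. _CMP_LT_OQ is the quiet variant, so quiet NaN operands do
// not raise an invalid-operation exception.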
static SIMDPP_INL
mask_float32x4 i_cmp_lt(const float32x4& a, const float32x4& b)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    return detail::null::cmp_lt(a, b);
#elif SIMDPP_USE_AVX512VL
    return _mm_cmp_ps_mask(a.native(), b.native(), _CMP_LT_OQ);
#elif SIMDPP_USE_AVX
    return _mm_cmp_ps(a.native(), b.native(), _CMP_LT_OQ);
#elif SIMDPP_USE_SSE2
    return _mm_cmplt_ps(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vreinterpretq_f32_u32(vcltq_f32(a.native(), b.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_cmplt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v4f32) __msa_fclt_w(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
mask_float32x8 i_cmp_lt(const float32x8& a, const float32x8& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmp_ps_mask(a.native(), b.native(), _CMP_LT_OQ);
#else
    return _mm256_cmp_ps(a.native(), b.native(), _CMP_LT_OQ);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_float32<16> i_cmp_lt(const float32<16>& a, const float32<16>& b)
{
    return _mm512_cmp_ps_mask(a.native(), b.native(), _CMP_LT_OQ);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
mask_float64x2 i_cmp_lt(const float64x2& a, const float64x2& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm_cmp_pd_mask(a.native(), b.native(), _CMP_LT_OQ);
#elif SIMDPP_USE_AVX
    return _mm_cmp_pd(a.native(), b.native(), _CMP_LT_OQ);
#elif SIMDPP_USE_SSE2
    return _mm_cmplt_pd(a.native(), b.native());
#elif SIMDPP_USE_NEON64
    return vreinterpretq_f64_u64(vcltq_f64(a.native(), b.native()));
#elif SIMDPP_USE_VSX_206
    return (__vector double) vec_cmplt(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v2f64) __msa_fclt_d(a.native(), b.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
    return detail::null::cmp_lt(a, b);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
mask_float64x4 i_cmp_lt(const float64x4& a, const float64x4& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_cmp_pd_mask(a.native(), b.native(), _CMP_LT_OQ);
#else
    return _mm256_cmp_pd(a.native(), b.native(), _CMP_LT_OQ);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
mask_float64<8> i_cmp_lt(const float64<8>& a, const float64<8>& b)
{
    return _mm512_cmp_pd_mask(a.native(), b.native(), _CMP_LT_OQ);
}
#endif

// -----------------------------------------------------------------------------

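// Generic fallback for vector types wider than the native register width:
// SIMDPP_VEC_ARRAY_IMPL2 applies i_cmp_lt() to each native-width sub-vector
// of a and b and gathers the results into the corresponding mask vector type.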
template<class V> SIMDPP_INL
typename V::mask_vector_type i_cmp_lt(const V& a, const V& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(typename V::mask_vector_type, i_cmp_lt, a, b);
}

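// Usage sketch (illustrative only, not part of this header): user code calls
// the public simdpp::cmp_lt() wrapper from simdpp/core/cmp_lt.h, which
// dispatches to the i_cmp_lt() overloads above. Something along these lines:
//
//     simdpp::int32<4> a = simdpp::make_int(1, 2, 3, 4);
//     simdpp::int32<4> b = simdpp::make_int(4, 3, 2, 1);
//     simdpp::mask_int32<4> lt = simdpp::cmp_lt(a, b); // true, true, false, false
//     simdpp::int32<4> mins = simdpp::blend(a, b, lt); // picks a where lt is set
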
} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif
