/* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_MAX_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_MAX_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/bit_xor.h>
#include <simdpp/core/blend.h>
#include <simdpp/core/cmp_gt.h>
#include <simdpp/detail/null/math.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

static SIMDPP_INL
int8x16 i_max(const int8x16& a, const int8x16& b)
{
#if SIMDPP_USE_NULL
    return detail::null::max(a, b);
#elif SIMDPP_USE_SSE4_1
    return _mm_max_epi8(a.native(), b.native());
#elif SIMDPP_USE_SSE2
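    // SSE2 has no signed 8-bit max (_mm_max_epi8 requires SSE4.1). Flipping
    // the sign bits maps signed ordering onto unsigned ordering: take the
    // unsigned max of the biased values, then undo the bias.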
    int8x16 ca = bit_xor(a, 0x80);
    int8x16 cb = bit_xor(b, 0x80);
    int8x16 r = _mm_max_epu8(ca.native(), cb.native());
    return bit_xor(r, 0x80);
#elif SIMDPP_USE_NEON
    return vmaxq_s8(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_max(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return __msa_max_s_b(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int8x32 i_max(const int8x32& a, const int8x32& b)
{
    return _mm256_max_epi8(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
int8<64> i_max(const int8<64>& a, const int8<64>& b)
{
    return _mm512_max_epi8(a.native(), b.native());
}
#endif

// Wider-than-native vectors: apply i_max piecewise to each native-width
// sub-vector of the operands. The same pattern is used for the other
// element types below.
template<unsigned N> SIMDPP_INL
int8<N> i_max(const int8<N>& a, const int8<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(int8<N>, i_max, a, b);
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint8x16 i_max(const uint8x16& a, const uint8x16& b)
{
#if SIMDPP_USE_NULL
    return detail::null::max(a, b);
#elif SIMDPP_USE_SSE2
    return _mm_max_epu8(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vmaxq_u8(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_max(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return __msa_max_u_b(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8x32 i_max(const uint8x32& a, const uint8x32& b)
{
    return _mm256_max_epu8(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint8<64> i_max(const uint8<64>& a, const uint8<64>& b)
{
    return _mm512_max_epu8(a.native(), b.native());
}
#endif

template<unsigned N> SIMDPP_INL
uint8<N> i_max(const uint8<N>& a, const uint8<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint8<N>, i_max, a, b);
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
int16x8 i_max(const int16x8& a, const int16x8& b)
{
#if SIMDPP_USE_NULL
    return detail::null::max(a, b);
#elif SIMDPP_USE_SSE2
    return _mm_max_epi16(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vmaxq_s16(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_max(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return __msa_max_s_h(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int16x16 i_max(const int16x16& a, const int16x16& b)
{
    return _mm256_max_epi16(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
int16<32> i_max(const int16<32>& a, const int16<32>& b)
{
    return _mm512_max_epi16(a.native(), b.native());
}
#endif

template<unsigned N> SIMDPP_INL
int16<N> i_max(const int16<N>& a, const int16<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(int16<N>, i_max, a, b);
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint16x8 i_max(const uint16x8& a, const uint16x8& b)
{
#if SIMDPP_USE_NULL
    return detail::null::max(a, b);
#elif SIMDPP_USE_SSE4_1
    return _mm_max_epu16(a.native(), b.native());
#elif SIMDPP_USE_SSE2
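    // SSE2 has no unsigned 16-bit max (_mm_max_epu16 requires SSE4.1).
    // Biasing both operands by 0x8000 maps unsigned ordering onto signed
    // ordering: take the signed max, then undo the bias.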
    int16x8 ca = bit_xor(a, 0x8000);
    int16x8 cb = bit_xor(b, 0x8000);
    int16x8 r = _mm_max_epi16(ca.native(), cb.native());
    return bit_xor(r, 0x8000);
#elif SIMDPP_USE_NEON
    return vmaxq_u16(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_max(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return __msa_max_u_h(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16x16 i_max(const uint16x16& a, const uint16x16& b)
{
    return _mm256_max_epu16(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint16<32> i_max(const uint16<32>& a, const uint16<32>& b)
{
    return _mm512_max_epu16(a.native(), b.native());
}
#endif

template<unsigned N> SIMDPP_INL
uint16<N> i_max(const uint16<N>& a, const uint16<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint16<N>, i_max, a, b);
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
int32x4 i_max(const int32x4& a, const int32x4& b)
{
#if SIMDPP_USE_NULL
    return detail::null::max(a, b);
#elif SIMDPP_USE_SSE4_1
    return _mm_max_epi32(a.native(), b.native());
#elif SIMDPP_USE_SSE2
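    // SSE2 has no 32-bit max (_mm_max_epi32 requires SSE4.1); select the
    // larger elements with a compare-and-blend.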
    mask_int32x4 mask = cmp_gt(a, b);
    return blend(a, b, mask);
#elif SIMDPP_USE_NEON
    return vmaxq_s32(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_max(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return __msa_max_s_w(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32x8 i_max(const int32x8& a, const int32x8& b)
{
    return _mm256_max_epi32(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
int32<16> i_max(const int32<16>& a, const int32<16>& b)
{
    return _mm512_max_epi32(a.native(), b.native());
}
#endif

template<unsigned N> SIMDPP_INL
int32<N> i_max(const int32<N>& a, const int32<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(int32<N>, i_max, a, b);
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32x4 i_max(const uint32x4& a, const uint32x4& b)
{
#if SIMDPP_USE_NULL
    return detail::null::max(a, b);
#elif SIMDPP_USE_SSE4_1
    return _mm_max_epu32(a.native(), b.native());
#elif SIMDPP_USE_SSE2
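    // SSE2 has no unsigned 32-bit max (_mm_max_epu32 requires SSE4.1).
    // cmp_gt on unsigned vectors performs an unsigned comparison, so a
    // compare-and-blend selects the larger elements here as well.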
    mask_int32x4 mask = cmp_gt(a, b);
    return blend(a, b, mask);
#elif SIMDPP_USE_NEON
    return vmaxq_u32(a.native(), b.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_max(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return __msa_max_u_w(a.native(), b.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32x8 i_max(const uint32x8& a, const uint32x8& b)
{
    return _mm256_max_epu32(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32<16> i_max(const uint32<16>& a, const uint32<16>& b)
{
    return _mm512_max_epu32(a.native(), b.native());
}
#endif

template<unsigned N> SIMDPP_INL
uint32<N> i_max(const uint32<N>& a, const uint32<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint32<N>, i_max, a, b);
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
int64x2 i_max(const int64x2& a, const int64x2& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm_max_epi64(a.native(), b.native());
#elif SIMDPP_USE_AVX2 || SIMDPP_USE_NEON64
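    // No single-instruction 64-bit max below AVX512VL; these targets have an
    // efficient 64-bit compare, so select the larger elements with a
    // compare-and-blend.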
    mask_int64x2 mask = cmp_gt(a, b);
    return blend(a, b, mask);
#elif SIMDPP_USE_VSX_207
    return vec_max(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return __msa_max_s_d(a.native(), b.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return detail::null::max(a, b);
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int64x4 i_max(const int64x4& a, const int64x4& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_max_epi64(a.native(), b.native());
#else
    mask_int64x4 mask = cmp_gt(a, b);
    return blend(a, b, mask);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
int64<8> i_max(const int64<8>& a, const int64<8>& b)
{
    return _mm512_max_epi64(a.native(), b.native());
}
#endif

template<unsigned N> SIMDPP_INL
int64<N> i_max(const int64<N>& a, const int64<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(int64<N>, i_max, a, b);
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint64x2 i_max(const uint64x2& a, const uint64x2& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm_max_epu64(a.native(), b.native());
#elif SIMDPP_USE_AVX2 || SIMDPP_USE_NEON64
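    // As in the signed case: no single-instruction 64-bit max below
    // AVX512VL, so use compare-and-blend on targets with a 64-bit compare.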
    mask_int64x2 mask = cmp_gt(a, b);
    return blend(a, b, mask);
#elif SIMDPP_USE_VSX_207
    return vec_max(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return __msa_max_u_d(a.native(), b.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return detail::null::max(a, b);
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64x4 i_max(const uint64x4& a, const uint64x4& b)
{
#if SIMDPP_USE_AVX512VL
    return _mm256_max_epu64(a.native(), b.native());
#else
    mask_int64x4 mask = cmp_gt(a, b);
    return blend(a, b, mask);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64<8> i_max(const uint64<8>& a, const uint64<8>& b)
{
    return _mm512_max_epu64(a.native(), b.native());
}
#endif

template<unsigned N> SIMDPP_INL
uint64<N> i_max(const uint64<N>& a, const uint64<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint64<N>, i_max, a, b);
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif