1/* Copyright (C) 2016 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_MAX_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_MAX_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/core/i_max.h>
17#include <simdpp/core/extract.h>
18#include <simdpp/core/move_l.h>
19#include <simdpp/core/make_uint.h>
20#include <simdpp/detail/mem_block.h>
21#include <simdpp/detail/extract128.h>
22#include <limits>
23
24namespace simdpp {
25namespace SIMDPP_ARCH_NAMESPACE {
26namespace detail {
27namespace insn {
28
29static SIMDPP_INL
30uint8_t i_reduce_max(const uint8x16& a)
31{
32#if SIMDPP_USE_NULL
33 uint8_t r = a.el(0);
34 for (unsigned i = 0; i < a.length; i++) {
35 r = r > a.el(i) ? r : a.el(i);
36 }
37 return r;
38#elif SIMDPP_USE_NEON64
39 return vmaxvq_u8(a.native());
40#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
41 uint8x16 r = max(a, move16_l<8>(a));
42 r = max(r, move16_l<4>(r));
43 r = max(r, move16_l<2>(r));
44 r = max(r, move16_l<1>(r));
45 return extract<0>(r);
46#endif
47}
48
#if SIMDPP_USE_AVX2
// 256-bit variant: merge the two 128-bit halves lane-wise, then reduce the
// resulting 16-lane vector with the uint8x16 overload.
static SIMDPP_INL
uint8_t i_reduce_max(const uint8<32>& a)
{
    uint8x16 merged = detail::extract128<0>(a);
    merged = max(merged, detail::extract128<1>(a));
    const uint8_t result = i_reduce_max(merged);
    return result;
}
#endif
58
#if SIMDPP_USE_AVX512BW
// 512-bit variant: merge the two 256-bit halves lane-wise, then hand the
// 32-lane vector to the uint8<32> overload.
SIMDPP_INL uint8_t i_reduce_max(const uint8<64>& a)
{
    uint8<32> merged = detail::extract256<0>(a);
    merged = max(merged, detail::extract256<1>(a));
    const uint8_t result = i_reduce_max(merged);
    return result;
}
#endif
67
68template<unsigned N>
69SIMDPP_INL uint8_t i_reduce_max(const uint8<N>& a)
70{
71#if SIMDPP_USE_NULL
72 uint8_t r = std::numeric_limits<uint8_t>::min();
73 for (unsigned j = 0; j < a.vec_length; ++j) {
74 for (unsigned i = 0; i < a.base_length; i++) {
75 r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
76 }
77 }
78 return r;
79#else
80 uint8v r = a.vec(0);
81 for (unsigned j = 1; j < a.vec_length; ++j) {
82 r = max(r, a.vec(j));
83 }
84 return i_reduce_max(r);
85#endif
86}
87
88// -----------------------------------------------------------------------------
89
90static SIMDPP_INL
91int8_t i_reduce_max(const int8x16& a)
92{
93#if SIMDPP_USE_NULL
94 int8_t r = a.el(0);
95 for (unsigned i = 0; i < a.length; i++) {
96 r = r > a.el(i) ? r : a.el(i);
97 }
98 return r;
99#elif SIMDPP_USE_NEON64
100 return vmaxvq_s8(a.native());
101#elif SIMDPP_USE_SSE4_1 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
102 int8x16 r = a;
103 r = max(r, move16_l<8>(r));
104 r = max(r, move16_l<4>(r));
105 r = max(r, move16_l<2>(r));
106 r = max(r, move16_l<1>(r));
107 return extract<0>(r);
108#elif SIMDPP_USE_SSE2
109 // no instruction for int8 max available, only for uint8
110 uint8x16 ca = bit_xor(a, 0x80);
111 return i_reduce_max(ca) ^ 0x80;
112#endif
113}
114
#if SIMDPP_USE_AVX2
// 256-bit variant: merge the two 128-bit halves lane-wise, then reduce the
// resulting 16-lane vector with the int8x16 overload.
static SIMDPP_INL
int8_t i_reduce_max(const int8<32>& a)
{
    int8x16 merged = detail::extract128<0>(a);
    merged = max(merged, detail::extract128<1>(a));
    const int8_t result = i_reduce_max(merged);
    return result;
}
#endif
124
#if SIMDPP_USE_AVX512BW
// 512-bit variant: merge the two 256-bit halves lane-wise, then hand the
// 32-lane vector to the int8<32> overload.
SIMDPP_INL int8_t i_reduce_max(const int8<64>& a)
{
    int8<32> merged = detail::extract256<0>(a);
    merged = max(merged, detail::extract256<1>(a));
    const int8_t result = i_reduce_max(merged);
    return result;
}
#endif
133
134template<unsigned N>
135SIMDPP_INL int8_t i_reduce_max(const int8<N>& a)
136{
137#if SIMDPP_USE_NULL
138 int8_t r = std::numeric_limits<int8_t>::min();;
139 for (unsigned j = 0; j < a.vec_length; ++j) {
140 for (unsigned i = 0; i < a.base_length; i++) {
141 r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
142 }
143 }
144 return r;
145#elif SIMDPP_USE_SSE2 && !SIMDPP_USE_SSE4_1
146 // no instruction for int8 max available, only for uint8
147 uint8x16 r = bit_xor(a.vec(0), 0x80);
148 for (unsigned j = 1; j < a.vec_length; ++j) {
149 uint8x16 ca = bit_xor(a.vec(j), 0x80);
150 r = max(r, ca);
151 }
152 return i_reduce_max(r) ^ 0x80;
153#else
154 int8v r = a.vec(0);
155 for (unsigned j = 1; j < a.vec_length; ++j) {
156 r = max(r, a.vec(j));
157 }
158 return i_reduce_max(r);
159#endif
160}
161
162// -----------------------------------------------------------------------------
163static SIMDPP_INL
164int16_t i_reduce_max(const int16x8& a);
165
166static SIMDPP_INL
167uint16_t i_reduce_max(const uint16x8& a)
168{
169#if SIMDPP_USE_NULL
170 uint16_t r = a.el(0);
171 for (unsigned i = 0; i < a.length; i++) {
172 r = r > a.el(i) ? r : a.el(i);
173 }
174 return r;
175#elif SIMDPP_USE_NEON64
176 return vmaxvq_u16(a.native());
177#elif SIMDPP_USE_SSE4_1 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
178 uint16x8 r = max(a, move8_l<4>(a));
179 r = max(r, move8_l<2>(r));
180 r = max(r, move8_l<1>(r));
181 return extract<0>(r);
182#elif SIMDPP_USE_SSE2
183 // no instruction for uint16 max available, only for int16
184 int16x8 ca = bit_xor(a, 0x8000);
185 return i_reduce_max(ca) ^ 0x8000;
186#endif
187}
188
#if SIMDPP_USE_AVX2
// 256-bit variant: merge the two 128-bit halves lane-wise, then reduce the
// resulting 8-lane vector with the uint16x8 overload.
static SIMDPP_INL
uint16_t i_reduce_max(const uint16x16& a)
{
    uint16x8 merged = detail::extract128<0>(a);
    merged = max(merged, detail::extract128<1>(a));
    const uint16_t result = i_reduce_max(merged);
    return result;
}
#endif
198
#if SIMDPP_USE_AVX512BW
// 512-bit variant: merge the two 256-bit halves lane-wise, then hand the
// 16-lane vector to the uint16<16> overload.
SIMDPP_INL uint16_t i_reduce_max(const uint16<32>& a)
{
    uint16<16> merged = detail::extract256<0>(a);
    merged = max(merged, detail::extract256<1>(a));
    const uint16_t result = i_reduce_max(merged);
    return result;
}
#endif
207
208template<unsigned N>
209SIMDPP_INL uint16_t i_reduce_max(const uint16<N>& a)
210{
211#if SIMDPP_USE_NULL
212 uint16_t r = std::numeric_limits<uint16_t>::min();;
213 for (unsigned j = 0; j < a.vec_length; ++j) {
214 for (unsigned i = 0; i < a.base_length; i++) {
215 r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
216 }
217 }
218 return r;
219#elif SIMDPP_USE_SSE2 && !SIMDPP_USE_SSE4_1
220 // no instruction for uint16 max available, only for int16
221 int16x8 r = bit_xor(a.vec(0), 0x8000);
222 for (unsigned j = 1; j < a.vec_length; ++j) {
223 int16x8 ca = bit_xor(a.vec(j), 0x8000);
224 r = max(r, ca);
225 }
226 return i_reduce_max(r) ^ 0x8000;
227#else
228 uint16v r = a.vec(0);
229 for (unsigned j = 1; j < a.vec_length; ++j) {
230 r = max(r, a.vec(j));
231 }
232 return i_reduce_max(r);
233#endif
234}
235
236// -----------------------------------------------------------------------------
237
238static SIMDPP_INL
239int16_t i_reduce_max(const int16x8& a)
240{
241#if SIMDPP_USE_NULL
242 int16_t r = a.el(0);
243 for (unsigned i = 0; i < a.length; i++) {
244 r = r > a.el(i) ? r : a.el(i);
245 }
246 return r;
247#elif SIMDPP_USE_NEON64
248 return vmaxvq_s16(a.native());
249#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
250 int16x8 r = max(a, move8_l<4>(a));
251 r = max(r, move8_l<2>(r));
252 r = max(r, move8_l<1>(r));
253 return extract<0>(r);
254#endif
255}
256
#if SIMDPP_USE_AVX2
// 256-bit variant: merge the two 128-bit halves lane-wise, then reduce the
// resulting 8-lane vector with the int16x8 overload.
static SIMDPP_INL
int16_t i_reduce_max(const int16x16& a)
{
    int16x8 merged = detail::extract128<0>(a);
    merged = max(merged, detail::extract128<1>(a));
    const int16_t result = i_reduce_max(merged);
    return result;
}
#endif
266
#if SIMDPP_USE_AVX512BW
// 512-bit variant: merge the two 256-bit halves lane-wise, then hand the
// 16-lane vector to the int16<16> overload.
SIMDPP_INL int16_t i_reduce_max(const int16<32>& a)
{
    int16<16> merged = detail::extract256<0>(a);
    merged = max(merged, detail::extract256<1>(a));
    const int16_t result = i_reduce_max(merged);
    return result;
}
#endif
275
276template<unsigned N>
277SIMDPP_INL int16_t i_reduce_max(const int16<N>& a)
278{
279#if SIMDPP_USE_NULL
280 int16_t r = std::numeric_limits<int16_t>::min();;
281 for (unsigned j = 0; j < a.vec_length; ++j) {
282 for (unsigned i = 0; i < a.base_length; i++) {
283 r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
284 }
285 }
286 return r;
287#else
288 int16v r = a.vec(0);
289 for (unsigned j = 1; j < a.vec_length; ++j) {
290 r = max(r, a.vec(j));
291 }
292 return i_reduce_max(r);
293#endif
294}
295
296// -----------------------------------------------------------------------------
297
298static SIMDPP_INL
299uint32_t i_reduce_max(const uint32x4& a)
300{
301#if SIMDPP_USE_NULL
302 uint32_t r = a.el(0);
303 for (unsigned i = 0; i < a.length; i++) {
304 r = r > a.el(i) ? r : a.el(i);
305 }
306 return r;
307#elif SIMDPP_USE_NEON64
308 return vmaxvq_u32(a.native());
309#elif SIMDPP_USE_SSE4_1 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
310 uint32x4 r = max(a, move4_l<2>(a));
311 r = max(r, move4_l<1>(r));
312 return extract<0>(r);
313#elif SIMDPP_USE_SSE2
314 mem_block<uint32x4> b = a;
315 uint32_t r = b[0];
316 for (unsigned i = 1; i < b.length; i++) {
317 r = r > b[i] ? r : b[i];
318 }
319 return r;
320#endif
321}
322
#if SIMDPP_USE_AVX2
// 256-bit variant: merge the two 128-bit halves lane-wise, then finish the
// reduction in place on the 4-lane vector and read lane 0.
static SIMDPP_INL
uint32_t i_reduce_max(const uint32x8& a)
{
    uint32x4 folded = detail::extract128<0>(a);
    folded = max(folded, detail::extract128<1>(a));
    // Remaining steps combine with copies moved by 2 and 1 lanes.
    folded = max(folded, move4_l<2>(folded));
    folded = max(folded, move4_l<1>(folded));
    const uint32_t result = extract<0>(folded);
    return result;
}
#endif
334
#if SIMDPP_USE_AVX512F
// 512-bit variant: take the lane-wise maximum of the two 256-bit halves and
// reduce the 8-lane result with the uint32<8> overload.
static SIMDPP_INL
uint32_t i_reduce_max(const uint32<16>& a)
{
    // Explicitly materialise the max() result as a plain uint32<8> so the
    // non-template overload is selected.
    const uint32<8> merged(max(extract256<0>(a), extract256<1>(a)));
    return i_reduce_max(merged);
}
#endif
342
343template<unsigned N>
344SIMDPP_INL uint32_t i_reduce_max(const uint32<N>& a)
345{
346#if SIMDPP_USE_NULL
347 uint32_t r = std::numeric_limits<uint32_t>::min();;
348 for (unsigned j = 0; j < a.vec_length; ++j) {
349 for (unsigned i = 0; i < a.base_length; i++) {
350 r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
351 }
352 }
353 return r;
354#else
355 uint32v r = a.vec(0);
356 for (unsigned j = 1; j < a.vec_length; ++j) {
357 r = max(r, a.vec(j));
358 }
359 return i_reduce_max(r);
360#endif
361}
362
363// -----------------------------------------------------------------------------
364
365static SIMDPP_INL
366int32_t i_reduce_max(const int32x4& a)
367{
368#if SIMDPP_USE_NULL
369 int32_t r = a.el(0);
370 for (unsigned i = 0; i < a.length; i++) {
371 r = r > a.el(i) ? r : a.el(i);
372 }
373 return r;
374#elif SIMDPP_USE_NEON64
375 return vmaxvq_s32(a.native());
376#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
377 int32x4 r = max(a, move4_l<2>(a));
378 r = max(r, move4_l<1>(r));
379 return extract<0>(r);
380#endif
381}
382
#if SIMDPP_USE_AVX2
// 256-bit variant: merge the two 128-bit halves lane-wise, then finish the
// reduction in place on the 4-lane vector and read lane 0.
static SIMDPP_INL
int32_t i_reduce_max(const int32x8& a)
{
    int32x4 folded = detail::extract128<0>(a);
    folded = max(folded, detail::extract128<1>(a));
    // Remaining steps combine with copies moved by 2 and 1 lanes.
    folded = max(folded, move4_l<2>(folded));
    folded = max(folded, move4_l<1>(folded));
    const int32_t result = extract<0>(folded);
    return result;
}
#endif
394
#if SIMDPP_USE_AVX512F
// 512-bit variant: take the lane-wise maximum of the two 256-bit halves and
// reduce the 8-lane result with the int32<8> overload.
static SIMDPP_INL
int32_t i_reduce_max(const int32<16>& a)
{
    // Explicitly materialise the max() result as a plain int32<8> so the
    // non-template overload is selected.
    const int32<8> merged(max(extract256<0>(a), extract256<1>(a)));
    return i_reduce_max(merged);
}
#endif
402
403template<unsigned N>
404SIMDPP_INL int32_t i_reduce_max(const int32<N>& a)
405{
406#if SIMDPP_USE_NULL
407 int32_t r = std::numeric_limits<int32_t>::min();;
408 for (unsigned j = 0; j < a.vec_length; ++j) {
409 for (unsigned i = 0; i < a.base_length; i++) {
410 r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
411 }
412 }
413 return r;
414#else
415 int32v r = a.vec(0);
416 for (unsigned j = 1; j < a.vec_length; ++j) {
417 r = max(r, a.vec(j));
418 }
419 return i_reduce_max(r);
420#endif
421}
422
423// -----------------------------------------------------------------------------
424
425static SIMDPP_INL
426uint64_t i_reduce_max(const uint64x2& a)
427{
428#if SIMDPP_USE_AVX2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_207
429 uint64x2 r = max(a, move2_l<1>(a));
430 return extract<0>(r);
431#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON32
432 mem_block<uint64x2> b = a;
433 return b[0] > b[1] ? b[0] : b[1];
434#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
435 uint64_t r = a.el(0);
436 for (unsigned i = 0; i < a.length; i++) {
437 r = r > a.el(i) ? r : a.el(i);
438 }
439 return r;
440#else
441 return SIMDPP_NOT_IMPLEMENTED1(a);
442#endif
443}
444
#if SIMDPP_USE_AVX2
// 256-bit variant: merge the two 128-bit halves lane-wise, then combine the
// two remaining lanes and read lane 0.
static SIMDPP_INL
uint64_t i_reduce_max(const uint64x4& a)
{
    uint64x2 folded = detail::extract128<0>(a);
    folded = max(folded, detail::extract128<1>(a));
    folded = max(folded, move2_l<1>(folded));
    const uint64_t result = extract<0>(folded);
    return result;
}
#endif
455
#if SIMDPP_USE_AVX512F
// 512-bit variant: take the lane-wise maximum of the two 256-bit halves and
// reduce the 4-lane result with the uint64<4> overload.
static SIMDPP_INL
uint64_t i_reduce_max(const uint64<8>& a)
{
    // Explicitly materialise the max() result as a plain uint64<4> so the
    // non-template overload is selected.
    const uint64<4> merged(max(extract256<0>(a), extract256<1>(a)));
    return i_reduce_max(merged);
}
#endif
463
464template<unsigned N>
465SIMDPP_INL uint64_t i_reduce_max(const uint64<N>& a)
466{
467#if SIMDPP_USE_AVX2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
468 uint64v r = a.vec(0);
469 for (unsigned j = 1; j < a.vec_length; ++j) {
470 r = max(r, a.vec(j));
471 }
472 return i_reduce_max(r);
473#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON32
474 uint64_t r = std::numeric_limits<uint64_t>::min();
475 for (unsigned j = 0; j < a.vec_length; ++j) {
476 mem_block<uint64v> b = a.vec(j);
477 for (unsigned i = 0; i < a.base_length; i++) {
478 r = r > b[i] ? r : b[i];
479 }
480 }
481 return r;
482#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
483 uint64_t r = std::numeric_limits<uint64_t>::min();;
484 for (unsigned j = 0; j < a.vec_length; ++j) {
485 for (unsigned i = 0; i < a.base_length; i++) {
486 r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
487 }
488 }
489 return r;
490#else
491 return SIMDPP_NOT_IMPLEMENTED1(a);
492#endif
493}
494
495// -----------------------------------------------------------------------------
496
497static SIMDPP_INL
498int64_t i_reduce_max(const int64x2& a)
499{
500#if SIMDPP_USE_AVX2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
501 int64x2 r = max(a, move2_l<1>(a));
502 return extract<0>(r);
503#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON32
504 mem_block<int64x2> b = a;
505 return b[0] > b[1] ? b[0] : b[1];
506#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
507 int64_t r = a.el(0);
508 for (unsigned i = 0; i < a.length; i++) {
509 r = r > a.el(i) ? r : a.el(i);
510 }
511 return r;
512#else
513 return SIMDPP_NOT_IMPLEMENTED1(a);
514#endif
515}
516
#if SIMDPP_USE_AVX2
// 256-bit variant: merge the two 128-bit halves lane-wise, then combine the
// two remaining lanes and read lane 0.
static SIMDPP_INL
int64_t i_reduce_max(const int64x4& a)
{
    int64x2 folded = detail::extract128<0>(a);
    folded = max(folded, detail::extract128<1>(a));
    folded = max(folded, move2_l<1>(folded));
    const int64_t result = extract<0>(folded);
    return result;
}
#endif
527
#if SIMDPP_USE_AVX512F
// 512-bit variant: take the lane-wise maximum of the two 256-bit halves and
// reduce the 4-lane result with the int64<4> overload.
static SIMDPP_INL
int64_t i_reduce_max(const int64<8>& a)
{
    // Explicitly materialise the max() result as a plain int64<4> so the
    // non-template overload is selected.
    const int64<4> merged(max(extract256<0>(a), extract256<1>(a)));
    return i_reduce_max(merged);
}
#endif
535
536template<unsigned N>
537SIMDPP_INL int64_t i_reduce_max(const int64<N>& a)
538{
539#if SIMDPP_USE_AXV2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
540 int64v r = a.vec(0);
541 for (unsigned j = 1; j < a.vec_length; ++j) {
542 r = max(r, a.vec(j));
543 }
544 return i_reduce_max(r);
545#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON32
546 int64_t r = std::numeric_limits<int64_t>::min();;
547 for (unsigned j = 0; j < a.vec_length; ++j) {
548 mem_block<int64v> b = a.vec(j);
549 for (unsigned i = 0; i < a.base_length; i++) {
550 r = r > b[i] ? r : b[i];
551 }
552 }
553 return r;
554#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
555 int64_t r = std::numeric_limits<int64_t>::min();;
556 for (unsigned j = 0; j < a.vec_length; ++j) {
557 for (unsigned i = 0; i < a.base_length; i++) {
558 r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
559 }
560 }
561 return r;
562#else
563 return SIMDPP_NOT_IMPLEMENTED1(a);
564#endif
565}
566
567// -----------------------------------------------------------------------------
568
569} // namespace insn
570} // namespace detail
571} // namespace SIMDPP_ARCH_NAMESPACE
572} // namespace simdpp
573
574#endif
575
576