1/* Copyright (C) 2016 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_F_REDUCE_MAX_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_F_REDUCE_MAX_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/core/extract.h>
17#include <simdpp/core/f_max.h>
18#include <simdpp/core/permute2.h>
19#include <simdpp/detail/extract128.h>
20#include <simdpp/detail/workarounds.h>
21
22namespace simdpp {
23namespace SIMDPP_ARCH_NAMESPACE {
24namespace detail {
25namespace insn {
26
27
28static SIMDPP_INL
29float i_reduce_max(const float32x4& a)
30{
31#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
32 float r = a.el(0);
33 for (unsigned i = 1; i < a.length; i++) {
34 r = r > a.el(i) ? r : a.el(i); // TODO nan
35 }
36 return r;
37#elif SIMDPP_USE_SSE2
38 float32x4 b = _mm_movehl_ps(a.native(), a.native());
39 b = max(a, b);
40 b = max(b, permute2<1,1>(b));
41 return _mm_cvtss_f32(b.native());
42#elif SIMDPP_USE_NEON64
43 return vmaxnmvq_f32(a.native());
44#elif SIMDPP_USE_NEON_FLT_SP
45 float32x2_t a2 = vpmax_f32(vget_low_f32(a.native()), vget_high_f32(a.native()));
46 a2 = vpmax_f32(a2, a2);
47 return vget_lane_f32(a2, 0);
48#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
49 float32x4 b = a;
50 b = max(b, move4_l<1>(b));
51 b = max(b, move4_l<2>(b));
52 return extract<0>(b);
53#endif
54}
55
#if SIMDPP_USE_AVX
// Horizontal maximum of an 8-lane single-precision vector: the two 128-bit
// halves are combined element-wise, then the 4-lane overload finishes the job.
static SIMDPP_INL
float i_reduce_max(const float32x8& a)
{
    const float32x4 upper_half = detail::extract128<1>(a);
    const float32x4 lower_half = detail::extract128<0>(a);
    const float32x4 combined = max(lower_half, upper_half);
    return i_reduce_max(combined);
}
#endif
66
#if SIMDPP_USE_AVX512F
// Horizontal maximum of a 16-lane single-precision vector: the two 256-bit
// halves are combined element-wise and the result is reduced by the
// narrower overload.
static SIMDPP_INL
float i_reduce_max(const float32<16>& a)
{
    const float32x8 combined = max(extract256<0>(a), extract256<1>(a));
    return i_reduce_max(combined);
}
#endif
74
75template<unsigned N>
76SIMDPP_INL float i_reduce_max(const float32<N>& a)
77{
78 float32v r = a.vec(0);
79 for (unsigned i = 1; i < a.vec_length; ++i)
80 r = max(r, a.vec(i));
81 return i_reduce_max(r);
82}
83
84// -----------------------------------------------------------------------------
85
86static SIMDPP_INL
87double i_reduce_max(const float64x2& a)
88{
89#if SIMDPP_USE_SSE2
90 float64x2 b = max(a, permute2<1,1>(a));
91 return _mm_cvtsd_f64(b.native());
92#elif SIMDPP_USE_NEON64
93 return vmaxnmvq_f64(a.native());
94#elif SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
95 float64x2 b = max(a, permute2<1,1>(a));
96 return extract<0>(b);
97#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
98 double r = a.el(0);
99 for (unsigned i = 1; i < a.length; i++) {
100 r = r > a.el(i) ? r : a.el(i); // TODO nan
101 }
102 return r;
103#endif
104}
105
#if SIMDPP_USE_AVX
// Horizontal maximum of a 4-lane double-precision vector: the two 128-bit
// halves are combined element-wise, then the 2-lane overload finishes the job.
static SIMDPP_INL
double i_reduce_max(const float64x4& a)
{
    const float64x2 upper_half = detail::extract128<1>(a);
    const float64x2 lower_half = detail::extract128<0>(a);
    const float64x2 combined = max(lower_half, upper_half);
    return i_reduce_max(combined);
}
#endif
116
#if SIMDPP_USE_AVX512F
// Horizontal maximum of an 8-lane double-precision vector: the two 256-bit
// halves are combined element-wise and the result is reduced by the
// narrower overload.
static SIMDPP_INL
double i_reduce_max(const float64<8>& a)
{
    const float64x4 combined = max(extract256<0>(a), extract256<1>(a));
    return i_reduce_max(combined);
}
#endif
124
125template<unsigned N>
126SIMDPP_INL double i_reduce_max(const float64<N>& a)
127{
128 float64v r = a.vec(0);
129 for (unsigned i = 1; i < a.vec_length; ++i)
130 r = max(r, a.vec(i));
131 return i_reduce_max(r);
132}
133
134} // namespace insn
135} // namespace detail
136} // namespace SIMDPP_ARCH_NAMESPACE
137} // namespace simdpp
138
139#endif
140
141