1/* Copyright (C) 2016 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_F_REDUCE_ADD_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_F_REDUCE_ADD_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/core/extract.h>
17#include <simdpp/core/f_add.h>
18#include <simdpp/core/permute2.h>
19#include <simdpp/detail/extract128.h>
20#include <simdpp/detail/workarounds.h>
21
22namespace simdpp {
23namespace SIMDPP_ARCH_NAMESPACE {
24namespace detail {
25namespace insn {
26
27
// Horizontal add: returns the sum of all four single-precision lanes of @a a.
static SIMDPP_INL
float i_reduce_add(const float32x4& a)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    // Scalar fallback: accumulate the elements one by one.
    float r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE3
    // Two HADDPS passes fold 4 lanes -> 2 -> 1; lane 0 then holds the total.
    float32x4 b = a;
    b = _mm_hadd_ps(b.native(), b.native());
    b = _mm_hadd_ps(b.native(), b.native());
    return _mm_cvtss_f32(b.native());
#elif SIMDPP_USE_SSE2
    // No HADDPS before SSE3: MOVHLPS brings the high pair down, a vertical
    // add leaves two partial sums in lanes 0-1, and permute2<1,0> swaps the
    // elements of each pair so the final add puts the total in lane 0.
    float32x4 sum2 = _mm_movehl_ps(a.native(), a.native());
    float32x4 sum = add(a, sum2);
    sum = add(sum, permute2<1,0>(sum));
    return _mm_cvtss_f32(sum.native());
#elif SIMDPP_USE_NEON_FLT_SP
    // vpadd_f32 adds adjacent pairs: first pass folds 4 lanes -> 2,
    // second pass folds 2 -> 1.
    float32x2_t a2 = vpadd_f32(vget_low_f32(a.native()), vget_high_f32(a.native()));
    a2 = vpadd_f32(a2, a2);
    return vget_lane_f32(a2, 0);
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Log-step shift-and-add reduction: after shifting by 1 and then by 2
    // element positions, lane 0 accumulates all four inputs.
    float32x4 b = a;
    b = add(b, move4_l<1>(b));
    b = add(b, move4_l<2>(b));
    return extract<0>(b);
#endif
}
58
#if SIMDPP_USE_AVX
// Horizontal add: returns the sum of all eight single-precision lanes of @a a.
static SIMDPP_INL
float i_reduce_add(const float32x8& a)
{
    __m128 ah = detail::extract128<1>(a).native();
    __m128 al = detail::extract128<0>(a).native();
    // Sum the two 128-bit halves vertically first: ADDPS is cheaper than a
    // third HADDPS (which decodes to two shuffles plus an add), and mirrors
    // the halve-then-reduce approach of the 512-bit overload below.
    al = _mm_add_ps(al, ah);
    // Two HADDPS passes fold the remaining 4 lanes -> 2 -> 1.
    al = _mm_hadd_ps(al, al);
    al = _mm_hadd_ps(al, al);
    return _mm_cvtss_f32(al);
}
#endif
71
#if SIMDPP_USE_AVX512F
// Horizontal add: returns the sum of all 16 single-precision lanes of @a a.
static SIMDPP_INL
float i_reduce_add(const float32<16>& a)
{
    // Fold the two 256-bit halves together, then let the 256-bit overload
    // finish the reduction.
    float32<8> lo = extract256<0>(a);
    float32<8> hi = extract256<1>(a);
    return i_reduce_add(add(lo, hi));
}
#endif
79
80template<unsigned N>
81SIMDPP_INL float i_reduce_add(const float32<N>& a)
82{
83 float32v r = a.vec(0);
84 for (unsigned i = 1; i < a.vec_length; ++i)
85 r = add(r, a.vec(i));
86 return i_reduce_add(r);
87}
88
89// -----------------------------------------------------------------------------
90
// Horizontal add: returns the sum of both double-precision lanes of @a a.
static SIMDPP_INL
double i_reduce_add(const float64x2& a)
{
#if SIMDPP_USE_SSE3
    // A single HADDPD folds the two lanes; lane 0 then holds the total.
    return _mm_cvtsd_f64(_mm_hadd_pd(a.native(), a.native()));
#elif SIMDPP_USE_SSE2
    // No HADDPD before SSE3: permute2<1,1> broadcasts the high lane, so the
    // add leaves the total in lane 0.
    float64x2 b = add(a, permute2<1,1>(a));
    return _mm_cvtsd_f64(b.native());
#elif SIMDPP_USE_NEON64
    // Pairwise add collapses both lanes into each lane of the result.
    float64x2_t a2 = vpaddq_f64(a.native(), a.native());
    return vgetq_lane_f64(a2, 0);
#elif SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
    // Same broadcast-and-add trick as the SSE2 branch, read back portably
    // via extract<0>.
    float64x2 b = add(a, permute2<1,1>(a));
    return extract<0>(b);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    // Scalar fallback: accumulate the elements one by one.
    double r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#endif
}
113
#if SIMDPP_USE_AVX
// Horizontal add: returns the sum of all four double-precision lanes of @a a.
static SIMDPP_INL
double i_reduce_add(const float64x4& a)
{
    __m128d ah = detail::extract128<1>(a).native();
    __m128d al = detail::extract128<0>(a).native();
    // Sum the two 128-bit halves vertically first: ADDPD is cheaper than
    // HADDPD (which decodes to two shuffles plus an add) and yields the same
    // two partial sums for the final horizontal fold.
    al = _mm_add_pd(al, ah);
    // One HADDPD folds the remaining two lanes; lane 0 holds the total.
    al = _mm_hadd_pd(al, al);
    return _mm_cvtsd_f64(al);
}
#endif
125
#if SIMDPP_USE_AVX512F
// Horizontal add: returns the sum of all eight double-precision lanes of @a a.
static SIMDPP_INL
double i_reduce_add(const float64<8>& a)
{
    // Fold the two 256-bit halves together, then let the 256-bit overload
    // finish the reduction.
    float64<4> lo = extract256<0>(a);
    float64<4> hi = extract256<1>(a);
    return i_reduce_add(add(lo, hi));
}
#endif
133
134template<unsigned N>
135SIMDPP_INL double i_reduce_add(const float64<N>& a)
136{
137 float64v r = a.vec(0);
138 for (unsigned i = 1; i < a.vec_length; ++i)
139 r = add(r, a.vec(i));
140 return i_reduce_add(r);
141}
142
143} // namespace insn
144} // namespace detail
145} // namespace SIMDPP_ARCH_NAMESPACE
146} // namespace simdpp
147
148#endif
149
150