f_floor.h source code [bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/insn/f_floor.h]

3	/*  Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>
2
3	Distributed under the Boost Software License, Version 1.0.
4	(See accompanying file LICENSE_1_0.txt or copy at
5	http://www.boost.org/LICENSE_1_0.txt)
6	*/
7
8	#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_F_FLOOR_H
9	#define LIBSIMDPP_SIMDPP_DETAIL_INSN_F_FLOOR_H
10
11	#ifndef LIBSIMDPP_SIMD_H
12	#error "This file must be included through simd.h"
13	#endif
14
15	#include <cmath>
16	#include <simdpp/types.h>
17	#include <simdpp/core/f_abs.h>
18	#include <simdpp/core/bit_or.h>
19	#include <simdpp/core/blend.h>
20	#include <simdpp/core/cmp_eq.h>
21	#include <simdpp/core/cmp_gt.h>
22	#include <simdpp/core/i_shift_r.h>
23	#include <simdpp/core/i_sub.h>
24	#include <simdpp/core/to_float32.h>
25	#include <simdpp/core/to_int32.h>
26	#include <simdpp/detail/vector_array_macros.h>
27
28	namespace simdpp {
29	namespace SIMDPP_ARCH_NAMESPACE {
30	namespace detail {
31	namespace insn {
32
33
34	static SIMDPP_INL
35	float32x4 i_floor(const float32x4& a)
36	{
37	#if SIMDPP_USE_NULL \|\| SIMDPP_USE_NEON_NO_FLT_SP
38	float32x4 r;
39	for (unsigned i = `0`; i < a.length; i++) {
40	r.el(i) = std::floor(a.el(i));
41	}
42	return r;
43	#elif SIMDPP_USE_SSE4_1
44	return _mm_floor_ps(a.native());
45	#elif SIMDPP_USE_NEON64
46	return vrndmq_f32(a.native());
47	#elif SIMDPP_USE_SSE2 \|\| SIMDPP_USE_NEON_FLT_SP \|\| SIMDPP_USE_MSA
48	//check if the value is not too large, or is zero
49	float32x4 ba = abs(a);
50	mask_float32x4 mask_range = cmp_le(ba, `8388607.0f`);
51	mask_float32x4 mask_nonzero = cmp_gt(ba, `0`);
52	mask_float32x4 mask = bit_and(mask_range, mask_nonzero); // takes care of nans and zeros
53
54	//calculate the i_floor using trunc
55	int32x4 s = shift_r((uint32x4)a, `31`); //=1 if a<0
56	float32x4 at = (float32x4) sub((int32x4)a, s); //=nextafter towards +inf, if a<0
57	int32x4 ia = to_int32(at);
58	ia = sub(ia, s);
59	float32x4 fa = to_float32(ia);
60
61	//combine the results
62	return blend(fa, a, mask);
63	#elif SIMDPP_USE_ALTIVEC
64	return vec_floor(a.native());
65	#endif
66	}
67
#if SIMDPP_USE_AVX
/** Floor of an eight-element single-precision vector, delegated to the
    _mm256_floor_ps intrinsic. Only compiled when SIMDPP_USE_AVX is set.

    @param a  input vector
    @return   result of _mm256_floor_ps on the native register of @a a
*/
static SIMDPP_INL
float32x8 i_floor(const float32x8& a)
{
    float32x8 floored = _mm256_floor_ps(a.native());
    return floored;
}
#endif
75
#if SIMDPP_USE_AVX512F
/** Floor of a sixteen-element single-precision vector, delegated to the
    _mm512_floor_ps intrinsic. Only compiled when SIMDPP_USE_AVX512F is set.

    @param a  input vector
    @return   result of _mm512_floor_ps on the native register of @a a
*/
static SIMDPP_INL
float32<16> i_floor(const float32<16>& a)
{
    float32<16> floored = _mm512_floor_ps(a.native());
    return floored;
}
#endif
83
84	// -----------------------------------------------------------------------------
85
86	static SIMDPP_INL
87	float64x2 i_floor(const float64x2& a)
88	{
89	#if SIMDPP_USE_SSE4_1
90	return _mm_floor_pd(a.native());
91	#elif SIMDPP_USE_SSE2 \|\| SIMDPP_USE_MSA
92	float64x2 af = abs(a);
93	// check if the value is not too large or is a nan
94	mask_float64x2 mask_range = cmp_le(af, `4503599627370495.0`);
95	// check if truncate to zero or minus one
96	mask_float64x2 mask_1to1 = cmp_lt(af, `1.0`);
97
98	/ Emulate truncation for numbers not less than 1.0.*
99	This is implemented by clearing the mantissa in the source number,
100	adding 1.0 and subtracting integer 1. The mantissa of the resulting
101	number will effectively contain a bit mask defining which bits need to
102	be cleared off the source number in order to truncate it.
103	*/
104	float64x2 clearbits = bit_and(af, `0x7ff0000000000000`); // clear the mantissa
105	clearbits = add(clearbits, `1.0`);
106	clearbits = (float64x2) sub(uint64x2(clearbits), `1`);
107	clearbits = bit_andnot(clearbits, `0xfff0000000000000`); // leave only the mantissa
108
109	float64x2 a2 = bit_andnot(a, clearbits); // truncate
110
111	// check if we need to subtract one (truncated bits when negative)
112	mask_float64x2 mask_neg = cmp_lt(a, `0.0`);
113	mask_float64x2 mask_sub1 = cmp_gt(bit_and(a, clearbits), `0.0`);
114	mask_sub1 = bit_and(mask_sub1, mask_neg);
115
116	// one special case is when 'a' is in the range of (-1.0, 0.0) in which
117	// a & clearbits may still yield to zero. Thus this additional check
118	mask_sub1 = bit_or(mask_sub1, bit_and(mask_1to1, mask_neg));
119	float64x2 sub1 = make_float(-`1.0`);
120	sub1 = bit_and(sub1, mask_sub1);
121
122	a2 = bit_andnot(a, mask_1to1);
123	a2 = sub(a2, sub1);
124
125	return blend(a2, a, mask_range);
126	#elif SIMDPP_USE_NEON64
127	return vrndnq_f64(a.native());
128	#elif SIMDPP_USE_VSX_206
129	return vec_floor(a.native());
130	#elif SIMDPP_USE_NULL \|\| SIMDPP_USE_NEON32 \|\| SIMDPP_USE_ALTIVEC
131	float64x2 r;
132	for (unsigned i = `0`; i < r.length; ++i) {
133	r.el(i) = std::floor(a.el(i));
134	}
135	return r;
136	#endif
137	}
138
#if SIMDPP_USE_AVX
/** Floor of a four-element double-precision vector, delegated to the
    _mm256_floor_pd intrinsic. Only compiled when SIMDPP_USE_AVX is set.

    @param a  input vector
    @return   result of _mm256_floor_pd on the native register of @a a
*/
static SIMDPP_INL
float64x4 i_floor(const float64x4& a)
{
    float64x4 floored = _mm256_floor_pd(a.native());
    return floored;
}
#endif
146
#if SIMDPP_USE_AVX512F
/** Floor of an eight-element double-precision vector, delegated to the
    _mm512_floor_pd intrinsic. Only compiled when SIMDPP_USE_AVX512F is set.

    @param a  input vector
    @return   result of _mm512_floor_pd on the native register of @a a
*/
static SIMDPP_INL
float64<8> i_floor(const float64<8>& a)
{
    float64<8> floored = _mm512_floor_pd(a.native());
    return floored;
}
#endif
154
155	template<class V> SIMDPP_INL
156	V i_floor(const V& a)
157	{
158	SIMDPP_VEC_ARRAY_IMPL1(V, i_floor, a);
159	}
160
161	} // namespace insn
162	} // namespace detail
163	} // namespace SIMDPP_ARCH_NAMESPACE
164	} // namespace simdpp
165
166	#endif
167
168

Browse the source code of bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/insn/f_floor.h