f_ceil.h source code [bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/insn/f_ceil.h]

1	/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>
2
3	Distributed under the Boost Software License, Version 1.0.
4	(See accompanying file LICENSE_1_0.txt or copy at
5	http://www.boost.org/LICENSE_1_0.txt)
6	*/
7
8	#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_F_CEIL_H
9	#define LIBSIMDPP_SIMDPP_DETAIL_INSN_F_CEIL_H
10
11	#ifndef LIBSIMDPP_SIMD_H
12	#error "This file must be included through simd.h"
13	#endif
14
15	#include <simdpp/types.h>
16	#include <simdpp/core/bit_or.h>
17	#include <simdpp/core/blend.h>
18	#include <simdpp/core/cmp_eq.h>
19	#include <simdpp/core/cmp_gt.h>
20	#include <simdpp/core/i_add.h>
21	#include <simdpp/core/i_shift_r.h>
22	#include <simdpp/core/i_sub.h>
23	#include <simdpp/core/f_abs.h>
24	#include <simdpp/core/f_add.h>
25	#include <simdpp/core/make_float.h>
26	#include <simdpp/core/make_int.h>
27	#include <simdpp/core/to_float32.h>
28	#include <simdpp/core/to_int32.h>
29	#include <simdpp/detail/vector_array_macros.h>
30
31	namespace simdpp {
32	namespace SIMDPP_ARCH_NAMESPACE {
33	namespace detail {
34	namespace insn {
35
36	static SIMDPP_INL
37	float32x4 i_ceil(const float32x4& a)
38	{
39	#if SIMDPP_USE_NULL \|\| SIMDPP_USE_NEON_NO_FLT_SP
40	float32x4 r;
41	for (unsigned i = `0`; i < a.length; i++) {
42	r.el(i) = std::ceil(a.el(i));
43	}
44	return r;
45	#elif SIMDPP_USE_SSE4_1
46	return _mm_ceil_ps(a.native());
47	#elif SIMDPP_USE_NEON64
48	return vrndpq_f32(a.native()); // FIXME: ARMv8
49	#elif SIMDPP_USE_SSE2 \|\| SIMDPP_USE_NEON_FLT_SP \|\| SIMDPP_USE_MSA
50	//check if the value is not too large, or is zero
51	float32x4 ba = abs(a);
52	mask_float32x4 mask_range = cmp_le(ba, `8388607.0f`);
53	mask_float32x4 mask_nonzero = cmp_gt(ba, `0`);
54	mask_float32x4 mask = bit_and(mask_range, mask_nonzero); // takes care of nans and zeros
55
56	//calculate the ceil using trunc
57	int32x4 s = shift_r((uint32x4)a, `31`);
58	s = bit_xor(s, `0x00000001`); //=1 if a>0
59	float32x4 at = (float32x4) sub((int32x4)a, s); //=nextafter towards -inf if a>0
60	int32x4 ia = to_int32(at);
61	ia = add(ia, s);
62	float32x4 fa = to_float32(ia);
63
64	//combine the results
65	return blend(fa, a, mask);
66	#elif SIMDPP_USE_ALTIVEC
67	return vec_ceil(a.native());
68	#endif
69	}
70
#if SIMDPP_USE_AVX
/// Ceiling of each lane of an 8 x float32 vector using the AVX intrinsic.
static SIMDPP_INL
float32x8 i_ceil(const float32x8& a)
{
    float32x8 rounded = _mm256_ceil_ps(a.native());
    return rounded;
}
#endif
78
#if SIMDPP_USE_AVX512F
/// Ceiling of each lane of a 16 x float32 vector using the AVX-512F intrinsic.
static SIMDPP_INL
float32<16> i_ceil(const float32<16>& a)
{
    float32<16> rounded = _mm512_ceil_ps(a.native());
    return rounded;
}
#endif
86
87	// -----------------------------------------------------------------------------
88
89	static SIMDPP_INL
90	float64x2 i_ceil(const float64x2& a)
91	{
92	#if SIMDPP_USE_SSE4_1
93	return _mm_ceil_pd(a.native());
94	#elif SIMDPP_USE_SSE2 \|\| SIMDPP_USE_MSA
95	float64x2 af = abs(a);
96	// check if the value is not too large or is a nan
97	mask_float64x2 mask_range = cmp_le(af, `4503599627370495.0`);
98	// check if truncate to zero or minus one
99	mask_float64x2 mask_1to1 = cmp_lt(af, `1.0`);
100
101	/ Emulate truncation for numbers not less than 1.0.*
102	This is implemented by clearing the mantissa in the source number,
103	adding 1.0 and subtracting integer 1. The mantissa of the resulting
104	number will effectively contain a bit mask defining which bits need to
105	be cleared off the source number in order to truncate it.
106	*/
107	float64x2 clearbits = bit_and(af, `0x7ff0000000000000`); // clear the mantissa
108	clearbits = add(clearbits, `1.0`);
109	clearbits = (float64x2) sub(uint64x2(clearbits), `1`);
110	clearbits = bit_andnot(clearbits, `0xfff0000000000000`); // leave only the mantissa
111
112	float64x2 a2 = bit_andnot(a, clearbits); // truncate
113
114	// check if we need to subtract one (truncated bits when negative)
115	mask_float64x2 mask_pos = cmp_gt(a, `0.0`);
116	mask_float64x2 mask_add1 = cmp_gt(bit_and(a, clearbits), `0.0`);
117	mask_add1 = bit_and(mask_add1, mask_pos);
118
119	// one special case is when 'a' is in the range of (0.0, 1.0) in which
120	// a & clearbits may still yield to zero. Thus this additional check
121	mask_add1 = bit_or(mask_add1, bit_and(mask_1to1, mask_pos));
122	float64x2 add1 = make_float(`1.0`);
123	add1 = bit_and(add1, mask_add1);
124
125	a2 = bit_andnot(a, mask_1to1);
126	a2 = add(a2, add1);
127
128	return blend(a2, a, mask_range);
129	#elif SIMDPP_USE_NEON64
130	return vrndpq_f64(a.native());
131	#elif SIMDPP_USE_VSX_206
132	return vec_ceil(a.native());
133	#elif SIMDPP_USE_NULL \|\| SIMDPP_USE_NEON32 \|\| SIMDPP_USE_ALTIVEC
134	float64x2 r;
135	for (unsigned i = `0`; i < r.length; ++i) {
136	r.el(i) = std::ceil(a.el(i));
137	}
138	return r;
139	#endif
140	}
141
#if SIMDPP_USE_AVX
/// Ceiling of each lane of a 4 x float64 vector using the AVX intrinsic.
static SIMDPP_INL
float64x4 i_ceil(const float64x4& a)
{
    float64x4 rounded = _mm256_ceil_pd(a.native());
    return rounded;
}
#endif
149
#if SIMDPP_USE_AVX512F
/// Ceiling of each lane of an 8 x float64 vector using the AVX-512F intrinsic.
static SIMDPP_INL
float64<8> i_ceil(const float64<8>& a)
{
    float64<8> rounded = _mm512_ceil_pd(a.native());
    return rounded;
}
#endif
157
158	// -----------------------------------------------------------------------------
159
160	template<class V> SIMDPP_INL
161	V i_ceil(const V& a)
162	{
163	SIMDPP_VEC_ARRAY_IMPL1(V, i_ceil, a);
164	}
165
166	} // namespace insn
167	} // namespace detail
168	} // namespace SIMDPP_ARCH_NAMESPACE
169	} // namespace simdpp
170
171	#endif
172
173

Browse the source code of bsFramework/Source/Foundation/bsfUtility/ThirdParty/simdpp/detail/insn/f_ceil.h