/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_F_CEIL_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_F_CEIL_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <cmath>
#include <simdpp/types.h>
#include <simdpp/core/bit_and.h>
#include <simdpp/core/bit_andnot.h>
#include <simdpp/core/bit_or.h>
#include <simdpp/core/bit_xor.h>
#include <simdpp/core/blend.h>
#include <simdpp/core/cmp_eq.h>
#include <simdpp/core/cmp_gt.h>
#include <simdpp/core/cmp_le.h>
#include <simdpp/core/cmp_lt.h>
#include <simdpp/core/i_add.h>
#include <simdpp/core/i_shift_r.h>
#include <simdpp/core/i_sub.h>
#include <simdpp/core/f_abs.h>
#include <simdpp/core/f_add.h>
#include <simdpp/core/make_float.h>
#include <simdpp/core/make_int.h>
#include <simdpp/core/to_float32.h>
#include <simdpp/core/to_int32.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

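// Rounds each element of the argument up to the nearest integer value.
// The overloads below pick the best instruction available on the target ISA;
// where no native rounding instruction exists, ceil is emulated with integer
// and bit operations.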
static SIMDPP_INL
float32x4 i_ceil(const float32x4& a)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    float32x4 r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = std::ceil(a.el(i));
    }
    return r;
#elif SIMDPP_USE_SSE4_1
    return _mm_ceil_ps(a.native());
#elif SIMDPP_USE_NEON64
    return vrndpq_f32(a.native()); // AArch64: round towards +infinity
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON_FLT_SP || SIMDPP_USE_MSA
    // select the values that are not too large (|a| <= 2^23-1, i.e. may have
    // a fractional part) and are not zero
    float32x4 ba = abs(a);
    mask_float32x4 mask_range = cmp_le(ba, 8388607.0f);
    mask_float32x4 mask_nonzero = cmp_gt(ba, 0);
    // NaNs, zeros and too-large values fail these tests and are passed
    // through unchanged by the final blend
    mask_float32x4 mask = bit_and(mask_range, mask_nonzero);

    // calculate the ceil via truncation
    int32x4 s = shift_r((uint32x4)a, 31);          // sign bit: 1 if a < 0, else 0
    s = bit_xor(s, 0x00000001);                    // 1 if a > 0 (zero is masked out above)
    float32x4 at = (float32x4) sub((int32x4)a, s); // nextafter towards -inf if a > 0
    int32x4 ia = to_int32(at);                     // truncate towards zero
    ia = add(ia, s);                               // add 1 back for positive values
    float32x4 fa = to_float32(ia);
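    // Worked example (illustrative): for a = 2.5, s = 1 and at is the next
    // float below 2.5, so to_int32(at) = 2 and 2 + 1 = 3 = ceil(2.5). For
    // a = -2.5, s = 0 and at = -2.5, so to_int32(-2.5) = -2 = ceil(-2.5).
    // Exact positive integers also work: for a = 3.0, to_int32(at) = 2 and
    // 2 + 1 = 3.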

    // combine the results
    return blend(fa, a, mask);
#elif SIMDPP_USE_ALTIVEC
    return vec_ceil(a.native());
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
float32x8 i_ceil(const float32x8& a)
{
    return _mm256_ceil_ps(a.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float32<16> i_ceil(const float32<16>& a)
{
    return _mm512_ceil_ps(a.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
float64x2 i_ceil(const float64x2& a)
{
#if SIMDPP_USE_SSE4_1
    return _mm_ceil_pd(a.native());
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_MSA
    float64x2 af = abs(a);
    // select the values that are small enough to have a fractional part
    // (|a| <= 2^52-1); NaNs and too-large values fail this comparison and
    // are returned unchanged by the final blend
    mask_float64x2 mask_range = cmp_le(af, 4503599627370495.0);
    // select the values in (-1.0, 1.0), which truncate to zero
    mask_float64x2 mask_1to1 = cmp_lt(af, 1.0);

    /*  Emulate truncation for numbers not less than 1.0.
        This is implemented by clearing the mantissa of the source number,
        adding 1.0 and subtracting 1 from the integer representation. The
        mantissa of the resulting number is effectively a bit mask of the
        bits that need to be cleared in the source number in order to
        truncate it.
    */
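    /*  Worked example (illustrative): for af = 2.5 (0x4004000000000000) the
        exponent-only value is 2.0 (0x4000000000000000); adding 1.0 gives 3.0
        (0x4008000000000000) and subtracting 1 from the integer representation
        gives 0x4007ffffffffffff. Masking off the sign and exponent leaves
        0x0007ffffffffffff, which is exactly the set of mantissa bits that
        hold the fractional part at this exponent, so a & ~clearbits == 2.0.
    */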
    float64x2 clearbits = bit_and(af, 0x7ff0000000000000); // keep only the exponent
    clearbits = add(clearbits, 1.0);
    clearbits = (float64x2) sub(uint64x2(clearbits), 1);
    clearbits = bit_andnot(clearbits, 0xfff0000000000000); // leave only the mantissa

    float64x2 a2 = bit_andnot(a, clearbits); // truncate

    // check if we need to add one (bits were truncated off a positive value)
    mask_float64x2 mask_pos = cmp_gt(a, 0.0);
    mask_float64x2 mask_add1 = cmp_gt(bit_and(a, clearbits), 0.0);
    mask_add1 = bit_and(mask_add1, mask_pos);
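    // Continuing the illustrative example: for a = 2.5 the bits selected by
    // clearbits are nonzero (a small positive double), so mask_add1 is set
    // and 1.0 is later added to the truncated value 2.0, giving ceil(2.5) = 3.0.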

    // One special case is when 'a' lies in the range (0.0, 1.0): a & clearbits
    // may still evaluate to zero (e.g. for a == 0.5 the mantissa is zero), yet
    // ceil must return 1.0. Hence this additional check.
    mask_add1 = bit_or(mask_add1, bit_and(mask_1to1, mask_pos));
    float64x2 add1 = make_float(1.0);
    add1 = bit_and(add1, mask_add1);

    a2 = bit_andnot(a2, mask_1to1); // force the truncated value to zero when |a| < 1.0
    a2 = add(a2, add1);

    return blend(a2, a, mask_range);
#elif SIMDPP_USE_NEON64
    return vrndpq_f64(a.native());
#elif SIMDPP_USE_VSX_206
    return vec_ceil(a.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    float64x2 r;
    for (unsigned i = 0; i < r.length; ++i) {
        r.el(i) = std::ceil(a.el(i));
    }
    return r;
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
float64x4 i_ceil(const float64x4& a)
{
    return _mm256_ceil_pd(a.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float64<8> i_ceil(const float64<8>& a)
{
    return _mm512_ceil_pd(a.native());
}
#endif

// -----------------------------------------------------------------------------

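// Generic fallback: wider or non-native vector types are split into the
// native vectors handled by the overloads above and i_ceil is applied to each
// part.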
template<class V> SIMDPP_INL
V i_ceil(const V& a)
{
    SIMDPP_VEC_ARRAY_IMPL1(V, i_ceil, a);
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif