1/* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_F_CEIL_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_F_CEIL_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/core/bit_or.h>
17#include <simdpp/core/blend.h>
18#include <simdpp/core/cmp_eq.h>
19#include <simdpp/core/cmp_gt.h>
20#include <simdpp/core/i_add.h>
21#include <simdpp/core/i_shift_r.h>
22#include <simdpp/core/i_sub.h>
23#include <simdpp/core/f_abs.h>
24#include <simdpp/core/f_add.h>
25#include <simdpp/core/make_float.h>
26#include <simdpp/core/make_int.h>
27#include <simdpp/core/to_float32.h>
28#include <simdpp/core/to_int32.h>
29#include <simdpp/detail/vector_array_macros.h>
30
31namespace simdpp {
32namespace SIMDPP_ARCH_NAMESPACE {
33namespace detail {
34namespace insn {
35
36static SIMDPP_INL
37float32x4 i_ceil(const float32x4& a)
38{
39#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
40 float32x4 r;
41 for (unsigned i = 0; i < a.length; i++) {
42 r.el(i) = std::ceil(a.el(i));
43 }
44 return r;
45#elif SIMDPP_USE_SSE4_1
46 return _mm_ceil_ps(a.native());
47#elif SIMDPP_USE_NEON64
48 return vrndpq_f32(a.native()); // FIXME: ARMv8
49#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON_FLT_SP || SIMDPP_USE_MSA
50 //check if the value is not too large, or is zero
51 float32x4 ba = abs(a);
52 mask_float32x4 mask_range = cmp_le(ba, 8388607.0f);
53 mask_float32x4 mask_nonzero = cmp_gt(ba, 0);
54 mask_float32x4 mask = bit_and(mask_range, mask_nonzero); // takes care of nans and zeros
55
56 //calculate the ceil using trunc
57 int32x4 s = shift_r((uint32x4)a, 31);
58 s = bit_xor(s, 0x00000001); //=1 if a>0
59 float32x4 at = (float32x4) sub((int32x4)a, s); //=nextafter towards -inf if a>0
60 int32x4 ia = to_int32(at);
61 ia = add(ia, s);
62 float32x4 fa = to_float32(ia);
63
64 //combine the results
65 return blend(fa, a, mask);
66#elif SIMDPP_USE_ALTIVEC
67 return vec_ceil(a.native());
68#endif
69}
70
#if SIMDPP_USE_AVX
/** Rounds each element of a 256-bit float32 vector towards positive infinity
    using the native AVX instruction.

    @param a source vector
    @return vector of rounded-up elements
*/
static SIMDPP_INL
float32x8 i_ceil(const float32x8& a)
{
    float32x8 rounded = _mm256_ceil_ps(a.native());
    return rounded;
}
#endif
78
#if SIMDPP_USE_AVX512F
/** Rounds each element of a 512-bit float32 vector towards positive infinity
    using the native AVX-512F operation.

    @param a source vector
    @return vector of rounded-up elements
*/
static SIMDPP_INL
float32<16> i_ceil(const float32<16>& a)
{
    float32<16> rounded = _mm512_ceil_ps(a.native());
    return rounded;
}
#endif
86
87// -----------------------------------------------------------------------------
88
89static SIMDPP_INL
90float64x2 i_ceil(const float64x2& a)
91{
92#if SIMDPP_USE_SSE4_1
93 return _mm_ceil_pd(a.native());
94#elif SIMDPP_USE_SSE2 || SIMDPP_USE_MSA
95 float64x2 af = abs(a);
96 // check if the value is not too large or is a nan
97 mask_float64x2 mask_range = cmp_le(af, 4503599627370495.0);
98 // check if truncate to zero or minus one
99 mask_float64x2 mask_1to1 = cmp_lt(af, 1.0);
100
101 /* Emulate truncation for numbers not less than 1.0.
102 This is implemented by clearing the mantissa in the source number,
103 adding 1.0 and subtracting integer 1. The mantissa of the resulting
104 number will effectively contain a bit mask defining which bits need to
105 be cleared off the source number in order to truncate it.
106 */
107 float64x2 clearbits = bit_and(af, 0x7ff0000000000000); // clear the mantissa
108 clearbits = add(clearbits, 1.0);
109 clearbits = (float64x2) sub(uint64x2(clearbits), 1);
110 clearbits = bit_andnot(clearbits, 0xfff0000000000000); // leave only the mantissa
111
112 float64x2 a2 = bit_andnot(a, clearbits); // truncate
113
114 // check if we need to subtract one (truncated bits when negative)
115 mask_float64x2 mask_pos = cmp_gt(a, 0.0);
116 mask_float64x2 mask_add1 = cmp_gt(bit_and(a, clearbits), 0.0);
117 mask_add1 = bit_and(mask_add1, mask_pos);
118
119 // one special case is when 'a' is in the range of (0.0, 1.0) in which
120 // a & clearbits may still yield to zero. Thus this additional check
121 mask_add1 = bit_or(mask_add1, bit_and(mask_1to1, mask_pos));
122 float64x2 add1 = make_float(1.0);
123 add1 = bit_and(add1, mask_add1);
124
125 a2 = bit_andnot(a, mask_1to1);
126 a2 = add(a2, add1);
127
128 return blend(a2, a, mask_range);
129#elif SIMDPP_USE_NEON64
130 return vrndpq_f64(a.native());
131#elif SIMDPP_USE_VSX_206
132 return vec_ceil(a.native());
133#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
134 float64x2 r;
135 for (unsigned i = 0; i < r.length; ++i) {
136 r.el(i) = std::ceil(a.el(i));
137 }
138 return r;
139#endif
140}
141
#if SIMDPP_USE_AVX
/** Rounds each element of a 256-bit float64 vector towards positive infinity
    using the native AVX instruction.

    @param a source vector
    @return vector of rounded-up elements
*/
static SIMDPP_INL
float64x4 i_ceil(const float64x4& a)
{
    float64x4 rounded = _mm256_ceil_pd(a.native());
    return rounded;
}
#endif
149
#if SIMDPP_USE_AVX512F
/** Rounds each element of a 512-bit float64 vector towards positive infinity
    using the native AVX-512F operation.

    @param a source vector
    @return vector of rounded-up elements
*/
static SIMDPP_INL
float64<8> i_ceil(const float64<8>& a)
{
    float64<8> rounded = _mm512_ceil_pd(a.native());
    return rounded;
}
#endif
157
158// -----------------------------------------------------------------------------
159
160template<class V> SIMDPP_INL
161V i_ceil(const V& a)
162{
163 SIMDPP_VEC_ARRAY_IMPL1(V, i_ceil, a);
164}
165
166} // namespace insn
167} // namespace detail
168} // namespace SIMDPP_ARCH_NAMESPACE
169} // namespace simdpp
170
171#endif
172
173