1/* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_F_FLOOR_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_F_FLOOR_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <cmath>
16#include <simdpp/types.h>
17#include <simdpp/core/f_abs.h>
18#include <simdpp/core/bit_or.h>
19#include <simdpp/core/blend.h>
20#include <simdpp/core/cmp_eq.h>
21#include <simdpp/core/cmp_gt.h>
22#include <simdpp/core/i_shift_r.h>
23#include <simdpp/core/i_sub.h>
24#include <simdpp/core/to_float32.h>
25#include <simdpp/core/to_int32.h>
26#include <simdpp/detail/vector_array_macros.h>
27
28namespace simdpp {
29namespace SIMDPP_ARCH_NAMESPACE {
30namespace detail {
31namespace insn {
32
33
static SIMDPP_INL
float32x4 i_floor(const float32x4& a)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    // Scalar fallback: element-wise std::floor.
    float32x4 r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = std::floor(a.el(i));
    }
    return r;
#elif SIMDPP_USE_SSE4_1
    // SSE4.1 provides a direct round-toward-minus-infinity instruction.
    return _mm_floor_ps(a.native());
#elif SIMDPP_USE_NEON64
    // vrndm = round toward minus infinity, i.e. floor.
    return vrndmq_f32(a.native());
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON_FLT_SP || SIMDPP_USE_MSA
    // No native floor: emulate via truncating float->int conversion.
    // This is only valid for |a| <= 8388607 (2^23 - 1): larger magnitudes
    // have no fractional bits in single precision and are already integral,
    // so they (and NaNs, which fail cmp_le) are passed through unchanged by
    // the final blend. Zero is also excluded so that the bit-pattern trick
    // below is never applied to -0.0 (it is returned as-is instead).
    //check if the value is not too large, or is zero
    float32x4 ba = abs(a);
    mask_float32x4 mask_range = cmp_le(ba, 8388607.0f);
    mask_float32x4 mask_nonzero = cmp_gt(ba, 0);
    mask_float32x4 mask = bit_and(mask_range, mask_nonzero); // takes care of nans and zeros

    //calculate the i_floor using trunc
    // s is the sign bit: 1 for negative elements, 0 otherwise.
    int32x4 s = shift_r((uint32x4)a, 31); //=1 if a<0
    // Decrementing the IEEE bit pattern of a negative float moves it one ULP
    // toward +inf; positives are left untouched (s == 0).
    float32x4 at = (float32x4) sub((int32x4)a, s); //=nextafter towards +inf, if a<0
    // to_int32 truncates toward zero; for negatives, trunc(nextafter(a,+inf)) - 1
    // equals floor(a) both for integral and non-integral values.
    int32x4 ia = to_int32(at);
    ia = sub(ia, s);
    float32x4 fa = to_float32(ia);

    //combine the results
    // Select the computed floor where the emulation is valid, original 'a'
    // elsewhere (large values, NaN, +/-0.0).
    return blend(fa, a, mask);
#elif SIMDPP_USE_ALTIVEC
    return vec_floor(a.native());
#endif
}
67
#if SIMDPP_USE_AVX
static SIMDPP_INL
float32x8 i_floor(const float32x8& a)
{
    // AVX has a dedicated round-toward-minus-infinity instruction.
    const __m256 rounded = _mm256_floor_ps(a.native());
    return rounded;
}
#endif
75
#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float32<16> i_floor(const float32<16>& a)
{
    // AVX512F rounds all 16 lanes toward minus infinity in one instruction.
    const __m512 rounded = _mm512_floor_ps(a.native());
    return rounded;
}
#endif
83
84// -----------------------------------------------------------------------------
85
86static SIMDPP_INL
87float64x2 i_floor(const float64x2& a)
88{
89#if SIMDPP_USE_SSE4_1
90 return _mm_floor_pd(a.native());
91#elif SIMDPP_USE_SSE2 || SIMDPP_USE_MSA
92 float64x2 af = abs(a);
93 // check if the value is not too large or is a nan
94 mask_float64x2 mask_range = cmp_le(af, 4503599627370495.0);
95 // check if truncate to zero or minus one
96 mask_float64x2 mask_1to1 = cmp_lt(af, 1.0);
97
98 /* Emulate truncation for numbers not less than 1.0.
99 This is implemented by clearing the mantissa in the source number,
100 adding 1.0 and subtracting integer 1. The mantissa of the resulting
101 number will effectively contain a bit mask defining which bits need to
102 be cleared off the source number in order to truncate it.
103 */
104 float64x2 clearbits = bit_and(af, 0x7ff0000000000000); // clear the mantissa
105 clearbits = add(clearbits, 1.0);
106 clearbits = (float64x2) sub(uint64x2(clearbits), 1);
107 clearbits = bit_andnot(clearbits, 0xfff0000000000000); // leave only the mantissa
108
109 float64x2 a2 = bit_andnot(a, clearbits); // truncate
110
111 // check if we need to subtract one (truncated bits when negative)
112 mask_float64x2 mask_neg = cmp_lt(a, 0.0);
113 mask_float64x2 mask_sub1 = cmp_gt(bit_and(a, clearbits), 0.0);
114 mask_sub1 = bit_and(mask_sub1, mask_neg);
115
116 // one special case is when 'a' is in the range of (-1.0, 0.0) in which
117 // a & clearbits may still yield to zero. Thus this additional check
118 mask_sub1 = bit_or(mask_sub1, bit_and(mask_1to1, mask_neg));
119 float64x2 sub1 = make_float(-1.0);
120 sub1 = bit_and(sub1, mask_sub1);
121
122 a2 = bit_andnot(a, mask_1to1);
123 a2 = sub(a2, sub1);
124
125 return blend(a2, a, mask_range);
126#elif SIMDPP_USE_NEON64
127 return vrndnq_f64(a.native());
128#elif SIMDPP_USE_VSX_206
129 return vec_floor(a.native());
130#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
131 float64x2 r;
132 for (unsigned i = 0; i < r.length; ++i) {
133 r.el(i) = std::floor(a.el(i));
134 }
135 return r;
136#endif
137}
138
#if SIMDPP_USE_AVX
static SIMDPP_INL
float64x4 i_floor(const float64x4& a)
{
    // AVX has a dedicated round-toward-minus-infinity instruction.
    const __m256d rounded = _mm256_floor_pd(a.native());
    return rounded;
}
#endif
146
#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float64<8> i_floor(const float64<8>& a)
{
    // AVX512F rounds all 8 lanes toward minus infinity in one instruction.
    const __m512d rounded = _mm512_floor_pd(a.native());
    return rounded;
}
#endif
154
// Generic fallback for wider vector types: splits the vector into
// native-width sub-vectors and applies i_floor to each part.
template<class V> SIMDPP_INL
V i_floor(const V& a)
{
    SIMDPP_VEC_ARRAY_IMPL1(V, i_floor, a);
}
160
161} // namespace insn
162} // namespace detail
163} // namespace SIMDPP_ARCH_NAMESPACE
164} // namespace simdpp
165
166#endif
167
168