1/* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_F_TRUNC_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_F_TRUNC_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <cmath>
16#include <simdpp/types.h>
17#include <simdpp/core/cmp_gt.h>
18#include <simdpp/core/cmp_lt.h>
19#include <simdpp/core/cmp_le.h>
20#include <simdpp/core/f_ceil.h>
21#include <simdpp/core/f_add.h>
22#include <simdpp/core/make_float.h>
23#include <simdpp/core/make_uint.h>
24#include <simdpp/core/to_float32.h>
25#include <simdpp/core/to_float64.h>
26#include <simdpp/core/to_int32.h>
27#include <simdpp/core/to_int64.h>
28#include <simdpp/detail/vector_array_macros.h>
29
30namespace simdpp {
31namespace SIMDPP_ARCH_NAMESPACE {
32namespace detail {
33namespace insn {
34
35static SIMDPP_INL
36float32x4 i_trunc(const float32x4& a)
37{
38#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
39 float32x4 r;
40 for (unsigned i = 0; i < a.length; i++) {
41 r.el(i) = std::trunc(a.el(i));
42 }
43 return r;
44#elif SIMDPP_USE_SSE4_1
45 return _mm_round_ps(a.native(), 3); // 3 = i_truncate
46#elif SIMDPP_USE_NEON64
47 return vrndq_f32(a.native()); // FIXME: in ARM8 A32 too
48#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON_FLT_SP || SIMDPP_USE_MSA
49 //check if the value is not too large
50 float32x4 af = abs(a);
51 mask_float32x4 mask_range = cmp_lt(af, 8388608.0f);
52
53 // don't change the sign of negative zero
54 mask_float32x4 mask_nonzero = cmp_gt(af, 0);
55 mask_float32x4 mask = bit_and(mask_range, mask_nonzero);
56
57 //truncate
58 int32x4 ia = to_int32(a);
59 float32x4 fa = to_float32(ia);
60
61 //combine the results
62 return blend(fa, a, mask); // takes care of NaNs
63#elif SIMDPP_USE_ALTIVEC
64 return vec_trunc(a.native());
65#endif
66}
67
#if SIMDPP_USE_AVX
/** Rounds every element of @a a towards zero using the native AVX rounding
    instruction.

    @param a vector of eight single-precision values
    @return vector holding the truncated values
*/
static SIMDPP_INL float32x8 i_trunc(const float32x8& a)
{
    // _MM_FROUND_TO_ZERO has the value 3: round toward zero
    return _mm256_round_ps(a.native(), _MM_FROUND_TO_ZERO);
}
#endif
75
#if SIMDPP_USE_AVX512F
/** Rounds every element of @a a towards zero using the AVX-512 round-scale
    instruction.

    @param a vector of sixteen single-precision values
    @return vector holding the truncated values
*/
static SIMDPP_INL
float32<16> i_trunc(const float32<16>& a)
{
    // imm8[7:4] is the number of fraction bits to keep and imm8[1:0] selects
    // the rounding mode. Truncating to an integer needs zero fraction bits
    // with round-toward-zero, i.e. 0x03. The previous 0x13 kept one fraction
    // bit and therefore truncated to multiples of 0.5.
    return _mm512_roundscale_ps(a.native(), _MM_FROUND_TO_ZERO);
}
#endif
83
84// -----------------------------------------------------------------------------
85
86static SIMDPP_INL
87float64x2 i_trunc(const float64x2& a)
88{
89#if SIMDPP_USE_SSE4_1
90 return _mm_round_pd(a.native(), 3);
91#elif SIMDPP_USE_SSE2 || SIMDPP_USE_MSA
92 float64x2 af = abs(a);
93 // check if the value is not too large or is a nan
94 mask_float64x2 mask_range = cmp_le(af, 4503599627370495.0);
95 // check if truncate to zero
96 float64x2 mask_zero = cmp_lt(af, 1.0).eval().unmask();
97
98 /* Emulate truncation for numbers not less than 1.0.
99 This is implemented by clearing the mantissa in the source number,
100 adding 1.0 and subtracting integer 1. The mantissa of the resulting
101 number will effectively contain a bit mask defining which bits need to
102 be cleared off the source number in order to truncate it.
103 */
104 float64x2 clearbits = bit_and(af, 0x7ff0000000000000); // clear the mantissa
105 clearbits = add(clearbits, 1.0);
106 clearbits = (float64x2) sub(uint64x2(clearbits), 1);
107 clearbits = bit_andnot(clearbits, 0x7ff0000000000000); // leave only the mantissa
108
109 float64x2 a2 = bit_andnot(a, clearbits);
110 a2 = bit_andnot(a2, mask_zero);
111 return blend(a2, a, mask_range);
112#elif SIMDPP_USE_NEON64
113 return vrndq_f64(a.native());
114#elif SIMDPP_USE_VSX_206
115 return vec_trunc(a.native());
116#elif SIMDPP_USE_MSA
117 //check if the value is not too large
118 float64<2> af = abs(a);
119 mask_float64<2> mask_range = cmp_le(af, 4503599627370495.0);
120
121 // don't change the sign of negative zero
122 mask_float64<2> mask_nonzero = cmp_gt(af, 0);
123 mask_float64<2> mask = bit_and(mask_range, mask_nonzero);
124
125 //truncate
126 int64<2> ia = to_int64(a);
127 float64<2> fa = to_float64(ia);
128
129 //combine the results
130 return blend(fa, a, mask); // takes care of NaNs
131#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
132 float64x2 r;
133 for (unsigned i = 0; i < r.length; ++i) {
134 r.el(i) = std::trunc(a.el(i));
135 }
136 return r;
137#endif
138}
139
#if SIMDPP_USE_AVX
/** Rounds every element of @a a towards zero using the native AVX rounding
    instruction.

    @param a vector of four double-precision values
    @return vector holding the truncated values
*/
static SIMDPP_INL float64x4 i_trunc(const float64x4& a)
{
    // _MM_FROUND_TO_ZERO has the value 3: round toward zero
    return _mm256_round_pd(a.native(), _MM_FROUND_TO_ZERO);
}
#endif
147
#if SIMDPP_USE_AVX512F
/** Rounds every element of @a a towards zero using the AVX-512 round-scale
    instruction.

    @param a vector of eight double-precision values
    @return vector holding the truncated values
*/
static SIMDPP_INL
float64<8> i_trunc(const float64<8>& a)
{
    // imm8[7:4] is the number of fraction bits to keep and imm8[1:0] selects
    // the rounding mode. Truncating to an integer needs zero fraction bits
    // with round-toward-zero, i.e. 0x03. The previous 0x13 kept one fraction
    // bit and therefore truncated to multiples of 0.5.
    return _mm512_roundscale_pd(a.native(), _MM_FROUND_TO_ZERO);
}
#endif
155
156// -----------------------------------------------------------------------------
157
/** Generic overload of i_trunc for vector types that have no dedicated
    overload in this file.

    The whole body is produced by SIMDPP_VEC_ARRAY_IMPL1, which is defined in
    simdpp/detail/vector_array_macros.h. Presumably it applies i_trunc to each
    native sub-vector of @a a and returns the combined result; confirm against
    the macro definition.

    @param a vector to truncate
    @return vector of type V
*/
template<class V> SIMDPP_INL
V i_trunc(const V& a)
{
    // the macro is expected to supply the return statement itself
    SIMDPP_VEC_ARRAY_IMPL1(V, i_trunc, a);
}
163
164
165} // namespace insn
166} // namespace detail
167} // namespace SIMDPP_ARCH_NAMESPACE
168} // namespace simdpp
169
170#endif
171
172