1/* Copyright (C) 2013 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_DETAIL_CAST_BITWISE_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_DETAIL_CAST_BITWISE_H
10
11#include <simdpp/types.h>
12
13namespace simdpp {
14namespace SIMDPP_ARCH_NAMESPACE {
15namespace detail {
16
17/* Note that in this function we are invoking undefined behavior that happens
18 to work in all compilers the library supports. The only non-undefined way
19 to do bitwise data transfer between unrelated types without breaking strict
20 aliasing rules is the memcpy() function. Unfortunately some compilers can't
21 fully optimize out the overhead of the function which leads to unnecessary
22 data movement to the stack.
23
    Note that this function does not fully work with vector types even in C++11
    mode, where they are trivial types and thus may be placed in a union.
26 Vectors containing one or two native vectors are fine, but larger vectors
27 containing 4 or more native vectors result in internal compiler errors or
28 miscompiled code on some compilers.
29*/
30template<class T, class R> SIMDPP_INL
31void cast_bitwise(const T& t, R& r)
32{
33 static_assert(sizeof(R) == sizeof(T), "Size mismatch");
34 union {
35 T t_union;
36 R r_union;
37 };
38 t_union = t;
39 r = r_union;
40}
41
// Identifies how a vector cast maps native vectors of the source type onto
// native vectors of the destination type.
enum {
    VECTOR_CAST_TYPE_1_TO_1,    // one source native vector -> one destination
    VECTOR_CAST_TYPE_SPLIT2,    // one source native vector -> two destination
    VECTOR_CAST_TYPE_COMBINE2,  // two source native vectors -> one destination
    VECTOR_CAST_TYPE_INVALID    // no supported mapping between the two types
};
48
49#if (__GNUC__ >= 6) && !defined(__INTEL_COMPILER) && !defined(__clang__)
/* native_cast, native_cast_split and native_cast_combine use the native vector
    type as a class template parameter. On GCC, vector types have alignment
    attributes specified on some architectures. This leads to an "ignored
    attributes" warning, because the attributes are not part of the type.
54 Since libsimdpp always uses the same attributes for all native_type members
55 we can safely ignore this warning.
56*/
57#pragma GCC diagnostic push
58#pragma GCC diagnostic ignored "-Wignored-attributes"
59#endif
60
// The Size argument is needed to disambiguate vectors of different size on old
// GNU ABIs.
// Casts a single native vector of type NativeT to one of type NativeR without
// changing any bits. IsVarArray selects the bitwise-copy fallback used when
// either type is a vararray rather than a true native vector type.
template<unsigned Size, class NativeT, class NativeR, bool IsVarArray>
struct native_cast;
65
66template<unsigned Size, class T, class R> struct native_cast<Size, T, R, false> {
67 static SIMDPP_INL R cast(const T& t) { return R(t); }
68};
69
70template<unsigned Size, class T> struct native_cast<Size, T, T, false> {
71 static SIMDPP_INL T cast(const T& t) { return t; }
72};
73
74template<unsigned Size, class T, class R> struct native_cast<Size, T, R, true> {
75 static SIMDPP_INL R cast(const T& t)
76 {
77 R r;
78 cast_bitwise(t, r);
79 return r;
80 }
81};
82
// Defines a native_cast specialization that forwards to the intrinsic FUNC,
// reinterpreting T_TYPE as R_TYPE without changing any bits.
#define NATIVE_CAST_IMPL(SIZE, T_TYPE, R_TYPE, FUNC) \
template<> struct native_cast<SIZE, T_TYPE, R_TYPE, false> { \
    static SIMDPP_INL R_TYPE cast(const T_TYPE& t) { return FUNC(t); } \
}
87
#if SIMDPP_USE_SSE2
// 16-byte SSE bit-preserving casts between float, double and integer vectors.
NATIVE_CAST_IMPL(16, __m128, __m128i, _mm_castps_si128);
NATIVE_CAST_IMPL(16, __m128, __m128d, _mm_castps_pd);
NATIVE_CAST_IMPL(16, __m128i, __m128, _mm_castsi128_ps);
NATIVE_CAST_IMPL(16, __m128i, __m128d, _mm_castsi128_pd);
NATIVE_CAST_IMPL(16, __m128d, __m128i, _mm_castpd_si128);
NATIVE_CAST_IMPL(16, __m128d, __m128, _mm_castpd_ps);
#endif

#if SIMDPP_USE_AVX
// 32-byte AVX bit-preserving casts between float, double and integer vectors.
NATIVE_CAST_IMPL(32, __m256, __m256i, _mm256_castps_si256);
NATIVE_CAST_IMPL(32, __m256, __m256d, _mm256_castps_pd);
NATIVE_CAST_IMPL(32, __m256i, __m256, _mm256_castsi256_ps);
NATIVE_CAST_IMPL(32, __m256i, __m256d, _mm256_castsi256_pd);
NATIVE_CAST_IMPL(32, __m256d, __m256i, _mm256_castpd_si256);
NATIVE_CAST_IMPL(32, __m256d, __m256, _mm256_castpd_ps);
#endif

#if SIMDPP_USE_AVX512F
// 64-byte AVX-512 bit-preserving casts between float, double and integer
// vectors.
NATIVE_CAST_IMPL(64, __m512, __m512i, _mm512_castps_si512);
NATIVE_CAST_IMPL(64, __m512, __m512d, _mm512_castps_pd);
NATIVE_CAST_IMPL(64, __m512i, __m512, _mm512_castsi512_ps);
NATIVE_CAST_IMPL(64, __m512i, __m512d, _mm512_castsi512_pd);
NATIVE_CAST_IMPL(64, __m512d, __m512i, _mm512_castpd_si512);
NATIVE_CAST_IMPL(64, __m512d, __m512, _mm512_castpd_ps);
#endif
114
#if SIMDPP_USE_NEON
// NEON vreinterpretq casts between all pairs of 16-byte vector types. Every
// source type gets one entry per distinct destination type; the grouping below
// is by source type.
NATIVE_CAST_IMPL(16, float32x4_t, uint64x2_t, vreinterpretq_u64_f32);
NATIVE_CAST_IMPL(16, float32x4_t, int64x2_t, vreinterpretq_s64_f32);
NATIVE_CAST_IMPL(16, float32x4_t, uint32x4_t, vreinterpretq_u32_f32);
NATIVE_CAST_IMPL(16, float32x4_t, int32x4_t, vreinterpretq_s32_f32);
NATIVE_CAST_IMPL(16, float32x4_t, uint16x8_t, vreinterpretq_u16_f32);
NATIVE_CAST_IMPL(16, float32x4_t, int16x8_t, vreinterpretq_s16_f32);
NATIVE_CAST_IMPL(16, float32x4_t, uint8x16_t, vreinterpretq_u8_f32);
NATIVE_CAST_IMPL(16, float32x4_t, int8x16_t, vreinterpretq_s8_f32);

NATIVE_CAST_IMPL(16, uint64x2_t, int64x2_t, vreinterpretq_s64_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, uint32x4_t, vreinterpretq_u32_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, int32x4_t, vreinterpretq_s32_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, uint16x8_t, vreinterpretq_u16_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, int16x8_t, vreinterpretq_s16_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, uint8x16_t, vreinterpretq_u8_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, int8x16_t, vreinterpretq_s8_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, float32x4_t, vreinterpretq_f32_u64);

NATIVE_CAST_IMPL(16, int64x2_t, uint64x2_t, vreinterpretq_u64_s64);
NATIVE_CAST_IMPL(16, int64x2_t, uint32x4_t, vreinterpretq_u32_s64);
NATIVE_CAST_IMPL(16, int64x2_t, int32x4_t, vreinterpretq_s32_s64);
NATIVE_CAST_IMPL(16, int64x2_t, uint16x8_t, vreinterpretq_u16_s64);
NATIVE_CAST_IMPL(16, int64x2_t, int16x8_t, vreinterpretq_s16_s64);
NATIVE_CAST_IMPL(16, int64x2_t, uint8x16_t, vreinterpretq_u8_s64);
NATIVE_CAST_IMPL(16, int64x2_t, int8x16_t, vreinterpretq_s8_s64);
NATIVE_CAST_IMPL(16, int64x2_t, float32x4_t, vreinterpretq_f32_s64);

NATIVE_CAST_IMPL(16, uint32x4_t, uint64x2_t, vreinterpretq_u64_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, int64x2_t, vreinterpretq_s64_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, int32x4_t, vreinterpretq_s32_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, uint16x8_t, vreinterpretq_u16_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, int16x8_t, vreinterpretq_s16_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, uint8x16_t, vreinterpretq_u8_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, int8x16_t, vreinterpretq_s8_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, float32x4_t, vreinterpretq_f32_u32);

NATIVE_CAST_IMPL(16, int32x4_t, uint64x2_t, vreinterpretq_u64_s32);
NATIVE_CAST_IMPL(16, int32x4_t, int64x2_t, vreinterpretq_s64_s32);
NATIVE_CAST_IMPL(16, int32x4_t, uint32x4_t, vreinterpretq_u32_s32);
NATIVE_CAST_IMPL(16, int32x4_t, uint16x8_t, vreinterpretq_u16_s32);
NATIVE_CAST_IMPL(16, int32x4_t, int16x8_t, vreinterpretq_s16_s32);
NATIVE_CAST_IMPL(16, int32x4_t, uint8x16_t, vreinterpretq_u8_s32);
NATIVE_CAST_IMPL(16, int32x4_t, int8x16_t, vreinterpretq_s8_s32);
NATIVE_CAST_IMPL(16, int32x4_t, float32x4_t, vreinterpretq_f32_s32);

NATIVE_CAST_IMPL(16, uint16x8_t, uint64x2_t, vreinterpretq_u64_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, int64x2_t, vreinterpretq_s64_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, uint32x4_t, vreinterpretq_u32_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, int32x4_t, vreinterpretq_s32_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, int16x8_t, vreinterpretq_s16_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, uint8x16_t, vreinterpretq_u8_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, int8x16_t, vreinterpretq_s8_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, float32x4_t, vreinterpretq_f32_u16);

NATIVE_CAST_IMPL(16, int16x8_t, uint64x2_t, vreinterpretq_u64_s16);
NATIVE_CAST_IMPL(16, int16x8_t, int64x2_t, vreinterpretq_s64_s16);
NATIVE_CAST_IMPL(16, int16x8_t, uint32x4_t, vreinterpretq_u32_s16);
NATIVE_CAST_IMPL(16, int16x8_t, int32x4_t, vreinterpretq_s32_s16);
NATIVE_CAST_IMPL(16, int16x8_t, uint16x8_t, vreinterpretq_u16_s16);
NATIVE_CAST_IMPL(16, int16x8_t, uint8x16_t, vreinterpretq_u8_s16);
NATIVE_CAST_IMPL(16, int16x8_t, int8x16_t, vreinterpretq_s8_s16);
NATIVE_CAST_IMPL(16, int16x8_t, float32x4_t, vreinterpretq_f32_s16);

NATIVE_CAST_IMPL(16, uint8x16_t, uint64x2_t, vreinterpretq_u64_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, int64x2_t, vreinterpretq_s64_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, uint32x4_t, vreinterpretq_u32_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, int32x4_t, vreinterpretq_s32_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, uint16x8_t, vreinterpretq_u16_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, int16x8_t, vreinterpretq_s16_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, int8x16_t, vreinterpretq_s8_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, float32x4_t, vreinterpretq_f32_u8);

NATIVE_CAST_IMPL(16, int8x16_t, uint64x2_t, vreinterpretq_u64_s8);
NATIVE_CAST_IMPL(16, int8x16_t, int64x2_t, vreinterpretq_s64_s8);
NATIVE_CAST_IMPL(16, int8x16_t, uint32x4_t, vreinterpretq_u32_s8);
NATIVE_CAST_IMPL(16, int8x16_t, int32x4_t, vreinterpretq_s32_s8);
NATIVE_CAST_IMPL(16, int8x16_t, uint16x8_t, vreinterpretq_u16_s8);
NATIVE_CAST_IMPL(16, int8x16_t, int16x8_t, vreinterpretq_s16_s8);
NATIVE_CAST_IMPL(16, int8x16_t, uint8x16_t, vreinterpretq_u8_s8);
NATIVE_CAST_IMPL(16, int8x16_t, float32x4_t, vreinterpretq_f32_s8);
#endif
197
#if SIMDPP_USE_NEON64
// AArch64 adds float64x2_t; casts to and from all other 16-byte vector types.
NATIVE_CAST_IMPL(16, float64x2_t, uint64x2_t, vreinterpretq_u64_f64);
NATIVE_CAST_IMPL(16, float64x2_t, int64x2_t, vreinterpretq_s64_f64);
NATIVE_CAST_IMPL(16, float64x2_t, uint32x4_t, vreinterpretq_u32_f64);
NATIVE_CAST_IMPL(16, float64x2_t, int32x4_t, vreinterpretq_s32_f64);
NATIVE_CAST_IMPL(16, float64x2_t, uint16x8_t, vreinterpretq_u16_f64);
NATIVE_CAST_IMPL(16, float64x2_t, int16x8_t, vreinterpretq_s16_f64);
NATIVE_CAST_IMPL(16, float64x2_t, uint8x16_t, vreinterpretq_u8_f64);
NATIVE_CAST_IMPL(16, float64x2_t, int8x16_t, vreinterpretq_s8_f64);
NATIVE_CAST_IMPL(16, float64x2_t, float32x4_t, vreinterpretq_f32_f64);

NATIVE_CAST_IMPL(16, uint64x2_t, float64x2_t, vreinterpretq_f64_u64);
NATIVE_CAST_IMPL(16, int64x2_t, float64x2_t, vreinterpretq_f64_s64);
NATIVE_CAST_IMPL(16, uint32x4_t, float64x2_t, vreinterpretq_f64_u32);
NATIVE_CAST_IMPL(16, int32x4_t, float64x2_t, vreinterpretq_f64_s32);
NATIVE_CAST_IMPL(16, uint16x8_t, float64x2_t, vreinterpretq_f64_u16);
NATIVE_CAST_IMPL(16, int16x8_t, float64x2_t, vreinterpretq_f64_s16);
NATIVE_CAST_IMPL(16, uint8x16_t, float64x2_t, vreinterpretq_f64_u8);
NATIVE_CAST_IMPL(16, int8x16_t, float64x2_t, vreinterpretq_f64_s8);
NATIVE_CAST_IMPL(16, float32x4_t, float64x2_t, vreinterpretq_f64_f32);
#endif
#undef NATIVE_CAST_IMPL
220
// Casts one native vector of type NativeT into two native vectors of type
// NativeR (split), or two NativeT vectors into one NativeR vector (combine).
// The size argument disambiguates vectors of different size on old GNU ABIs,
// as above.
template<unsigned SizeT, class NativeT, class NativeR> struct native_cast_split;
template<unsigned SizeR, class NativeT, class NativeR> struct native_cast_combine;
223
224#if SIMDPP_USE_AVX
225template<> struct native_cast_split<32, __m256, __m128i> {
226 static SIMDPP_INL void cast(const __m256& t, __m128i& r0, __m128i& r1)
227 {
228 r0 = _mm_castps_si128(_mm256_castps256_ps128(t));
229 r1 = _mm_castps_si128(_mm256_extractf128_ps(t, 1));
230 }
231};
232
233template<> struct native_cast_split<32, __m256d, __m128i> {
234 static SIMDPP_INL void cast(const __m256d& t, __m128i& r0, __m128i& r1)
235 {
236 r0 = _mm_castpd_si128(_mm256_castpd256_pd128(t));
237 r1 = _mm_castpd_si128(_mm256_extractf128_pd(t, 1));
238 }
239};
240
241template<> struct native_cast_combine<32, __m128i, __m256> {
242 static SIMDPP_INL __m256 cast(const __m128i& t0, const __m128i& t1)
243 {
244 __m256 r = _mm256_castsi256_ps(_mm256_castsi128_si256(t0));
245 r = _mm256_insertf128_ps(r, _mm_castsi128_ps(t1), 1);
246 return r;
247 }
248};
249
250template<> struct native_cast_combine<32, __m128i, __m256d> {
251 static SIMDPP_INL __m256d cast(const __m128i& t0, const __m128i& t1)
252 {
253 __m256d r = _mm256_castsi256_pd(_mm256_castsi128_si256(t0));
254 r = _mm256_insertf128_pd(r, _mm_castsi128_pd(t1), 1);
255 return r;
256 }
257};
258#endif
259
260#if SIMDPP_USE_AVX512F
261template<> struct native_cast_split<64, __m512i, __m256i> {
262 static SIMDPP_INL void cast(const __m512i& t, __m256i& r0, __m256i& r1)
263 {
264 r0 = _mm512_castsi512_si256(t);
265 r1 = _mm512_extracti64x4_epi64(t, 1);
266 }
267};
268
269template<> struct native_cast_split<64, __m512, __m256i> {
270 static SIMDPP_INL void cast(const __m512& t, __m256i& r0, __m256i& r1)
271 {
272 r0 = _mm256_castps_si256(_mm512_castps512_ps256(t));
273 r1 = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castps_pd(t), 1));
274 }
275};
276
277template<> struct native_cast_split<64, __m512d, __m256i> {
278 static SIMDPP_INL void cast(const __m512d& t, __m256i& r0, __m256i& r1)
279 {
280 r0 = _mm256_castpd_si256(_mm512_castpd512_pd256(t));
281 r1 = _mm256_castpd_si256(_mm512_extractf64x4_pd(t, 1));
282 }
283};
284
285template<> struct native_cast_combine<64, __m256i, __m512i> {
286 static SIMDPP_INL __m512i cast(const __m256i& t0, const __m256i& t1)
287 {
288 __m512i r = _mm512_castsi256_si512(t0);
289 return _mm512_inserti64x4(r, t1, 1);
290 }
291};
292
293template<> struct native_cast_combine<64, __m256i, __m512> {
294 static SIMDPP_INL __m512 cast(const __m256i& t0, const __m256i& t1)
295 {
296 __m512d r = _mm512_castsi512_pd(_mm512_castsi256_si512(t0));
297 r = _mm512_insertf64x4(r, _mm256_castsi256_pd(t1), 1);
298 return _mm512_castpd_ps(r);
299 }
300};
301
302template<> struct native_cast_combine<64, __m256i, __m512d> {
303 static SIMDPP_INL __m512d cast(const __m256i& t0, const __m256i& t1)
304 {
305 __m512d r = _mm512_castsi512_pd(_mm512_castsi256_si512(t0));
306 r = _mm512_insertf64x4(r, _mm256_castsi256_pd(t1), 1);
307 return r;
308 }
309};
310#endif
311
// Implements the cast strategy selected by one of the VECTOR_CAST_TYPE_*
// constants.
template<unsigned CastType>
struct cast_bitwise_vector_impl;

// Detects the vararray template — presumably the storage used when no true
// native vector type is available (see native_cast's IsVarArray fallback).
template<class T>
struct is_vararray : std::false_type {};

template<class T, unsigned N>
struct is_vararray<vararray<T, N>> : std::true_type {};
320
321template<>
322struct cast_bitwise_vector_impl<VECTOR_CAST_TYPE_1_TO_1> {
323 template<class T, class R> SIMDPP_INL static
324 void cast(const T& t, R& r)
325 {
326 using NativeT = typename T::base_vector_type::native_type;
327 using NativeR = typename R::base_vector_type::native_type;
328 const bool is_arg_vararray =
329 is_vararray<NativeT>::value || is_vararray<NativeR>::value;
330 using CastImpl = native_cast<sizeof(NativeT), NativeT,
331 NativeR, is_arg_vararray>;
332
333 for (unsigned i = 0; i < T::vec_length; ++i) {
334 r.vec(i) = CastImpl::cast(t.vec(i).native());
335 }
336 }
337};
338
339template<>
340struct cast_bitwise_vector_impl<VECTOR_CAST_TYPE_SPLIT2> {
341 template<class T, class R> SIMDPP_INL static
342 void cast(const T& t, R& r)
343 {
344 using NativeT = typename T::base_vector_type::native_type;
345 using NativeR = typename R::base_vector_type::native_type;
346 using CastImpl = native_cast_split<sizeof(NativeT), NativeT, NativeR>;
347
348 for (unsigned i = 0; i < T::vec_length; ++i) {
349 NativeR r0, r1;
350 CastImpl::cast(t.vec(i).native(), r0, r1);
351 r.vec(i*2) = r0;
352 r.vec(i*2+1) = r1;
353 }
354 }
355};
356
357template<>
358struct cast_bitwise_vector_impl<VECTOR_CAST_TYPE_COMBINE2> {
359 template<class T, class R> SIMDPP_INL static
360 void cast(const T& t, R& r)
361 {
362 using NativeT = typename T::base_vector_type::native_type;
363 using NativeR = typename R::base_vector_type::native_type;
364 using CastImpl = native_cast_combine<sizeof(NativeR), NativeT, NativeR>;
365
366 for (unsigned i = 0; i < R::vec_length; ++i) {
367 r.vec(i) = CastImpl::cast(t.vec(i*2).native(),
368 t.vec(i*2+1).native());
369 }
370 }
371};
372
373template<class T, class R> SIMDPP_INL
374void cast_bitwise_vector(const T& t, R& r)
375{
376 static_assert(sizeof(R) == sizeof(T), "Size mismatch");
377 const unsigned vector_cast_type =
378 T::vec_length == R::vec_length ? VECTOR_CAST_TYPE_1_TO_1 :
379 T::vec_length == R::vec_length*2 ? VECTOR_CAST_TYPE_COMBINE2 :
380 T::vec_length*2 == R::vec_length ? VECTOR_CAST_TYPE_SPLIT2 :
381 VECTOR_CAST_TYPE_INVALID;
382
383 cast_bitwise_vector_impl<vector_cast_type>::cast(t, r);
384}
385
386#if (__GNUC__ >= 6) && !defined(__INTEL_COMPILER) && !defined(__clang__)
387#pragma GCC diagnostic pop
388#endif
389
390} // namespace detail
391} // namespace SIMDPP_ARCH_NAMESPACE
392} // namespace simdpp
393
394#endif
395