/*  Copyright (C) 2013  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_DETAIL_CAST_BITWISE_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_DETAIL_CAST_BITWISE_H

#include <simdpp/types.h>
#include <type_traits> // std::true_type / std::false_type, used by is_vararray below

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {

/*  Note that this function invokes undefined behavior that happens to work on
    all compilers the library supports. The only well-defined way to do a
    bitwise data transfer between unrelated types without breaking strict
    aliasing rules is the memcpy() function. Unfortunately, some compilers
    can't fully optimize out the overhead of that function, which leads to
    unnecessary data movement to the stack.

    Note that this function does not fully work with vector types even in
    C++11 mode, where they are trivial types and thus may be placed in a
    union. Vectors containing one or two native vectors are fine, but larger
    vectors containing 4 or more native vectors result in internal compiler
    errors or miscompiled code on some compilers.
*/
template<class T, class R> SIMDPP_INL
void cast_bitwise(const T& t, R& r)
{
    static_assert(sizeof(R) == sizeof(T), "Size mismatch");
    union {
        T t_union;
        R r_union;
    };
    t_union = t;
    r = r_union;
}
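
/*  For illustration only (not part of the library): the strictly conforming
    alternative mentioned above would route the transfer through std::memcpy,
    roughly as follows (would also require <cstring>):

        template<class T, class R> SIMDPP_INL
        void cast_bitwise_memcpy(const T& t, R& r)
        {
            static_assert(sizeof(R) == sizeof(T), "Size mismatch");
            std::memcpy(&r, &t, sizeof(R));
        }

    The union-based version above is kept because some supported compilers do
    not optimize the memcpy() call away.
*/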

enum {
    VECTOR_CAST_TYPE_1_TO_1,   // source and result consist of the same number of native vectors
    VECTOR_CAST_TYPE_SPLIT2,   // each source native vector maps to two result native vectors
    VECTOR_CAST_TYPE_COMBINE2, // two source native vectors map to one result native vector
    VECTOR_CAST_TYPE_INVALID
};

#if (__GNUC__ >= 6) && !defined(__INTEL_COMPILER) && !defined(__clang__)
/*  native_cast, native_cast_split and native_cast_combine use native vector
    types as class template parameters. On GCC, vector types have alignment
    attributes specified on some architectures. This leads to an "ignored
    attributes" warning, because the attributes are not part of the type.
    Since libsimdpp always uses the same attributes for all native_type
    members, we can safely ignore this warning.
*/
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif

// The Size argument is needed to disambiguate vectors of different sizes on
// old GNU ABIs.
template<unsigned Size, class NativeT, class NativeR, bool IsVarArray>
struct native_cast;

// Default case: convert via an explicit (functional-style) cast.
template<unsigned Size, class T, class R> struct native_cast<Size, T, R, false> {
    static SIMDPP_INL R cast(const T& t) { return R(t); }
};

// Identity cast.
template<unsigned Size, class T> struct native_cast<Size, T, T, false> {
    static SIMDPP_INL T cast(const T& t) { return t; }
};

// Vararray-backed vectors have no native cast; fall back to a bitwise copy.
template<unsigned Size, class T, class R> struct native_cast<Size, T, R, true> {
    static SIMDPP_INL R cast(const T& t)
    {
        R r;
        cast_bitwise(t, r);
        return r;
    }
};

#define NATIVE_CAST_IMPL(SIZE, T_TYPE, R_TYPE, FUNC)                        \
template<> struct native_cast<SIZE, T_TYPE, R_TYPE, false> {                \
    static SIMDPP_INL R_TYPE cast(const T_TYPE& t) { return FUNC(t); }      \
}
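
// For reference, an invocation such as
//     NATIVE_CAST_IMPL(16, __m128, __m128i, _mm_castps_si128);
// below expands to:
//     template<> struct native_cast<16, __m128, __m128i, false> {
//         static SIMDPP_INL __m128i cast(const __m128& t) { return _mm_castps_si128(t); }
//     };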

#if SIMDPP_USE_SSE2
NATIVE_CAST_IMPL(16, __m128, __m128i, _mm_castps_si128);
NATIVE_CAST_IMPL(16, __m128, __m128d, _mm_castps_pd);
NATIVE_CAST_IMPL(16, __m128i, __m128, _mm_castsi128_ps);
NATIVE_CAST_IMPL(16, __m128i, __m128d, _mm_castsi128_pd);
NATIVE_CAST_IMPL(16, __m128d, __m128i, _mm_castpd_si128);
NATIVE_CAST_IMPL(16, __m128d, __m128, _mm_castpd_ps);
#endif

#if SIMDPP_USE_AVX
NATIVE_CAST_IMPL(32, __m256, __m256i, _mm256_castps_si256);
NATIVE_CAST_IMPL(32, __m256, __m256d, _mm256_castps_pd);
NATIVE_CAST_IMPL(32, __m256i, __m256, _mm256_castsi256_ps);
NATIVE_CAST_IMPL(32, __m256i, __m256d, _mm256_castsi256_pd);
NATIVE_CAST_IMPL(32, __m256d, __m256i, _mm256_castpd_si256);
NATIVE_CAST_IMPL(32, __m256d, __m256, _mm256_castpd_ps);
#endif

#if SIMDPP_USE_AVX512F
NATIVE_CAST_IMPL(64, __m512, __m512i, _mm512_castps_si512);
NATIVE_CAST_IMPL(64, __m512, __m512d, _mm512_castps_pd);
NATIVE_CAST_IMPL(64, __m512i, __m512, _mm512_castsi512_ps);
NATIVE_CAST_IMPL(64, __m512i, __m512d, _mm512_castsi512_pd);
NATIVE_CAST_IMPL(64, __m512d, __m512i, _mm512_castpd_si512);
NATIVE_CAST_IMPL(64, __m512d, __m512, _mm512_castpd_ps);
#endif

#if SIMDPP_USE_NEON
NATIVE_CAST_IMPL(16, float32x4_t, uint64x2_t, vreinterpretq_u64_f32);
NATIVE_CAST_IMPL(16, float32x4_t, int64x2_t, vreinterpretq_s64_f32);
NATIVE_CAST_IMPL(16, float32x4_t, uint32x4_t, vreinterpretq_u32_f32);
NATIVE_CAST_IMPL(16, float32x4_t, int32x4_t, vreinterpretq_s32_f32);
NATIVE_CAST_IMPL(16, float32x4_t, uint16x8_t, vreinterpretq_u16_f32);
NATIVE_CAST_IMPL(16, float32x4_t, int16x8_t, vreinterpretq_s16_f32);
NATIVE_CAST_IMPL(16, float32x4_t, uint8x16_t, vreinterpretq_u8_f32);
NATIVE_CAST_IMPL(16, float32x4_t, int8x16_t, vreinterpretq_s8_f32);

NATIVE_CAST_IMPL(16, uint64x2_t, int64x2_t, vreinterpretq_s64_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, uint32x4_t, vreinterpretq_u32_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, int32x4_t, vreinterpretq_s32_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, uint16x8_t, vreinterpretq_u16_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, int16x8_t, vreinterpretq_s16_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, uint8x16_t, vreinterpretq_u8_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, int8x16_t, vreinterpretq_s8_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, float32x4_t, vreinterpretq_f32_u64);

NATIVE_CAST_IMPL(16, int64x2_t, uint64x2_t, vreinterpretq_u64_s64);
NATIVE_CAST_IMPL(16, int64x2_t, uint32x4_t, vreinterpretq_u32_s64);
NATIVE_CAST_IMPL(16, int64x2_t, int32x4_t, vreinterpretq_s32_s64);
NATIVE_CAST_IMPL(16, int64x2_t, uint16x8_t, vreinterpretq_u16_s64);
NATIVE_CAST_IMPL(16, int64x2_t, int16x8_t, vreinterpretq_s16_s64);
NATIVE_CAST_IMPL(16, int64x2_t, uint8x16_t, vreinterpretq_u8_s64);
NATIVE_CAST_IMPL(16, int64x2_t, int8x16_t, vreinterpretq_s8_s64);
NATIVE_CAST_IMPL(16, int64x2_t, float32x4_t, vreinterpretq_f32_s64);

NATIVE_CAST_IMPL(16, uint32x4_t, uint64x2_t, vreinterpretq_u64_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, int64x2_t, vreinterpretq_s64_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, int32x4_t, vreinterpretq_s32_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, uint16x8_t, vreinterpretq_u16_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, int16x8_t, vreinterpretq_s16_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, uint8x16_t, vreinterpretq_u8_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, int8x16_t, vreinterpretq_s8_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, float32x4_t, vreinterpretq_f32_u32);

NATIVE_CAST_IMPL(16, int32x4_t, uint64x2_t, vreinterpretq_u64_s32);
NATIVE_CAST_IMPL(16, int32x4_t, int64x2_t, vreinterpretq_s64_s32);
NATIVE_CAST_IMPL(16, int32x4_t, uint32x4_t, vreinterpretq_u32_s32);
NATIVE_CAST_IMPL(16, int32x4_t, uint16x8_t, vreinterpretq_u16_s32);
NATIVE_CAST_IMPL(16, int32x4_t, int16x8_t, vreinterpretq_s16_s32);
NATIVE_CAST_IMPL(16, int32x4_t, uint8x16_t, vreinterpretq_u8_s32);
NATIVE_CAST_IMPL(16, int32x4_t, int8x16_t, vreinterpretq_s8_s32);
NATIVE_CAST_IMPL(16, int32x4_t, float32x4_t, vreinterpretq_f32_s32);

NATIVE_CAST_IMPL(16, uint16x8_t, uint64x2_t, vreinterpretq_u64_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, int64x2_t, vreinterpretq_s64_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, uint32x4_t, vreinterpretq_u32_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, int32x4_t, vreinterpretq_s32_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, int16x8_t, vreinterpretq_s16_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, uint8x16_t, vreinterpretq_u8_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, int8x16_t, vreinterpretq_s8_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, float32x4_t, vreinterpretq_f32_u16);

NATIVE_CAST_IMPL(16, int16x8_t, uint64x2_t, vreinterpretq_u64_s16);
NATIVE_CAST_IMPL(16, int16x8_t, int64x2_t, vreinterpretq_s64_s16);
NATIVE_CAST_IMPL(16, int16x8_t, uint32x4_t, vreinterpretq_u32_s16);
NATIVE_CAST_IMPL(16, int16x8_t, int32x4_t, vreinterpretq_s32_s16);
NATIVE_CAST_IMPL(16, int16x8_t, uint16x8_t, vreinterpretq_u16_s16);
NATIVE_CAST_IMPL(16, int16x8_t, uint8x16_t, vreinterpretq_u8_s16);
NATIVE_CAST_IMPL(16, int16x8_t, int8x16_t, vreinterpretq_s8_s16);
NATIVE_CAST_IMPL(16, int16x8_t, float32x4_t, vreinterpretq_f32_s16);

NATIVE_CAST_IMPL(16, uint8x16_t, uint64x2_t, vreinterpretq_u64_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, int64x2_t, vreinterpretq_s64_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, uint32x4_t, vreinterpretq_u32_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, int32x4_t, vreinterpretq_s32_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, uint16x8_t, vreinterpretq_u16_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, int16x8_t, vreinterpretq_s16_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, int8x16_t, vreinterpretq_s8_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, float32x4_t, vreinterpretq_f32_u8);

NATIVE_CAST_IMPL(16, int8x16_t, uint64x2_t, vreinterpretq_u64_s8);
NATIVE_CAST_IMPL(16, int8x16_t, int64x2_t, vreinterpretq_s64_s8);
NATIVE_CAST_IMPL(16, int8x16_t, uint32x4_t, vreinterpretq_u32_s8);
NATIVE_CAST_IMPL(16, int8x16_t, int32x4_t, vreinterpretq_s32_s8);
NATIVE_CAST_IMPL(16, int8x16_t, uint16x8_t, vreinterpretq_u16_s8);
NATIVE_CAST_IMPL(16, int8x16_t, int16x8_t, vreinterpretq_s16_s8);
NATIVE_CAST_IMPL(16, int8x16_t, uint8x16_t, vreinterpretq_u8_s8);
NATIVE_CAST_IMPL(16, int8x16_t, float32x4_t, vreinterpretq_f32_s8);
#endif

#if SIMDPP_USE_NEON64
NATIVE_CAST_IMPL(16, float64x2_t, uint64x2_t, vreinterpretq_u64_f64);
NATIVE_CAST_IMPL(16, float64x2_t, int64x2_t, vreinterpretq_s64_f64);
NATIVE_CAST_IMPL(16, float64x2_t, uint32x4_t, vreinterpretq_u32_f64);
NATIVE_CAST_IMPL(16, float64x2_t, int32x4_t, vreinterpretq_s32_f64);
NATIVE_CAST_IMPL(16, float64x2_t, uint16x8_t, vreinterpretq_u16_f64);
NATIVE_CAST_IMPL(16, float64x2_t, int16x8_t, vreinterpretq_s16_f64);
NATIVE_CAST_IMPL(16, float64x2_t, uint8x16_t, vreinterpretq_u8_f64);
NATIVE_CAST_IMPL(16, float64x2_t, int8x16_t, vreinterpretq_s8_f64);
NATIVE_CAST_IMPL(16, float64x2_t, float32x4_t, vreinterpretq_f32_f64);

NATIVE_CAST_IMPL(16, uint64x2_t, float64x2_t, vreinterpretq_f64_u64);
NATIVE_CAST_IMPL(16, int64x2_t, float64x2_t, vreinterpretq_f64_s64);
NATIVE_CAST_IMPL(16, uint32x4_t, float64x2_t, vreinterpretq_f64_u32);
NATIVE_CAST_IMPL(16, int32x4_t, float64x2_t, vreinterpretq_f64_s32);
NATIVE_CAST_IMPL(16, uint16x8_t, float64x2_t, vreinterpretq_f64_u16);
NATIVE_CAST_IMPL(16, int16x8_t, float64x2_t, vreinterpretq_f64_s16);
NATIVE_CAST_IMPL(16, uint8x16_t, float64x2_t, vreinterpretq_f64_u8);
NATIVE_CAST_IMPL(16, int8x16_t, float64x2_t, vreinterpretq_f64_s8);
NATIVE_CAST_IMPL(16, float32x4_t, float64x2_t, vreinterpretq_f64_f32);
#endif
#undef NATIVE_CAST_IMPL

// Splits one native vector into two half-width native vectors, or combines
// two native vectors into one double-width native vector. Only the
// conversions that are actually needed below are specialized.
template<unsigned SizeT, class NativeT, class NativeR> struct native_cast_split;
template<unsigned SizeR, class NativeT, class NativeR> struct native_cast_combine;

#if SIMDPP_USE_AVX
template<> struct native_cast_split<32, __m256, __m128i> {
    static SIMDPP_INL void cast(const __m256& t, __m128i& r0, __m128i& r1)
    {
        r0 = _mm_castps_si128(_mm256_castps256_ps128(t));
        r1 = _mm_castps_si128(_mm256_extractf128_ps(t, 1));
    }
};

template<> struct native_cast_split<32, __m256d, __m128i> {
    static SIMDPP_INL void cast(const __m256d& t, __m128i& r0, __m128i& r1)
    {
        r0 = _mm_castpd_si128(_mm256_castpd256_pd128(t));
        r1 = _mm_castpd_si128(_mm256_extractf128_pd(t, 1));
    }
};

template<> struct native_cast_combine<32, __m128i, __m256> {
    static SIMDPP_INL __m256 cast(const __m128i& t0, const __m128i& t1)
    {
        __m256 r = _mm256_castsi256_ps(_mm256_castsi128_si256(t0));
        r = _mm256_insertf128_ps(r, _mm_castsi128_ps(t1), 1);
        return r;
    }
};

template<> struct native_cast_combine<32, __m128i, __m256d> {
    static SIMDPP_INL __m256d cast(const __m128i& t0, const __m128i& t1)
    {
        __m256d r = _mm256_castsi256_pd(_mm256_castsi128_si256(t0));
        r = _mm256_insertf128_pd(r, _mm_castsi128_pd(t1), 1);
        return r;
    }
};
#endif

#if SIMDPP_USE_AVX512F
template<> struct native_cast_split<64, __m512i, __m256i> {
    static SIMDPP_INL void cast(const __m512i& t, __m256i& r0, __m256i& r1)
    {
        r0 = _mm512_castsi512_si256(t);
        r1 = _mm512_extracti64x4_epi64(t, 1);
    }
};

template<> struct native_cast_split<64, __m512, __m256i> {
    static SIMDPP_INL void cast(const __m512& t, __m256i& r0, __m256i& r1)
    {
        r0 = _mm256_castps_si256(_mm512_castps512_ps256(t));
        r1 = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castps_pd(t), 1));
    }
};

template<> struct native_cast_split<64, __m512d, __m256i> {
    static SIMDPP_INL void cast(const __m512d& t, __m256i& r0, __m256i& r1)
    {
        r0 = _mm256_castpd_si256(_mm512_castpd512_pd256(t));
        r1 = _mm256_castpd_si256(_mm512_extractf64x4_pd(t, 1));
    }
};

template<> struct native_cast_combine<64, __m256i, __m512i> {
    static SIMDPP_INL __m512i cast(const __m256i& t0, const __m256i& t1)
    {
        __m512i r = _mm512_castsi256_si512(t0);
        return _mm512_inserti64x4(r, t1, 1);
    }
};

template<> struct native_cast_combine<64, __m256i, __m512> {
    static SIMDPP_INL __m512 cast(const __m256i& t0, const __m256i& t1)
    {
        __m512d r = _mm512_castsi512_pd(_mm512_castsi256_si512(t0));
        r = _mm512_insertf64x4(r, _mm256_castsi256_pd(t1), 1);
        return _mm512_castpd_ps(r);
    }
};

template<> struct native_cast_combine<64, __m256i, __m512d> {
    static SIMDPP_INL __m512d cast(const __m256i& t0, const __m256i& t1)
    {
        __m512d r = _mm512_castsi512_pd(_mm512_castsi256_si512(t0));
        r = _mm512_insertf64x4(r, _mm256_castsi256_pd(t1), 1);
        return r;
    }
};
#endif

template<unsigned CastType>
struct cast_bitwise_vector_impl;

// Detects whether a "native" type is in fact the vararray fallback (used when
// no true native vector type is available) rather than a real native vector.
template<class T>
struct is_vararray : std::false_type {};

template<class T, unsigned N>
struct is_vararray<vararray<T, N>> : std::true_type {};

template<>
struct cast_bitwise_vector_impl<VECTOR_CAST_TYPE_1_TO_1> {
    template<class T, class R> SIMDPP_INL static
    void cast(const T& t, R& r)
    {
        using NativeT = typename T::base_vector_type::native_type;
        using NativeR = typename R::base_vector_type::native_type;
        const bool is_arg_vararray =
            is_vararray<NativeT>::value || is_vararray<NativeR>::value;
        using CastImpl = native_cast<sizeof(NativeT), NativeT,
                                     NativeR, is_arg_vararray>;

        for (unsigned i = 0; i < T::vec_length; ++i) {
            r.vec(i) = CastImpl::cast(t.vec(i).native());
        }
    }
};

template<>
struct cast_bitwise_vector_impl<VECTOR_CAST_TYPE_SPLIT2> {
    template<class T, class R> SIMDPP_INL static
    void cast(const T& t, R& r)
    {
        using NativeT = typename T::base_vector_type::native_type;
        using NativeR = typename R::base_vector_type::native_type;
        using CastImpl = native_cast_split<sizeof(NativeT), NativeT, NativeR>;

        for (unsigned i = 0; i < T::vec_length; ++i) {
            NativeR r0, r1;
            CastImpl::cast(t.vec(i).native(), r0, r1);
            r.vec(i*2) = r0;
            r.vec(i*2+1) = r1;
        }
    }
};

template<>
struct cast_bitwise_vector_impl<VECTOR_CAST_TYPE_COMBINE2> {
    template<class T, class R> SIMDPP_INL static
    void cast(const T& t, R& r)
    {
        using NativeT = typename T::base_vector_type::native_type;
        using NativeR = typename R::base_vector_type::native_type;
        using CastImpl = native_cast_combine<sizeof(NativeR), NativeT, NativeR>;

        for (unsigned i = 0; i < R::vec_length; ++i) {
            r.vec(i) = CastImpl::cast(t.vec(i*2).native(),
                                      t.vec(i*2+1).native());
        }
    }
};

template<class T, class R> SIMDPP_INL
void cast_bitwise_vector(const T& t, R& r)
{
    static_assert(sizeof(R) == sizeof(T), "Size mismatch");
    const unsigned vector_cast_type =
        T::vec_length == R::vec_length ? VECTOR_CAST_TYPE_1_TO_1 :
        T::vec_length == R::vec_length*2 ? VECTOR_CAST_TYPE_COMBINE2 :
        T::vec_length*2 == R::vec_length ? VECTOR_CAST_TYPE_SPLIT2 :
        VECTOR_CAST_TYPE_INVALID;

    cast_bitwise_vector_impl<vector_cast_type>::cast(t, r);
}
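
/*  Dispatch sketch (for exposition only; the exact native types depend on the
    enabled instruction set):

        float32<8> f = ...;
        uint32<8> u;
        cast_bitwise_vector(f, u);

    On an AVX-only build (no AVX2), float32<8> is backed by one __m256 while
    uint32<8> is backed by two __m128i, so this cast takes the
    VECTOR_CAST_TYPE_SPLIT2 path and is handled by
    native_cast_split<32, __m256, __m128i> above; the opposite direction takes
    the VECTOR_CAST_TYPE_COMBINE2 path.
*/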

#if (__GNUC__ >= 6) && !defined(__INTEL_COMPILER) && !defined(__clang__)
#pragma GCC diagnostic pop
#endif

} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif