/*  Copyright (C) 2013  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_DETAIL_CAST_BITWISE_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_DETAIL_CAST_BITWISE_H

#include <simdpp/types.h>
#include <type_traits> // std::true_type / std::false_type, used by is_vararray below

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {

/*  Note that this function invokes undefined behavior that happens to work on
    all compilers the library supports. The only well-defined way to do a
    bitwise data transfer between unrelated types without breaking strict
    aliasing rules is the memcpy() function. Unfortunately, some compilers
    can't fully optimize out the overhead of that function, which leads to
    unnecessary data movement to the stack.

    Note that this function does not fully work with vector types even in
    C++11 mode, where they are trivial types and thus may be placed in a
    union. Vectors containing one or two native vectors are fine, but larger
    vectors containing 4 or more native vectors result in internal compiler
    errors or miscompiled code on some compilers.
*/
template<class T, class R> SIMDPP_INL
void cast_bitwise(const T& t, R& r)
{
    static_assert(sizeof(R) == sizeof(T), "Size mismatch");
    union {
        T t_union;
        R r_union;
    };
    t_union = t;
    r = r_union;
}
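
/*  For illustration only (not part of the library): the strictly conforming
    alternative mentioned above would route the transfer through std::memcpy,
    roughly as follows (would also require <cstring>):

        template<class T, class R> SIMDPP_INL
        void cast_bitwise_memcpy(const T& t, R& r)
        {
            static_assert(sizeof(R) == sizeof(T), "Size mismatch");
            std::memcpy(&r, &t, sizeof(R));
        }

    The union-based version above is kept because some supported compilers do
    not optimize the memcpy() call away.
*/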

enum {
    VECTOR_CAST_TYPE_1_TO_1,   // source and result consist of the same number of native vectors
    VECTOR_CAST_TYPE_SPLIT2,   // each source native vector maps to two result native vectors
    VECTOR_CAST_TYPE_COMBINE2, // two source native vectors map to one result native vector
    VECTOR_CAST_TYPE_INVALID
};

#if (__GNUC__ >= 6) && !defined(__INTEL_COMPILER) && !defined(__clang__)
/*  native_cast, native_cast_split and native_cast_combine use native vector
    types as class template parameters. On GCC, vector types have alignment
    attributes specified on some architectures. This leads to an "ignored
    attributes" warning, because the attributes are not part of the type.
    Since libsimdpp always uses the same attributes for all native_type
    members, we can safely ignore this warning.
*/
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif

// The Size argument is needed to disambiguate vectors of different sizes on
// old GNU ABIs.
template<unsigned Size, class NativeT, class NativeR, bool IsVarArray>
struct native_cast;

// Default case: convert via an explicit (functional-style) cast.
template<unsigned Size, class T, class R> struct native_cast<Size, T, R, false> {
    static SIMDPP_INL R cast(const T& t) { return R(t); }
};

// Identity cast.
template<unsigned Size, class T> struct native_cast<Size, T, T, false> {
    static SIMDPP_INL T cast(const T& t) { return t; }
};

// Vararray-backed vectors have no native cast; fall back to a bitwise copy.
template<unsigned Size, class T, class R> struct native_cast<Size, T, R, true> {
    static SIMDPP_INL R cast(const T& t)
    {
        R r;
        cast_bitwise(t, r);
        return r;
    }
};

#define NATIVE_CAST_IMPL(SIZE, T_TYPE, R_TYPE, FUNC)                        \
template<> struct native_cast<SIZE, T_TYPE, R_TYPE, false> {                \
    static SIMDPP_INL R_TYPE cast(const T_TYPE& t) { return FUNC(t); }      \
}
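
// For reference, an invocation such as
//     NATIVE_CAST_IMPL(16, __m128, __m128i, _mm_castps_si128);
// below expands to:
//     template<> struct native_cast<16, __m128, __m128i, false> {
//         static SIMDPP_INL __m128i cast(const __m128& t) { return _mm_castps_si128(t); }
//     };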

#if SIMDPP_USE_SSE2
NATIVE_CAST_IMPL(16, __m128, __m128i, _mm_castps_si128);
NATIVE_CAST_IMPL(16, __m128, __m128d, _mm_castps_pd);
NATIVE_CAST_IMPL(16, __m128i, __m128, _mm_castsi128_ps);
NATIVE_CAST_IMPL(16, __m128i, __m128d, _mm_castsi128_pd);
NATIVE_CAST_IMPL(16, __m128d, __m128i, _mm_castpd_si128);
NATIVE_CAST_IMPL(16, __m128d, __m128, _mm_castpd_ps);
#endif

#if SIMDPP_USE_AVX
NATIVE_CAST_IMPL(32, __m256, __m256i, _mm256_castps_si256);
NATIVE_CAST_IMPL(32, __m256, __m256d, _mm256_castps_pd);
NATIVE_CAST_IMPL(32, __m256i, __m256, _mm256_castsi256_ps);
NATIVE_CAST_IMPL(32, __m256i, __m256d, _mm256_castsi256_pd);
NATIVE_CAST_IMPL(32, __m256d, __m256i, _mm256_castpd_si256);
NATIVE_CAST_IMPL(32, __m256d, __m256, _mm256_castpd_ps);
#endif

#if SIMDPP_USE_AVX512F
NATIVE_CAST_IMPL(64, __m512, __m512i, _mm512_castps_si512);
NATIVE_CAST_IMPL(64, __m512, __m512d, _mm512_castps_pd);
NATIVE_CAST_IMPL(64, __m512i, __m512, _mm512_castsi512_ps);
NATIVE_CAST_IMPL(64, __m512i, __m512d, _mm512_castsi512_pd);
NATIVE_CAST_IMPL(64, __m512d, __m512i, _mm512_castpd_si512);
NATIVE_CAST_IMPL(64, __m512d, __m512, _mm512_castpd_ps);
#endif

#if SIMDPP_USE_NEON
NATIVE_CAST_IMPL(16, float32x4_t, uint64x2_t, vreinterpretq_u64_f32);
NATIVE_CAST_IMPL(16, float32x4_t, int64x2_t, vreinterpretq_s64_f32);
NATIVE_CAST_IMPL(16, float32x4_t, uint32x4_t, vreinterpretq_u32_f32);
NATIVE_CAST_IMPL(16, float32x4_t, int32x4_t, vreinterpretq_s32_f32);
NATIVE_CAST_IMPL(16, float32x4_t, uint16x8_t, vreinterpretq_u16_f32);
NATIVE_CAST_IMPL(16, float32x4_t, int16x8_t, vreinterpretq_s16_f32);
NATIVE_CAST_IMPL(16, float32x4_t, uint8x16_t, vreinterpretq_u8_f32);
NATIVE_CAST_IMPL(16, float32x4_t, int8x16_t, vreinterpretq_s8_f32);

NATIVE_CAST_IMPL(16, uint64x2_t, int64x2_t, vreinterpretq_s64_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, uint32x4_t, vreinterpretq_u32_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, int32x4_t, vreinterpretq_s32_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, uint16x8_t, vreinterpretq_u16_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, int16x8_t, vreinterpretq_s16_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, uint8x16_t, vreinterpretq_u8_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, int8x16_t, vreinterpretq_s8_u64);
NATIVE_CAST_IMPL(16, uint64x2_t, float32x4_t, vreinterpretq_f32_u64);

NATIVE_CAST_IMPL(16, int64x2_t, uint64x2_t, vreinterpretq_u64_s64);
NATIVE_CAST_IMPL(16, int64x2_t, uint32x4_t, vreinterpretq_u32_s64);
NATIVE_CAST_IMPL(16, int64x2_t, int32x4_t, vreinterpretq_s32_s64);
NATIVE_CAST_IMPL(16, int64x2_t, uint16x8_t, vreinterpretq_u16_s64);
NATIVE_CAST_IMPL(16, int64x2_t, int16x8_t, vreinterpretq_s16_s64);
NATIVE_CAST_IMPL(16, int64x2_t, uint8x16_t, vreinterpretq_u8_s64);
NATIVE_CAST_IMPL(16, int64x2_t, int8x16_t, vreinterpretq_s8_s64);
NATIVE_CAST_IMPL(16, int64x2_t, float32x4_t, vreinterpretq_f32_s64);

NATIVE_CAST_IMPL(16, uint32x4_t, uint64x2_t, vreinterpretq_u64_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, int64x2_t, vreinterpretq_s64_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, int32x4_t, vreinterpretq_s32_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, uint16x8_t, vreinterpretq_u16_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, int16x8_t, vreinterpretq_s16_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, uint8x16_t, vreinterpretq_u8_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, int8x16_t, vreinterpretq_s8_u32);
NATIVE_CAST_IMPL(16, uint32x4_t, float32x4_t, vreinterpretq_f32_u32);

NATIVE_CAST_IMPL(16, int32x4_t, uint64x2_t, vreinterpretq_u64_s32);
NATIVE_CAST_IMPL(16, int32x4_t, int64x2_t, vreinterpretq_s64_s32);
NATIVE_CAST_IMPL(16, int32x4_t, uint32x4_t, vreinterpretq_u32_s32);
NATIVE_CAST_IMPL(16, int32x4_t, uint16x8_t, vreinterpretq_u16_s32);
NATIVE_CAST_IMPL(16, int32x4_t, int16x8_t, vreinterpretq_s16_s32);
NATIVE_CAST_IMPL(16, int32x4_t, uint8x16_t, vreinterpretq_u8_s32);
NATIVE_CAST_IMPL(16, int32x4_t, int8x16_t, vreinterpretq_s8_s32);
NATIVE_CAST_IMPL(16, int32x4_t, float32x4_t, vreinterpretq_f32_s32);

NATIVE_CAST_IMPL(16, uint16x8_t, uint64x2_t, vreinterpretq_u64_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, int64x2_t, vreinterpretq_s64_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, uint32x4_t, vreinterpretq_u32_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, int32x4_t, vreinterpretq_s32_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, int16x8_t, vreinterpretq_s16_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, uint8x16_t, vreinterpretq_u8_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, int8x16_t, vreinterpretq_s8_u16);
NATIVE_CAST_IMPL(16, uint16x8_t, float32x4_t, vreinterpretq_f32_u16);

NATIVE_CAST_IMPL(16, int16x8_t, uint64x2_t, vreinterpretq_u64_s16);
NATIVE_CAST_IMPL(16, int16x8_t, int64x2_t, vreinterpretq_s64_s16);
NATIVE_CAST_IMPL(16, int16x8_t, uint32x4_t, vreinterpretq_u32_s16);
NATIVE_CAST_IMPL(16, int16x8_t, int32x4_t, vreinterpretq_s32_s16);
NATIVE_CAST_IMPL(16, int16x8_t, uint16x8_t, vreinterpretq_u16_s16);
NATIVE_CAST_IMPL(16, int16x8_t, uint8x16_t, vreinterpretq_u8_s16);
NATIVE_CAST_IMPL(16, int16x8_t, int8x16_t, vreinterpretq_s8_s16);
NATIVE_CAST_IMPL(16, int16x8_t, float32x4_t, vreinterpretq_f32_s16);

NATIVE_CAST_IMPL(16, uint8x16_t, uint64x2_t, vreinterpretq_u64_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, int64x2_t, vreinterpretq_s64_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, uint32x4_t, vreinterpretq_u32_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, int32x4_t, vreinterpretq_s32_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, uint16x8_t, vreinterpretq_u16_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, int16x8_t, vreinterpretq_s16_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, int8x16_t, vreinterpretq_s8_u8);
NATIVE_CAST_IMPL(16, uint8x16_t, float32x4_t, vreinterpretq_f32_u8);

NATIVE_CAST_IMPL(16, int8x16_t, uint64x2_t, vreinterpretq_u64_s8);
NATIVE_CAST_IMPL(16, int8x16_t, int64x2_t, vreinterpretq_s64_s8);
NATIVE_CAST_IMPL(16, int8x16_t, uint32x4_t, vreinterpretq_u32_s8);
NATIVE_CAST_IMPL(16, int8x16_t, int32x4_t, vreinterpretq_s32_s8);
NATIVE_CAST_IMPL(16, int8x16_t, uint16x8_t, vreinterpretq_u16_s8);
NATIVE_CAST_IMPL(16, int8x16_t, int16x8_t, vreinterpretq_s16_s8);
NATIVE_CAST_IMPL(16, int8x16_t, uint8x16_t, vreinterpretq_u8_s8);
NATIVE_CAST_IMPL(16, int8x16_t, float32x4_t, vreinterpretq_f32_s8);
#endif

#if SIMDPP_USE_NEON64
NATIVE_CAST_IMPL(16, float64x2_t, uint64x2_t, vreinterpretq_u64_f64);
NATIVE_CAST_IMPL(16, float64x2_t, int64x2_t, vreinterpretq_s64_f64);
NATIVE_CAST_IMPL(16, float64x2_t, uint32x4_t, vreinterpretq_u32_f64);
NATIVE_CAST_IMPL(16, float64x2_t, int32x4_t, vreinterpretq_s32_f64);
NATIVE_CAST_IMPL(16, float64x2_t, uint16x8_t, vreinterpretq_u16_f64);
NATIVE_CAST_IMPL(16, float64x2_t, int16x8_t, vreinterpretq_s16_f64);
NATIVE_CAST_IMPL(16, float64x2_t, uint8x16_t, vreinterpretq_u8_f64);
NATIVE_CAST_IMPL(16, float64x2_t, int8x16_t, vreinterpretq_s8_f64);
NATIVE_CAST_IMPL(16, float64x2_t, float32x4_t, vreinterpretq_f32_f64);

NATIVE_CAST_IMPL(16, uint64x2_t, float64x2_t, vreinterpretq_f64_u64);
NATIVE_CAST_IMPL(16, int64x2_t, float64x2_t, vreinterpretq_f64_s64);
NATIVE_CAST_IMPL(16, uint32x4_t, float64x2_t, vreinterpretq_f64_u32);
NATIVE_CAST_IMPL(16, int32x4_t, float64x2_t, vreinterpretq_f64_s32);
NATIVE_CAST_IMPL(16, uint16x8_t, float64x2_t, vreinterpretq_f64_u16);
NATIVE_CAST_IMPL(16, int16x8_t, float64x2_t, vreinterpretq_f64_s16);
NATIVE_CAST_IMPL(16, uint8x16_t, float64x2_t, vreinterpretq_f64_u8);
NATIVE_CAST_IMPL(16, int8x16_t, float64x2_t, vreinterpretq_f64_s8);
NATIVE_CAST_IMPL(16, float32x4_t, float64x2_t, vreinterpretq_f64_f32);
#endif
#undef NATIVE_CAST_IMPL

// Splits one native vector into two half-width native vectors, or combines
// two native vectors into one double-width native vector. Only the
// conversions that are actually needed below are specialized.
template<unsigned SizeT, class NativeT, class NativeR> struct native_cast_split;
template<unsigned SizeR, class NativeT, class NativeR> struct native_cast_combine;

#if SIMDPP_USE_AVX
template<> struct native_cast_split<32, __m256, __m128i> {
    static SIMDPP_INL void cast(const __m256& t, __m128i& r0, __m128i& r1)
    {
        r0 = _mm_castps_si128(_mm256_castps256_ps128(t));
        r1 = _mm_castps_si128(_mm256_extractf128_ps(t, 1));
    }
};

template<> struct native_cast_split<32, __m256d, __m128i> {
    static SIMDPP_INL void cast(const __m256d& t, __m128i& r0, __m128i& r1)
    {
        r0 = _mm_castpd_si128(_mm256_castpd256_pd128(t));
        r1 = _mm_castpd_si128(_mm256_extractf128_pd(t, 1));
    }
};

template<> struct native_cast_combine<32, __m128i, __m256> {
    static SIMDPP_INL __m256 cast(const __m128i& t0, const __m128i& t1)
    {
        __m256 r = _mm256_castsi256_ps(_mm256_castsi128_si256(t0));
        r = _mm256_insertf128_ps(r, _mm_castsi128_ps(t1), 1);
        return r;
    }
};

template<> struct native_cast_combine<32, __m128i, __m256d> {
    static SIMDPP_INL __m256d cast(const __m128i& t0, const __m128i& t1)
    {
        __m256d r = _mm256_castsi256_pd(_mm256_castsi128_si256(t0));
        r = _mm256_insertf128_pd(r, _mm_castsi128_pd(t1), 1);
        return r;
    }
};
#endif

#if SIMDPP_USE_AVX512F
template<> struct native_cast_split<64, __m512i, __m256i> {
    static SIMDPP_INL void cast(const __m512i& t, __m256i& r0, __m256i& r1)
    {
        r0 = _mm512_castsi512_si256(t);
        r1 = _mm512_extracti64x4_epi64(t, 1);
    }
};

template<> struct native_cast_split<64, __m512, __m256i> {
    static SIMDPP_INL void cast(const __m512& t, __m256i& r0, __m256i& r1)
    {
        r0 = _mm256_castps_si256(_mm512_castps512_ps256(t));
        r1 = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castps_pd(t), 1));
    }
};

template<> struct native_cast_split<64, __m512d, __m256i> {
    static SIMDPP_INL void cast(const __m512d& t, __m256i& r0, __m256i& r1)
    {
        r0 = _mm256_castpd_si256(_mm512_castpd512_pd256(t));
        r1 = _mm256_castpd_si256(_mm512_extractf64x4_pd(t, 1));
    }
};

template<> struct native_cast_combine<64, __m256i, __m512i> {
    static SIMDPP_INL __m512i cast(const __m256i& t0, const __m256i& t1)
    {
        __m512i r = _mm512_castsi256_si512(t0);
        return _mm512_inserti64x4(r, t1, 1);
    }
};

template<> struct native_cast_combine<64, __m256i, __m512> {
    static SIMDPP_INL __m512 cast(const __m256i& t0, const __m256i& t1)
    {
        __m512d r = _mm512_castsi512_pd(_mm512_castsi256_si512(t0));
        r = _mm512_insertf64x4(r, _mm256_castsi256_pd(t1), 1);
        return _mm512_castpd_ps(r);
    }
};

template<> struct native_cast_combine<64, __m256i, __m512d> {
    static SIMDPP_INL __m512d cast(const __m256i& t0, const __m256i& t1)
    {
        __m512d r = _mm512_castsi512_pd(_mm512_castsi256_si512(t0));
        r = _mm512_insertf64x4(r, _mm256_castsi256_pd(t1), 1);
        return r;
    }
};
#endif

template<unsigned CastType>
struct cast_bitwise_vector_impl;

// Detects whether a "native" type is in fact the vararray fallback (used when
// no true native vector type is available) rather than a real native vector.
template<class T>
struct is_vararray : std::false_type {};

template<class T, unsigned N>
struct is_vararray<vararray<T, N>> : std::true_type {};

template<>
struct cast_bitwise_vector_impl<VECTOR_CAST_TYPE_1_TO_1> {
    template<class T, class R> SIMDPP_INL static
    void cast(const T& t, R& r)
    {
        using NativeT = typename T::base_vector_type::native_type;
        using NativeR = typename R::base_vector_type::native_type;
        const bool is_arg_vararray =
            is_vararray<NativeT>::value || is_vararray<NativeR>::value;
        using CastImpl = native_cast<sizeof(NativeT), NativeT,
                                     NativeR, is_arg_vararray>;

        for (unsigned i = 0; i < T::vec_length; ++i) {
            r.vec(i) = CastImpl::cast(t.vec(i).native());
        }
    }
};

template<>
struct cast_bitwise_vector_impl<VECTOR_CAST_TYPE_SPLIT2> {
    template<class T, class R> SIMDPP_INL static
    void cast(const T& t, R& r)
    {
        using NativeT = typename T::base_vector_type::native_type;
        using NativeR = typename R::base_vector_type::native_type;
        using CastImpl = native_cast_split<sizeof(NativeT), NativeT, NativeR>;

        for (unsigned i = 0; i < T::vec_length; ++i) {
            NativeR r0, r1;
            CastImpl::cast(t.vec(i).native(), r0, r1);
            r.vec(i*2) = r0;
            r.vec(i*2+1) = r1;
        }
    }
};

template<>
struct cast_bitwise_vector_impl<VECTOR_CAST_TYPE_COMBINE2> {
    template<class T, class R> SIMDPP_INL static
    void cast(const T& t, R& r)
    {
        using NativeT = typename T::base_vector_type::native_type;
        using NativeR = typename R::base_vector_type::native_type;
        using CastImpl = native_cast_combine<sizeof(NativeR), NativeT, NativeR>;

        for (unsigned i = 0; i < R::vec_length; ++i) {
            r.vec(i) = CastImpl::cast(t.vec(i*2).native(),
                                      t.vec(i*2+1).native());
        }
    }
};

template<class T, class R> SIMDPP_INL
void cast_bitwise_vector(const T& t, R& r)
{
    static_assert(sizeof(R) == sizeof(T), "Size mismatch");
    const unsigned vector_cast_type =
        T::vec_length == R::vec_length ? VECTOR_CAST_TYPE_1_TO_1 :
        T::vec_length == R::vec_length*2 ? VECTOR_CAST_TYPE_COMBINE2 :
        T::vec_length*2 == R::vec_length ? VECTOR_CAST_TYPE_SPLIT2 :
        VECTOR_CAST_TYPE_INVALID;

    cast_bitwise_vector_impl<vector_cast_type>::cast(t, r);
}
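
/*  Dispatch sketch (for exposition only; the exact native types depend on the
    enabled instruction set):

        float32<8> f = ...;
        uint32<8> u;
        cast_bitwise_vector(f, u);

    On an AVX-only build (no AVX2), float32<8> is backed by one __m256 while
    uint32<8> is backed by two __m128i, so this cast takes the
    VECTOR_CAST_TYPE_SPLIT2 path and is handled by
    native_cast_split<32, __m256, __m128i> above; the opposite direction takes
    the VECTOR_CAST_TYPE_COMBINE2 path.
*/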

#if (__GNUC__ >= 6) && !defined(__INTEL_COMPILER) && !defined(__clang__)
#pragma GCC diagnostic pop
#endif

} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif