| 1 | /* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt> |
| 2 | |
| 3 | Distributed under the Boost Software License, Version 1.0. |
| 4 | (See accompanying file LICENSE_1_0.txt or copy at |
| 5 | http://www.boost.org/LICENSE_1_0.txt) |
| 6 | */ |
| 7 | |
| 8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_MAKE_CONST_H |
| 9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_MAKE_CONST_H |
| 10 | |
| 11 | #ifndef LIBSIMDPP_SIMD_H |
| 12 | #error "This file must be included through simd.h" |
| 13 | #endif |
| 14 | |
| 15 | #include <simdpp/types.h> |
| 16 | #include <simdpp/detail/null/set.h> |
| 17 | #include <simdpp/detail/traits.h> |
| 18 | |
| 19 | #if _MSC_VER |
| 20 | #pragma warning(push) |
| 21 | #pragma warning(disable: 4244) |
| 22 | #endif |
| 23 | |
| 24 | namespace simdpp { |
| 25 | namespace SIMDPP_ARCH_NAMESPACE { |
| 26 | namespace detail { |
| 27 | namespace insn { |
| 28 | |
#if SIMDPP_USE_NEON_FLT_SP
// Broadcasts a single-element float constant to all four lanes.
// The offset parameter is deliberately unnamed/ignored: with a one-element
// constant expression every position presumably evaluates to the same value
// (NOTE(review): confirm against expr_vec_make_const::val).
template<class VE> SIMDPP_INL
void i_make_const(float32<4>& v, const expr_vec_make_const<VE,1>& e, unsigned)
{
    float rv = e.val(0);
    v = vld1q_dup_f32(&rv);
}

// Builds a two-element float pattern and repeats it across both 64-bit
// vector halves, producing lanes { v0, v1, v0, v1 }.
template<class VE> SIMDPP_INL
void i_make_const(float32<4>& v, const expr_vec_make_const<VE,2>& e, unsigned off)
{
    float SIMDPP_ALIGN(8) data[2] = {
        (float) e.val(off+0),
        (float) e.val(off+1)
    };
    float32x2_t half = vld1_f32(data);
    v = vcombine_f32(half, half);
}
#endif
| 48 | |
// Fills a float32<4> from four constant-expression elements.
// One branch per supported instruction set; selected at preprocessing time.
template<class VE, unsigned N> SIMDPP_INL
void i_make_const(float32<4>& v, const expr_vec_make_const<VE,N>& e, unsigned off)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    // Scalar fallback (no usable SIMD float support on this target).
    v = detail::null::make_vec<float32<4>, float>(e.val(off+0), e.val(off+1),
                                                  e.val(off+2), e.val(off+3));
#elif SIMDPP_USE_SSE2
    // _mm_set_ps takes arguments from the highest lane down.
    v = _mm_set_ps(e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0));
#elif SIMDPP_USE_NEON
    float SIMDPP_ALIGN(16) data[4] = {
        (float) e.val(off+0),
        (float) e.val(off+1),
        (float) e.val(off+2),
        (float) e.val(off+3)
    };
    v = vld1q_f32(data);
#elif SIMDPP_USE_ALTIVEC
    v = (__vector float){ float(e.val(off+0)), float(e.val(off+1)),
                          float(e.val(off+2)), float(e.val(off+3)) };
#elif SIMDPP_USE_MSA
    v = (v4f32){ float(e.val(off+0)), float(e.val(off+1)),
                 float(e.val(off+2)), float(e.val(off+3)) };
#endif
}
| 73 | |
#if SIMDPP_USE_AVX
// Fills a float32<8>; _mm256_set_ps takes arguments from the highest lane
// down, hence the reversed order.
template<class VE, unsigned N> SIMDPP_INL
void i_make_const(float32<8>& v, const expr_vec_make_const<VE,N>& e, unsigned off)
{
    v = _mm256_set_ps(e.val(off+7), e.val(off+6), e.val(off+5), e.val(off+4),
                      e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0));
}

#endif
| 83 | |
#if SIMDPP_USE_AVX512F
// Fills a float32<16>; _mm512_set_ps takes arguments from the highest lane
// down, hence the reversed order.
template<class VE, unsigned N> SIMDPP_INL
void i_make_const(float32<16>& v, const expr_vec_make_const<VE,N>& e, unsigned off)
{
    v = _mm512_set_ps(e.val(off+15), e.val(off+14), e.val(off+13), e.val(off+12),
                      e.val(off+11), e.val(off+10), e.val(off+9), e.val(off+8),
                      e.val(off+7), e.val(off+6), e.val(off+5), e.val(off+4),
                      e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0));
}
#endif
| 94 | |
| 95 | // ----------------------------------------------------------------------------- |
| 96 | |
// Fills a float64<2> from two constant-expression elements.
template<class VE, unsigned N> SIMDPP_INL
void i_make_const(float64<2>& v, const expr_vec_make_const<VE,N>& e, unsigned off)
{
#if SIMDPP_USE_SSE2
    // High element first.
    v = _mm_set_pd(e.val(off+1), e.val(off+0));
#elif SIMDPP_USE_NEON64
    double SIMDPP_ALIGN(16) data[2] = {
        (double) e.val(off+0),
        (double) e.val(off+1)
    };
    v = vld1q_f64(data);
#elif SIMDPP_USE_VSX_206
    __vector double r = { double(e.val(off+0)), double(e.val(off+1)) };
    v = r;
#elif SIMDPP_USE_MSA
    v = (v2f64){ double(e.val(off+0)), double(e.val(off+1)) };
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
    // No native double-precision vector support here - scalar fallback.
    v = detail::null::make_vec<float64<2>, double>(e.val(off+0), e.val(off+1));
#endif
}
| 117 | |
#if SIMDPP_USE_AVX
// Fills a float64<4>; arguments go from the highest lane down.
template<class VE, unsigned N> SIMDPP_INL
void i_make_const(float64<4>& v, const expr_vec_make_const<VE,N>& e, unsigned off)
{
    v = _mm256_set_pd(e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0));
}
#endif
| 125 | |
#if SIMDPP_USE_AVX512F
// Fills a float64<8>; arguments go from the highest lane down.
template<class VE, unsigned N> SIMDPP_INL
void i_make_const(float64<8>& v, const expr_vec_make_const<VE,N>& e, unsigned off)
{
    v = _mm512_set_pd(e.val(off+7), e.val(off+6), e.val(off+5), e.val(off+4),
                      e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0));
}
#endif
| 134 | |
| 135 | // ----------------------------------------------------------------------------- |
| 136 | |
#if SIMDPP_USE_NEON
// Broadcasts a single 8-bit constant to all 16 lanes.
template<class VE> SIMDPP_INL
void i_make_const(uint8<16>& v, const expr_vec_make_const<VE,1>& e, unsigned off)
{
    uint8_t rv = e.val(off+0);
    v = vld1q_dup_u8(&rv);
}
| 144 | |
// Repeats a 2-byte pattern: packs both bytes (first byte in the low half)
// into one 16-bit value, broadcasts it, then reinterprets as 16 x 8-bit.
template<class VE> SIMDPP_INL
void i_make_const(uint8<16>& v, const expr_vec_make_const<VE,2>& e, unsigned off)
{
    uint16_t rv = (e.val(off+0) & 0xff) | (e.val(off+1) & 0xff) << 8;
    v = (uint16<8>) vld1q_dup_u16(&rv);
}
| 151 | |
| 152 | template<class VE> SIMDPP_INL |
| 153 | void i_make_const(uint8<16>& v, const expr_vec_make_const<VE,4>& e, unsigned off) |
| 154 | { |
| 155 | uint32_t rv = (e.val(off+0) & 0xff) | (e.val(off+1) & 0xff) << 8 | |
| 156 | (e.val(off+2) & 0xff) << 16 | (e.val(off+3) & 0xff) << 24; |
| 157 | v = (uint32<4>) vld1q_dup_u32(&rv); |
| 158 | } |
| 159 | |
// Repeats an 8-byte pattern across both 64-bit vector halves.
template<class VE> SIMDPP_INL
void i_make_const(uint8<16>& v, const expr_vec_make_const<VE,8>& e, unsigned off)
{
    uint8_t SIMDPP_ALIGN(8) data[8] = {
        (uint8_t) e.val(off+0), (uint8_t) e.val(off+1),
        (uint8_t) e.val(off+2), (uint8_t) e.val(off+3),
        (uint8_t) e.val(off+4), (uint8_t) e.val(off+5),
        (uint8_t) e.val(off+6), (uint8_t) e.val(off+7)
    };
    uint8x8_t half = vld1_u8(data);
    v = vcombine_u8(half, half);
}
#endif
| 173 | |
// Fills a uint8<16> from 16 constant-expression elements.
// One branch per supported instruction set; selected at preprocessing time.
template<class VE, unsigned N> SIMDPP_INL
void i_make_const(uint8<16>& v, const expr_vec_make_const<VE,N>& e, unsigned off)
{
#if SIMDPP_USE_NULL
    v = detail::null::make_vec<uint8<16>, uint8_t>(
            e.val(off+0), e.val(off+1), e.val(off+2), e.val(off+3),
            e.val(off+4), e.val(off+5), e.val(off+6), e.val(off+7),
            e.val(off+8), e.val(off+9), e.val(off+10), e.val(off+11),
            e.val(off+12), e.val(off+13), e.val(off+14), e.val(off+15));
#elif SIMDPP_USE_SSE2
    // _mm_set_epi8 takes arguments from the highest lane down.
    v = _mm_set_epi8(e.val(off+15), e.val(off+14), e.val(off+13), e.val(off+12),
                     e.val(off+11), e.val(off+10), e.val(off+9), e.val(off+8),
                     e.val(off+7), e.val(off+6), e.val(off+5), e.val(off+4),
                     e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0));
#elif SIMDPP_USE_NEON
    uint8_t SIMDPP_ALIGN(16) data[16] = {
        (uint8_t) e.val(off+0), (uint8_t) e.val(off+1),
        (uint8_t) e.val(off+2), (uint8_t) e.val(off+3),
        (uint8_t) e.val(off+4), (uint8_t) e.val(off+5),
        (uint8_t) e.val(off+6), (uint8_t) e.val(off+7),
        (uint8_t) e.val(off+8), (uint8_t) e.val(off+9),
        (uint8_t) e.val(off+10), (uint8_t) e.val(off+11),
        (uint8_t) e.val(off+12), (uint8_t) e.val(off+13),
        (uint8_t) e.val(off+14), (uint8_t) e.val(off+15)
    };
    v = vld1q_u8(data);
#elif SIMDPP_USE_ALTIVEC
    v = (__vector uint8_t){
        uint8_t(e.val(off+0)), uint8_t(e.val(off+1)), uint8_t(e.val(off+2)), uint8_t(e.val(off+3)),
        uint8_t(e.val(off+4)), uint8_t(e.val(off+5)), uint8_t(e.val(off+6)), uint8_t(e.val(off+7)),
        uint8_t(e.val(off+8)), uint8_t(e.val(off+9)), uint8_t(e.val(off+10)), uint8_t(e.val(off+11)),
        uint8_t(e.val(off+12)), uint8_t(e.val(off+13)), uint8_t(e.val(off+14)), uint8_t(e.val(off+15))
    };
#elif SIMDPP_USE_MSA
    v = (v16u8){
        uint8_t(e.val(off+0)), uint8_t(e.val(off+1)), uint8_t(e.val(off+2)), uint8_t(e.val(off+3)),
        uint8_t(e.val(off+4)), uint8_t(e.val(off+5)), uint8_t(e.val(off+6)), uint8_t(e.val(off+7)),
        uint8_t(e.val(off+8)), uint8_t(e.val(off+9)), uint8_t(e.val(off+10)), uint8_t(e.val(off+11)),
        uint8_t(e.val(off+12)), uint8_t(e.val(off+13)), uint8_t(e.val(off+14)), uint8_t(e.val(off+15))
    };
#endif
}
| 216 | |
#if SIMDPP_USE_AVX2
// Fills a uint8<32>; _mm256_set_epi8 takes arguments from the highest lane
// down, hence the reversed order.
template<class VE, unsigned N> SIMDPP_INL
void i_make_const(uint8<32>& v, const expr_vec_make_const<VE,N>& e, unsigned off)
{
    v = _mm256_set_epi8(e.val(off+31), e.val(off+30), e.val(off+29), e.val(off+28),
                        e.val(off+27), e.val(off+26), e.val(off+25), e.val(off+24),
                        e.val(off+23), e.val(off+22), e.val(off+21), e.val(off+20),
                        e.val(off+19), e.val(off+18), e.val(off+17), e.val(off+16),
                        e.val(off+15), e.val(off+14), e.val(off+13), e.val(off+12),
                        e.val(off+11), e.val(off+10), e.val(off+9), e.val(off+8),
                        e.val(off+7), e.val(off+6), e.val(off+5), e.val(off+4),
                        e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0));
}
#endif
| 231 | |
| 232 | #if SIMDPP_USE_AVX512BW |
| 233 | SIMDPP_INL uint32_t make_uint32_uint8(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4) |
| 234 | { |
| 235 | return (a1 & 0xff) | ((a2 & 0xff) << 8) | ((a3 & 0xff) << 16) | ((a4 & 0xff) << 24); |
| 236 | } |
| 237 | |
// Fills a uint8<64>. AVX512BW provides no _mm512_set_epi8, so groups of four
// 8-bit elements are packed into 32-bit values first; _mm512_set_epi32 takes
// arguments from the highest 32-bit lane down.
template<class VE, unsigned N> SIMDPP_INL
void i_make_const(uint8<64>& v, const expr_vec_make_const<VE,N>& e, unsigned off)
{
    v = _mm512_set_epi32(
        make_uint32_uint8(e.val(off+60), e.val(off+61), e.val(off+62), e.val(off+63)),
        make_uint32_uint8(e.val(off+56), e.val(off+57), e.val(off+58), e.val(off+59)),
        make_uint32_uint8(e.val(off+52), e.val(off+53), e.val(off+54), e.val(off+55)),
        make_uint32_uint8(e.val(off+48), e.val(off+49), e.val(off+50), e.val(off+51)),
        make_uint32_uint8(e.val(off+44), e.val(off+45), e.val(off+46), e.val(off+47)),
        make_uint32_uint8(e.val(off+40), e.val(off+41), e.val(off+42), e.val(off+43)),
        make_uint32_uint8(e.val(off+36), e.val(off+37), e.val(off+38), e.val(off+39)),
        make_uint32_uint8(e.val(off+32), e.val(off+33), e.val(off+34), e.val(off+35)),
        make_uint32_uint8(e.val(off+28), e.val(off+29), e.val(off+30), e.val(off+31)),
        make_uint32_uint8(e.val(off+24), e.val(off+25), e.val(off+26), e.val(off+27)),
        make_uint32_uint8(e.val(off+20), e.val(off+21), e.val(off+22), e.val(off+23)),
        make_uint32_uint8(e.val(off+16), e.val(off+17), e.val(off+18), e.val(off+19)),
        make_uint32_uint8(e.val(off+12), e.val(off+13), e.val(off+14), e.val(off+15)),
        make_uint32_uint8(e.val(off+8), e.val(off+9), e.val(off+10), e.val(off+11)),
        make_uint32_uint8(e.val(off+4), e.val(off+5), e.val(off+6), e.val(off+7)),
        make_uint32_uint8(e.val(off+0), e.val(off+1), e.val(off+2), e.val(off+3))
    );
}
#endif
| 261 | |
| 262 | |
| 263 | // ----------------------------------------------------------------------------- |
| 264 | |
#if SIMDPP_USE_NEON
// Broadcasts a single 16-bit constant to all 8 lanes.
template<class VE> SIMDPP_INL
void i_make_const(uint16<8>& v, const expr_vec_make_const<VE,1>& e, unsigned off)
{
    uint16_t rv = e.val(off+0);
    v = vld1q_dup_u16(&rv);
}
| 272 | |
| 273 | template<class VE> SIMDPP_INL |
| 274 | void i_make_const(uint16<8>& v, const expr_vec_make_const<VE,2>& e, unsigned off) |
| 275 | { |
| 276 | uint32_t rv = (e.val(off+0) & 0xffff) | (e.val(off+1) & 0xffff) << 16; |
| 277 | v = (uint32<4>) vld1q_dup_u32(&rv); |
| 278 | } |
| 279 | |
// Repeats a 4 x 16-bit pattern across both 64-bit vector halves.
template<class VE> SIMDPP_INL
void i_make_const(uint16<8>& v, const expr_vec_make_const<VE,4>& e, unsigned off)
{
    uint16_t SIMDPP_ALIGN(8) data[4] = {
        (uint16_t) e.val(off+0),
        (uint16_t) e.val(off+1),
        (uint16_t) e.val(off+2),
        (uint16_t) e.val(off+3)
    };
    uint16x4_t half = vld1_u16(data);
    v = vcombine_u16(half, half);
}
#endif
| 293 | |
| 294 | |
// Fills a uint16<8> from eight constant-expression elements.
// One branch per supported instruction set; selected at preprocessing time.
template<class VE, unsigned N> SIMDPP_INL
void i_make_const(uint16<8>& v, const expr_vec_make_const<VE,N>& e, unsigned off)
{
#if SIMDPP_USE_NULL
    v = detail::null::make_vec<uint16<8>, uint16_t>(e.val(off+0), e.val(off+1), e.val(off+2), e.val(off+3),
                                                    e.val(off+4), e.val(off+5), e.val(off+6), e.val(off+7));
#elif SIMDPP_USE_SSE2
    // _mm_set_epi16 takes arguments from the highest lane down.
    v = _mm_set_epi16(e.val(off+7), e.val(off+6), e.val(off+5), e.val(off+4),
                      e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0));
#elif SIMDPP_USE_NEON
    uint16_t SIMDPP_ALIGN(16) data[8] = {
        (uint16_t) e.val(off+0), (uint16_t) e.val(off+1),
        (uint16_t) e.val(off+2), (uint16_t) e.val(off+3),
        (uint16_t) e.val(off+4), (uint16_t) e.val(off+5),
        (uint16_t) e.val(off+6), (uint16_t) e.val(off+7)
    };
    v = vld1q_u16(data);
#elif SIMDPP_USE_ALTIVEC
    v = (__vector uint16_t){
        uint16_t(e.val(off+0)), uint16_t(e.val(off+1)), uint16_t(e.val(off+2)), uint16_t(e.val(off+3)),
        uint16_t(e.val(off+4)), uint16_t(e.val(off+5)), uint16_t(e.val(off+6)), uint16_t(e.val(off+7))
    };
#elif SIMDPP_USE_MSA
    v = (v8u16){
        uint16_t(e.val(off+0)), uint16_t(e.val(off+1)), uint16_t(e.val(off+2)), uint16_t(e.val(off+3)),
        uint16_t(e.val(off+4)), uint16_t(e.val(off+5)), uint16_t(e.val(off+6)), uint16_t(e.val(off+7))
    };
#endif
}
| 324 | |
#if SIMDPP_USE_AVX2
// Fills a uint16<16>; arguments go from the highest lane down.
template<class VE, unsigned N> SIMDPP_INL
void i_make_const(uint16<16>& v, const expr_vec_make_const<VE,N>& e, unsigned off)
{
    v = _mm256_set_epi16(e.val(off+15), e.val(off+14), e.val(off+13), e.val(off+12),
                         e.val(off+11), e.val(off+10), e.val(off+9), e.val(off+8),
                         e.val(off+7), e.val(off+6), e.val(off+5), e.val(off+4),
                         e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0));
}
#endif
| 335 | |
| 336 | #if SIMDPP_USE_AVX512BW |
| 337 | SIMDPP_INL uint32_t make_uint32_uint16(uint16_t a1, uint16_t a2) |
| 338 | { |
| 339 | return (a1 & 0xffff) | ((a2 & 0xffff) << 16); |
| 340 | } |
| 341 | |
// Fills a uint16<32>. AVX512BW provides no _mm512_set_epi16, so pairs of
// 16-bit elements are packed into 32-bit values first; _mm512_set_epi32
// takes arguments from the highest 32-bit lane down.
template<class VE, unsigned N> SIMDPP_INL
void i_make_const(uint16<32>& v, const expr_vec_make_const<VE,N>& e, unsigned off)
{
    v = _mm512_set_epi32(
        make_uint32_uint16(e.val(off+30), e.val(off+31)),
        make_uint32_uint16(e.val(off+28), e.val(off+29)),
        make_uint32_uint16(e.val(off+26), e.val(off+27)),
        make_uint32_uint16(e.val(off+24), e.val(off+25)),
        make_uint32_uint16(e.val(off+22), e.val(off+23)),
        make_uint32_uint16(e.val(off+20), e.val(off+21)),
        make_uint32_uint16(e.val(off+18), e.val(off+19)),
        make_uint32_uint16(e.val(off+16), e.val(off+17)),
        make_uint32_uint16(e.val(off+14), e.val(off+15)),
        make_uint32_uint16(e.val(off+12), e.val(off+13)),
        make_uint32_uint16(e.val(off+10), e.val(off+11)),
        make_uint32_uint16(e.val(off+8), e.val(off+9)),
        make_uint32_uint16(e.val(off+6), e.val(off+7)),
        make_uint32_uint16(e.val(off+4), e.val(off+5)),
        make_uint32_uint16(e.val(off+2), e.val(off+3)),
        make_uint32_uint16(e.val(off+0), e.val(off+1)));
}
#endif
| 364 | |
| 365 | // ----------------------------------------------------------------------------- |
| 366 | |
#if SIMDPP_USE_NEON
// Broadcasts a single 32-bit constant to all 4 lanes.
template<class VE> SIMDPP_INL
void i_make_const(uint32<4>& v, const expr_vec_make_const<VE,1>& e, unsigned off)
{
    uint32_t rv = e.val(off+0);
    v = vld1q_dup_u32(&rv);
}

// Repeats a 2 x 32-bit pattern across both 64-bit vector halves.
template<class VE> SIMDPP_INL
void i_make_const(uint32<4>& v, const expr_vec_make_const<VE,2>& e, unsigned off)
{
    uint32_t SIMDPP_ALIGN(8) data[2] = {
        (uint32_t) e.val(off+0),
        (uint32_t) e.val(off+1)
    };
    uint32x2_t half = vld1_u32(data);
    v = vcombine_u32(half, half);
}
#endif
| 386 | |
// Fills a uint32<4> from four constant-expression elements.
// One branch per supported instruction set; selected at preprocessing time.
template<class VE, unsigned N> SIMDPP_INL
void i_make_const(uint32<4>& v, const expr_vec_make_const<VE,N>& e, unsigned off)
{
#if SIMDPP_USE_NULL
    v = detail::null::make_vec<uint32<4>, uint32_t>(e.val(off+0), e.val(off+1), e.val(off+2), e.val(off+3));
#elif SIMDPP_USE_SSE2
    // _mm_set_epi32 takes arguments from the highest lane down.
    v = _mm_set_epi32(e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0));
#elif SIMDPP_USE_NEON
    uint32_t SIMDPP_ALIGN(16) data[4] = {
        (uint32_t) e.val(off+0), (uint32_t) e.val(off+1),
        (uint32_t) e.val(off+2), (uint32_t) e.val(off+3)
    };
    v = vld1q_u32(data);
#elif SIMDPP_USE_ALTIVEC
    v = (__vector uint32_t) { uint32_t(e.val(off+0)), uint32_t(e.val(off+1)),
                              uint32_t(e.val(off+2)), uint32_t(e.val(off+3)) };
#elif SIMDPP_USE_MSA
    v = (v4u32) { uint32_t(e.val(off+0)), uint32_t(e.val(off+1)),
                  uint32_t(e.val(off+2)), uint32_t(e.val(off+3)) };
#endif
}
| 408 | |
#if SIMDPP_USE_AVX2
// Fills a uint32<8>; arguments go from the highest lane down.
template<class VE, unsigned N> SIMDPP_INL
void i_make_const(uint32<8>& v, const expr_vec_make_const<VE,N>& e, unsigned off)
{
    v = _mm256_set_epi32(e.val(off+7), e.val(off+6), e.val(off+5), e.val(off+4),
                         e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0));
}
#endif
| 417 | |
#if SIMDPP_USE_AVX512F
// Fills a uint32<16>; arguments go from the highest lane down.
template<class VE, unsigned N> SIMDPP_INL
void i_make_const(uint32<16>& v, const expr_vec_make_const<VE,N>& e, unsigned off)
{
    v = _mm512_set_epi32(e.val(off+15), e.val(off+14), e.val(off+13), e.val(off+12),
                         e.val(off+11), e.val(off+10), e.val(off+9), e.val(off+8),
                         e.val(off+7), e.val(off+6), e.val(off+5), e.val(off+4),
                         e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0));
}
#endif
| 428 | |
| 429 | // ----------------------------------------------------------------------------- |
| 430 | |
#if SIMDPP_USE_NEON
// Broadcasts a single 64-bit constant to both lanes.
template<class VE> SIMDPP_INL
void i_make_const(uint64<2>& v, const expr_vec_make_const<VE,1>& e, unsigned off)
{
    uint64x1_t r0 = vcreate_u64(uint64_t(e.val(off+0)));
    v = vcombine_u64(r0, r0);
}
#endif
| 439 | |
// Fills a uint64<2> from two constant-expression elements.
template<class VE, unsigned N> SIMDPP_INL
void i_make_const(uint64<2>& v, const expr_vec_make_const<VE,N>& e, unsigned off)
{
#if SIMDPP_USE_SSE2
#if SIMDPP_32_BITS && _MSC_VER
    // MSVC does not support _mm_set_epi64x in 32-bit mode, so each 64-bit
    // element is split into two 32-bit halves.
    uint64_t v1 = e.val(off+1);
    uint64_t v0 = e.val(off+0);
    v = _mm_set_epi32(v1 >> 32, v1 & 0xffffffff, v0 >> 32, v0 & 0xffffffff);
#else
    v = _mm_set_epi64x(e.val(off+1), e.val(off+0));
#endif
#elif SIMDPP_USE_NEON
    uint64_t SIMDPP_ALIGN(16) data[2] = {
        (uint64_t) e.val(off+0),
        (uint64_t) e.val(off+1)
    };
    v = vld1q_u64(data);
#elif SIMDPP_USE_VSX_207
    __vector uint64_t r = { (uint64_t)e.val(off+0), (uint64_t)e.val(off+1) };
    v = r;
#elif SIMDPP_USE_MSA
    v = (v2u64) { uint64_t(e.val(off+0)), uint64_t(e.val(off+1)) };
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    // No native 64-bit integer vector support here - scalar fallback.
    v = detail::null::make_vec<uint64<2>, uint64_t>(e.val(off+0), e.val(off+1));
#endif
}
| 467 | |
#if SIMDPP_USE_AVX2
// Fills a uint64<4>; arguments go from the highest lane down.
template<class VE, unsigned N> SIMDPP_INL
void i_make_const(uint64<4>& v, const expr_vec_make_const<VE,N>& e, unsigned off)
{
#if SIMDPP_32_BITS && _MSC_VER
    // MSVC does not support _mm256_set_epi64x in 32-bit mode, so each 64-bit
    // element is split into two 32-bit halves.
    uint64_t v3 = e.val(off+3);
    uint64_t v2 = e.val(off+2);
    uint64_t v1 = e.val(off+1);
    uint64_t v0 = e.val(off+0);
    v = _mm256_set_epi32(v3 >> 32, v3 & 0xffffffff, v2 >> 32, v2 & 0xffffffff,
                         v1 >> 32, v1 & 0xffffffff, v0 >> 32, v0 & 0xffffffff);
#else
    v = _mm256_set_epi64x(e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0));
#endif
}
#endif
| 485 | |
#if SIMDPP_USE_AVX512F
// Fills a uint64<8>; arguments go from the highest lane down.
template<class VE, unsigned N> SIMDPP_INL
void i_make_const(uint64<8>& v, const expr_vec_make_const<VE,N>& e, unsigned off)
{
    v = _mm512_set_epi64(e.val(off+7), e.val(off+6), e.val(off+5), e.val(off+4),
                         e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0));
}
#endif
| 494 | |
| 495 | // ----------------------------------------------------------------------------- |
| 496 | |
| 497 | template<class V, class VE, unsigned NE> SIMDPP_INL |
| 498 | void i_make_const(V& v, const expr_vec_make_const<VE,NE>& e, unsigned off) |
| 499 | { |
| 500 | for (unsigned i = 0; i < v.vec_length; ++i) { |
| 501 | i_make_const(v.vec(i), e, off + v.base_length * i); |
| 502 | } |
| 503 | } |
| 504 | |
| 505 | // ----------------------------------------------------------------------------- |
| 506 | |
| 507 | template<class V, class VE, unsigned N> SIMDPP_INL |
| 508 | V i_make_const_any(const expr_vec_make_const<VE,N>& e) |
| 509 | { |
| 510 | typename detail::remove_sign<V>::type r; |
| 511 | i_make_const(r, e, 0); |
| 512 | return V(r); |
| 513 | } |
| 514 | |
| 515 | // ----------------------------------------------------------------------------- |
| 516 | } // namespace insn |
| 517 | |
// Expression-evaluation entry point: constructs v from a vector-constant
// expression node by delegating to the instruction-set dispatch above.
template<class V, class VE, unsigned N> SIMDPP_INL
void construct_eval(V& v, const expr_vec_make_const<VE, N>& e)
{
    v = insn::i_make_const_any<V>(e);
}
| 523 | |
| 524 | template<class V> SIMDPP_INL |
| 525 | void construct_eval(V& v, const expr_vec_make_ones& e) |
| 526 | { |
| 527 | (void) e; |
| 528 | expr_vec_make_const<uint64_t,1> e2; |
| 529 | e2.a[0] = (uint64_t)-1; |
| 530 | typename V::uint_vector_type u; |
| 531 | insn::i_make_const(u, e2, 0); |
| 532 | v = u; |
| 533 | } |
| 534 | |
| 535 | } // namespace detail |
| 536 | } // namespace SIMDPP_ARCH_NAMESPACE |
| 537 | } // namespace simdpp |
| 538 | |
| 539 | #if _MSC_VER |
| 540 | #pragma warning(pop) |
| 541 | #endif |
| 542 | |
| 543 | #endif |
| 544 | |
| 545 | |