| 1 | /* Copyright (C) 2011-2017 Povilas Kanapickas <povilas@radix.lt> |
| 2 | |
| 3 | Distributed under the Boost Software License, Version 1.0. |
| 4 | (See accompanying file LICENSE_1_0.txt or copy at |
| 5 | http://www.boost.org/LICENSE_1_0.txt) |
| 6 | */ |
| 7 | |
| 8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_EXTRACT_H |
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_EXTRACT_H
| 10 | |
| 11 | #ifndef LIBSIMDPP_SIMD_H |
| 12 | #error "This file must be included through simd.h" |
| 13 | #endif |
| 14 | |
| 15 | #include <simdpp/types.h> |
| 16 | #include <simdpp/core/cast.h> |
| 17 | #include <simdpp/core/move_l.h> |
| 18 | #include <simdpp/core/i_shift_l.h> |
| 19 | #include <simdpp/core/i_sub.h> |
| 20 | #include <simdpp/core/make_int.h> |
| 21 | #include <simdpp/detail/insn/split.h> |
| 22 | #include <simdpp/detail/mem_block.h> |
| 23 | |
| 24 | namespace simdpp { |
| 25 | namespace SIMDPP_ARCH_NAMESPACE { |
| 26 | namespace detail { |
| 27 | namespace insn { |
| 28 | |
| 29 | template<unsigned id> SIMDPP_INL |
| 30 | uint8_t (const uint8<16>& a) |
| 31 | { |
| 32 | #if SIMDPP_USE_NULL |
| 33 | return a.el(id); |
| 34 | #elif SIMDPP_USE_SSE4_1 |
| 35 | // Explicit cast is needed due to bug in Clang headers (intrinsic |
| 36 | // implemented as a macro with no appropriate casts) and a bug in Clang |
| 37 | // (thinks explicit conversion operators have the same rank as the regular |
| 38 | // ones) |
| 39 | return _mm_extract_epi8(a.native(), id); |
| 40 | #elif SIMDPP_USE_SSE2 |
| 41 | unsigned shift = (id % 2 == 1) ? 8 : 0; |
| 42 | return _mm_extract_epi16(a.native(), id/2) >> shift; |
| 43 | #elif SIMDPP_USE_NEON |
| 44 | return vgetq_lane_u8(a.native(), id); |
| 45 | #elif SIMDPP_USE_ALTIVEC |
| 46 | detail::mem_block<uint8x16> ax(a); |
| 47 | vec_ste(a.native(), 0, &ax[id]); |
| 48 | return ax[id]; |
| 49 | #elif SIMDPP_USE_MSA |
| 50 | return __msa_copy_u_b((v16i8) a.native(), id); |
| 51 | #endif |
| 52 | } |
| 53 | |
#if SIMDPP_USE_AVX2
// AVX2: narrow to the 128-bit lane containing the element, then extract
// the byte within that lane.
template<unsigned id> SIMDPP_INL
uint8_t i_extract(const uint8<32>& a)
{
    return _mm_extract_epi8(_mm256_extracti128_si256(a.native(), id / 16),
                            id % 16);
}
#endif
| 62 | |
#if SIMDPP_USE_AVX512BW
// AVX512BW: pick the 128-bit sub-vector holding the element, then extract
// the byte within it.
template<unsigned id> SIMDPP_INL
uint8_t i_extract(const uint8<64>& a)
{
    return _mm_extract_epi8(_mm512_extracti32x4_epi32(a.native(), id / 16),
                            id % 16);
}
#endif
| 71 | |
| 72 | // ----------------------------------------------------------------------------- |
| 73 | |
| 74 | template<unsigned id> SIMDPP_INL |
| 75 | int8_t (const int8<16>& a) |
| 76 | { |
| 77 | #if SIMDPP_USE_MSA |
| 78 | return __msa_copy_s_b(a.native(), id); |
| 79 | #else |
| 80 | return i_extract<id>(uint8x16(a)); |
| 81 | #endif |
| 82 | } |
| 83 | |
#if SIMDPP_USE_AVX2
// AVX2: narrow to the containing 128-bit lane, then extract the byte.
template<unsigned id> SIMDPP_INL
int8_t i_extract(const int8<32>& a)
{
    return _mm_extract_epi8(_mm256_extracti128_si256(a.native(), id / 16),
                            id % 16);
}
#endif
| 92 | |
#if SIMDPP_USE_AVX512BW
// AVX512BW: pick the containing 128-bit sub-vector, then extract the byte.
template<unsigned id> SIMDPP_INL
int8_t i_extract(const int8<64>& a)
{
    return _mm_extract_epi8(_mm512_extracti32x4_epi32(a.native(), id / 16),
                            id % 16);
}
#endif
| 101 | |
| 102 | // ----------------------------------------------------------------------------- |
| 103 | |
| 104 | template<unsigned id> SIMDPP_INL |
| 105 | uint16_t (const uint16<8>& a) |
| 106 | { |
| 107 | #if SIMDPP_USE_NULL |
| 108 | return a.el(id); |
| 109 | #elif SIMDPP_USE_SSE2 |
| 110 | return _mm_extract_epi16(a.native(), id); |
| 111 | #elif SIMDPP_USE_NEON |
| 112 | return vgetq_lane_u16(a.native(), id); |
| 113 | #elif SIMDPP_USE_ALTIVEC |
| 114 | detail::mem_block<uint16x8> ax(a); |
| 115 | vec_ste(a.native(), 0, &ax[id]); |
| 116 | return ax[id]; |
| 117 | #elif SIMDPP_USE_MSA |
| 118 | return __msa_copy_u_h((v8i16) a.native(), id); |
| 119 | #endif |
| 120 | } |
| 121 | |
#if SIMDPP_USE_AVX2
// AVX2: narrow to the containing 128-bit lane, then extract the 16-bit element.
template<unsigned id> SIMDPP_INL
uint16_t i_extract(const uint16<16>& a)
{
    return _mm_extract_epi16(_mm256_extracti128_si256(a.native(), id / 8),
                             id % 8);
}
#endif
| 130 | |
#if SIMDPP_USE_AVX512BW
// AVX512BW: pick the containing 128-bit sub-vector, then extract the
// 16-bit element.
template<unsigned id> SIMDPP_INL
uint16_t i_extract(const uint16<32>& a)
{
    return _mm_extract_epi16(_mm512_extracti32x4_epi32(a.native(), id / 8),
                             id % 8);
}
#endif
| 139 | |
| 140 | // ----------------------------------------------------------------------------- |
| 141 | |
| 142 | template<unsigned id> SIMDPP_INL |
| 143 | int16_t (const int16<8>& a) |
| 144 | { |
| 145 | #if SIMDPP_USE_MSA |
| 146 | return __msa_copy_s_h(a.native(), id); |
| 147 | #else |
| 148 | return i_extract<id>(uint16x8(a)); |
| 149 | #endif |
| 150 | } |
| 151 | |
#if SIMDPP_USE_AVX2
// AVX2: narrow to the containing 128-bit lane, then extract the 16-bit element.
template<unsigned id> SIMDPP_INL
int16_t i_extract(const int16<16>& a)
{
    return _mm_extract_epi16(_mm256_extracti128_si256(a.native(), id / 8),
                             id % 8);
}
#endif
| 160 | |
#if SIMDPP_USE_AVX512BW
// AVX512BW: pick the containing 128-bit sub-vector, then extract the
// 16-bit element.
template<unsigned id> SIMDPP_INL
int16_t i_extract(const int16<32>& a)
{
    return _mm_extract_epi16(_mm512_extracti32x4_epi32(a.native(), id / 8),
                             id % 8);
}
#endif
| 169 | |
| 170 | // ----------------------------------------------------------------------------- |
| 171 | |
| 172 | template<unsigned id> SIMDPP_INL |
| 173 | uint32_t (const uint32<4>& a) |
| 174 | { |
| 175 | #if SIMDPP_USE_NULL |
| 176 | return a.el(id); |
| 177 | #elif SIMDPP_USE_SSE4_1 |
| 178 | return _mm_extract_epi32(a.native(), id); |
| 179 | #elif SIMDPP_USE_SSE2 |
| 180 | // when id==0, move_l is template-specialized and does nothing |
| 181 | return _mm_cvtsi128_si32(move4_l<id>(a).eval().native()); |
| 182 | #elif SIMDPP_USE_NEON |
| 183 | return vgetq_lane_u32(a.native(), id); |
| 184 | #elif SIMDPP_USE_ALTIVEC |
| 185 | detail::mem_block<uint32x4> ax(a); |
| 186 | vec_ste(a.native(), 0, &ax[id]); |
| 187 | return ax[id]; |
| 188 | #elif SIMDPP_USE_MSA |
| 189 | return __msa_copy_u_w((v4i32) a.native(), id); |
| 190 | #endif |
| 191 | } |
| 192 | |
#if SIMDPP_USE_AVX2
// AVX2: narrow to the containing 128-bit lane, then extract the 32-bit element.
template<unsigned id> SIMDPP_INL
uint32_t i_extract(const uint32<8>& a)
{
    return _mm_extract_epi32(_mm256_extracti128_si256(a.native(), id / 4),
                             id % 4);
}
#endif
| 201 | |
#if SIMDPP_USE_AVX512F
// AVX512F: pick the containing 128-bit sub-vector, then extract the
// 32-bit element.
template<unsigned id> SIMDPP_INL
uint32_t i_extract(const uint32<16>& a)
{
    return _mm_extract_epi32(_mm512_extracti32x4_epi32(a.native(), id / 4),
                             id % 4);
}
#endif
| 210 | |
| 211 | // ----------------------------------------------------------------------------- |
| 212 | |
| 213 | template<unsigned id> SIMDPP_INL |
| 214 | int32_t (const int32<4>& a) |
| 215 | { |
| 216 | #if SIMDPP_USE_MSA |
| 217 | return __msa_copy_s_w(a.native(), id); |
| 218 | #else |
| 219 | return i_extract<id>(uint32x4(a)); |
| 220 | #endif |
| 221 | } |
| 222 | |
#if SIMDPP_USE_AVX2
// AVX2: narrow to the containing 128-bit lane, then extract the 32-bit element.
template<unsigned id> SIMDPP_INL
int32_t i_extract(const int32<8>& a)
{
    return _mm_extract_epi32(_mm256_extracti128_si256(a.native(), id / 4),
                             id % 4);
}
#endif
| 231 | |
#if SIMDPP_USE_AVX512F
// AVX512F: pick the containing 128-bit sub-vector, then extract the
// 32-bit element.
template<unsigned id> SIMDPP_INL
int32_t i_extract(const int32<16>& a)
{
    return _mm_extract_epi32(_mm512_extracti32x4_epi32(a.native(), id / 4),
                             id % 4);
}
#endif
| 240 | |
| 241 | // ----------------------------------------------------------------------------- |
| 242 | |
| 243 | template<unsigned id> SIMDPP_INL |
| 244 | uint64_t (const uint64<2>& a) |
| 245 | { |
| 246 | #if SIMDPP_USE_NULL |
| 247 | return a.el(id); |
| 248 | #elif SIMDPP_USE_SSE4_1 |
| 249 | #if SIMDPP_32_BITS |
| 250 | uint32x4 t = uint32x4(a); |
| 251 | uint64_t r = i_extract<id*2>(t); |
| 252 | r |= uint64_t(i_extract<id*2+1>(t)) << 32; |
| 253 | return r; |
| 254 | #else |
| 255 | return _mm_extract_epi64(a.native(), id); |
| 256 | #endif |
| 257 | #elif SIMDPP_USE_SSE2 |
| 258 | #if SIMDPP_32_BITS |
| 259 | uint32x4 t = uint32x4(a); |
| 260 | uint64_t r = 0; |
| 261 | t = move4_l<id*2>(t); // when id==0, move_l is template-specialized and does nothing |
| 262 | r = i_extract<0>(t); |
| 263 | t = move4_l<1>(t); |
| 264 | r |= uint64_t(i_extract<0>(t)) << 32; |
| 265 | return r; |
| 266 | #else |
| 267 | uint64x2 t = a; |
| 268 | if (id != 0) { |
| 269 | t = move2_l<id>(t); |
| 270 | } |
| 271 | return _mm_cvtsi128_si64(t.native()); |
| 272 | #endif |
| 273 | #elif SIMDPP_USE_NEON |
| 274 | return vgetq_lane_u64(a.native(), id); |
| 275 | #elif SIMDPP_USE_ALTIVEC |
| 276 | detail::mem_block<uint64x2> ax(a); |
| 277 | return ax[id]; |
| 278 | #elif SIMDPP_USE_MSA |
| 279 | #if SIMDPP_64_BITS |
| 280 | return __msa_copy_u_d((v2i64) a.native(), id); |
| 281 | #else |
| 282 | v4i32 a32 = (v4i32) a.native(); |
| 283 | uint64_t lo = __msa_copy_u_w(a32, id*2); |
| 284 | uint64_t hi = __msa_copy_u_w(a32, id*2+1); |
| 285 | return lo | (hi << 32); |
| 286 | #endif |
| 287 | #endif |
| 288 | } |
| 289 | |
#if SIMDPP_USE_AVX2
// AVX2: split off the containing 128-bit lane and defer to the 2-element
// extractor.
template<unsigned id> SIMDPP_INL
uint64_t i_extract(const uint64<4>& a)
{
    const uint64<2> lane = _mm256_extracti128_si256(a.native(), id / 2);
    return i_extract<id % 2>(lane);
}
#endif
| 298 | |
#if SIMDPP_USE_AVX512F
// AVX512F: split off the containing 128-bit sub-vector and defer to the
// 2-element extractor.
template<unsigned id> SIMDPP_INL
uint64_t i_extract(const uint64<8>& a)
{
    const uint64<2> lane = _mm512_extracti32x4_epi32(a.native(), id / 2);
    return i_extract<id % 2>(lane);
}
#endif
| 307 | |
| 308 | // ----------------------------------------------------------------------------- |
| 309 | |
| 310 | template<unsigned id> SIMDPP_INL |
| 311 | int64_t (const int64<2>& a) |
| 312 | { |
| 313 | #if SIMDPP_USE_MSA |
| 314 | #if SIMDPP_64_BITS |
| 315 | return __msa_copy_s_d(a, id); |
| 316 | #else |
| 317 | v4i32 a32 = (v4i32) a.native(); |
| 318 | int64_t lo = __msa_copy_s_w(a32, id*2); |
| 319 | int64_t hi = __msa_copy_s_w(a32, id*2+1); |
| 320 | return lo | (hi << 32); |
| 321 | #endif |
| 322 | #else |
| 323 | return i_extract<id>(uint64x2(a)); |
| 324 | #endif |
| 325 | } |
| 326 | |
#if SIMDPP_USE_AVX2
// AVX2: split off the containing 128-bit lane and defer to the 2-element
// unsigned extractor; the bit pattern is identical for the signed result.
template<unsigned id> SIMDPP_INL
int64_t i_extract(const int64<4>& a)
{
    const uint64<2> lane = _mm256_extracti128_si256(a.native(), id / 2);
    return i_extract<id % 2>(lane);
}
#endif
| 335 | |
#if SIMDPP_USE_AVX512F
// AVX512F: split off the containing 128-bit sub-vector and defer to the
// 2-element unsigned extractor; the bit pattern is identical for the
// signed result.
template<unsigned id> SIMDPP_INL
int64_t i_extract(const int64<8>& a)
{
    const uint64<2> lane = _mm512_extracti32x4_epi32(a.native(), id / 2);
    return i_extract<id % 2>(lane);
}
#endif
| 344 | |
| 345 | // ----------------------------------------------------------------------------- |
| 346 | |
| 347 | template<unsigned id> SIMDPP_INL |
| 348 | float (const float32<4>& a) |
| 349 | { |
| 350 | #if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP |
| 351 | return a.el(id); |
| 352 | #elif SIMDPP_USE_SSE2 |
| 353 | return bit_cast<float>(i_extract<id>(int32x4(a))); |
| 354 | #elif SIMDPP_USE_NEON |
| 355 | return vgetq_lane_f32(a.native(), id); |
| 356 | #elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA |
| 357 | detail::mem_block<float32x4> ax(a); |
| 358 | return ax[id]; |
| 359 | #endif |
| 360 | } |
| 361 | |
#if SIMDPP_USE_AVX
// AVX: reduce to the containing 128-bit lane, extract through the integer
// domain and reinterpret the bits back to float.
template<unsigned id> SIMDPP_INL
float i_extract(const float32<8>& a)
{
    const __m128 lane = _mm256_extractf128_ps(a.native(), id / 4);
    return bit_cast<float>(_mm_extract_epi32(_mm_castps_si128(lane), id % 4));
}
#endif
| 370 | |
#if SIMDPP_USE_AVX512F
// AVX512F: reduce to the containing 128-bit sub-vector, extract through the
// integer domain and reinterpret the bits back to float.
template<unsigned id> SIMDPP_INL
float i_extract(const float32<16>& a)
{
    const __m128 lane = _mm512_extractf32x4_ps(a.native(), id / 4);
    return bit_cast<float>(_mm_extract_epi32(_mm_castps_si128(lane), id % 4));
}
#endif
| 379 | |
| 380 | // ----------------------------------------------------------------------------- |
| 381 | |
| 382 | template<unsigned id> SIMDPP_INL |
| 383 | double (const float64<2>& a) |
| 384 | { |
| 385 | #if SIMDPP_USE_NULL |
| 386 | return a.el(id); |
| 387 | #elif SIMDPP_USE_SSE2 |
| 388 | return bit_cast<double>(i_extract<id>(int64x2(a))); |
| 389 | #elif SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA |
| 390 | detail::mem_block<float64x2> ax(a); |
| 391 | return ax[id]; |
| 392 | #elif SIMDPP_USE_NEON64 |
| 393 | return vgetq_lane_f64(a.native(), id); |
| 394 | #endif |
| 395 | } |
| 396 | |
#if SIMDPP_USE_AVX
// AVX: take the containing 128-bit lane, extract through the integer domain
// and reinterpret the bits back to double.
template<unsigned id> SIMDPP_INL
double i_extract(const float64<4>& a)
{
    const __m128d lane = _mm256_extractf128_pd(a.native(), id / 2);
    return bit_cast<double>(i_extract<id % 2>((uint64<2>)_mm_castpd_si128(lane)));
}
#endif
| 405 | |
#if SIMDPP_USE_AVX512F
// AVX512F: there is no direct f64 sub-vector extract here, so view the
// register as f32, take the containing 128-bit sub-vector, extract through
// the integer domain and reinterpret the bits back to double.
template<unsigned id> SIMDPP_INL
double i_extract(const float64<8>& a)
{
    const __m128 lane = _mm512_extractf32x4_ps(_mm512_castpd_ps(a.native()), id / 2);
    return bit_cast<double>(i_extract<id % 2>((uint64<2>)_mm_castps_si128(lane)));
}
#endif
| 414 | |
| 415 | // ----------------------------------------------------------------------------- |
| 416 | |
| 417 | template<unsigned id, class V> SIMDPP_INL |
| 418 | typename V::element_type (const V& a) |
| 419 | { |
| 420 | typename V::base_vector_type base = a.vec(id / V::base_length); |
| 421 | return i_extract<id % V::base_length>(base); |
| 422 | } |
| 423 | |
| 424 | } // namespace insn |
| 425 | } // namespace detail |
| 426 | } // namespace SIMDPP_ARCH_NAMESPACE |
| 427 | } // namespace simdpp |
| 428 | |
| 429 | #endif |
| 430 | |