| 1 | /* Copyright (C) 2011-2017 Povilas Kanapickas <povilas@radix.lt> |
| 2 | |
| 3 | Distributed under the Boost Software License, Version 1.0. |
| 4 | (See accompanying file LICENSE_1_0.txt or copy at |
| 5 | http://www.boost.org/LICENSE_1_0.txt) |
| 6 | */ |
| 7 | |
| 8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_INSERT_H |
| 9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_INSERT_H |
| 10 | |
| 11 | #ifndef LIBSIMDPP_SIMD_H |
| 12 | #error "This file must be included through simd.h" |
| 13 | #endif |
| 14 | |
| 15 | #include <simdpp/types.h> |
| 16 | #include <simdpp/core/cast.h> |
| 17 | #include <simdpp/core/move_l.h> |
| 18 | #include <simdpp/core/i_shift_l.h> |
| 19 | #include <simdpp/core/i_sub.h> |
| 20 | #include <simdpp/core/make_int.h> |
| 21 | #include <simdpp/detail/insn/split.h> |
| 22 | #include <simdpp/detail/mem_block.h> |
| 23 | |
| 24 | namespace simdpp { |
| 25 | namespace SIMDPP_ARCH_NAMESPACE { |
| 26 | namespace detail { |
| 27 | namespace insn { |
| 28 | |
// Inserts scalar x into lane 'id' of a 16x8-bit vector; all other lanes
// are passed through unchanged. 'id' is a compile-time lane index, which
// lets each backend use its immediate-operand insert instruction.
template<unsigned id> SIMDPP_INL
uint8x16 i_insert(const uint8x16& ca, uint8_t x)
{
    uint8<16> a = ca;
#if SIMDPP_USE_NULL
    a.el(id) = x;
    return a;
#elif SIMDPP_USE_SSE4_1
    return _mm_insert_epi8(a.native(), x, id);
#elif SIMDPP_USE_SSE2
    // SSE2 has no byte-granularity insert: read the 16-bit lane containing
    // byte 'id', patch the appropriate byte, then write the lane back.
    uint16_t r = _mm_extract_epi16(a.native(), id/2);
    if (id % 2 == 1) {
        r = (r & 0x00ff) | (x << 8);  // odd index: replace the high byte
    } else {
        r = (r & 0xff00) | x;         // even index: replace the low byte
    }
    a = _mm_insert_epi16(a.native(), r, id/2);
    return a;
#elif SIMDPP_USE_NEON
    return vsetq_lane_u8(x, a.native(), id);
#elif SIMDPP_USE_ALTIVEC
    // No scalar-insert intrinsic: round-trip through a memory block.
    detail::mem_block<uint8x16> ax(a);
    ax[id] = x;
    a = ax;
    return a;
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_insert_b((v16i8) a.native(), id, x);
#endif
}
| 58 | |
#if SIMDPP_USE_AVX2
// Inserts scalar x into lane 'id' of a 32x8-bit vector.
template<unsigned id> SIMDPP_INL
uint8<32> i_insert(const uint8<32>& a, uint8_t x)
{
    // Patch the 128-bit half that holds lane 'id', then merge it back.
    const unsigned half = id / 16;
    __m128i part = _mm256_extracti128_si256(a.native(), half);
    part = _mm_insert_epi8(part, x, id % 16);
    return _mm256_inserti128_si256(a.native(), part, half);
}
#endif
| 69 | |
#if SIMDPP_USE_AVX512BW
// Inserts scalar x into lane 'id' of a 64x8-bit vector.
template<unsigned id> SIMDPP_INL
uint8<64> i_insert(const uint8<64>& a, uint8_t x)
{
    // Patch the 128-bit quarter that holds lane 'id', then merge it back.
    const unsigned quarter = id / 16;
    __m128i part = _mm512_extracti32x4_epi32(a.native(), quarter);
    part = _mm_insert_epi8(part, x, id % 16);
    return _mm512_inserti32x4(a.native(), part, quarter);
}
#endif
| 80 | |
| 81 | // ----------------------------------------------------------------------------- |
| 82 | |
// Inserts scalar x into lane 'id' of an 8x16-bit vector; all other lanes
// are passed through unchanged.
template<unsigned id> SIMDPP_INL
uint16x8 i_insert(const uint16x8& ca, uint16_t x)
{
    uint16<8> a = ca;
#if SIMDPP_USE_NULL
    a.el(id) = x;
    return a;
#elif SIMDPP_USE_SSE2
    return _mm_insert_epi16(a.native(), x, id);
#elif SIMDPP_USE_NEON
    return vsetq_lane_u16(x, a.native(), id);
#elif SIMDPP_USE_ALTIVEC
    // No scalar-insert intrinsic: round-trip through a memory block.
    detail::mem_block<uint16x8> ax(a);
    ax[id] = x;
    a = ax;
    return a;
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_insert_h((v8i16) a.native(), id, x);
#endif
}
| 103 | |
#if SIMDPP_USE_AVX2
// Inserts scalar x into lane 'id' of a 16x16-bit vector.
template<unsigned id> SIMDPP_INL
uint16<16> i_insert(const uint16<16>& a, uint16_t x)
{
    // Patch the 128-bit half that holds lane 'id', then merge it back.
    const unsigned half = id / 8;
    __m128i part = _mm256_extracti128_si256(a.native(), half);
    part = _mm_insert_epi16(part, x, id % 8);
    return _mm256_inserti128_si256(a.native(), part, half);
}
#endif
| 114 | |
#if SIMDPP_USE_AVX512BW
// Inserts scalar x into lane 'id' of a 32x16-bit vector.
template<unsigned id> SIMDPP_INL
uint16<32> i_insert(const uint16<32>& a, uint16_t x)
{
    // Patch the 128-bit quarter that holds lane 'id', then merge it back.
    const unsigned quarter = id / 8;
    __m128i part = _mm512_extracti32x4_epi32(a.native(), quarter);
    part = _mm_insert_epi16(part, x, id % 8);
    return _mm512_inserti32x4(a.native(), part, quarter);
}
#endif
| 125 | |
| 126 | // ----------------------------------------------------------------------------- |
| 127 | |
// Inserts scalar x into lane 'id' of a 4x32-bit vector; all other lanes
// are passed through unchanged.
template<unsigned id> SIMDPP_INL
uint32x4 i_insert(const uint32x4& ca, uint32_t x)
{
    uint32<4> a = ca;
#if SIMDPP_USE_NULL
    a.el(id) = x;
    return a;
#elif SIMDPP_USE_SSE4_1
    return _mm_insert_epi32(a.native(), x, id);
#elif SIMDPP_USE_SSE2
    // SSE2 has no 32-bit insert: split x into two 16-bit halves and insert
    // each via the uint16 overload (which maps to _mm_insert_epi16).
    uint16_t lo = x & 0xffff;
    uint16_t hi = x >> 16;
    uint16x8 a1 = uint16<8>(a);
    a1 = i_insert<id*2>(a1, lo);
    a1 = i_insert<id*2+1>(a1, hi);
    return uint32<4>(a1);
#elif SIMDPP_USE_NEON
    return vsetq_lane_u32(x, a.native(), id);
#elif SIMDPP_USE_ALTIVEC
    // No scalar-insert intrinsic: round-trip through a memory block.
    detail::mem_block<uint32x4> ax(a);
    ax[id] = x;
    a = ax;
    return a;
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_insert_w((v4i32) a.native(), id, x);
#endif
}
| 155 | |
#if SIMDPP_USE_AVX2
// Inserts scalar x into lane 'id' of an 8x32-bit vector.
template<unsigned id> SIMDPP_INL
uint32<8> i_insert(const uint32<8>& a, uint32_t x)
{
    // Patch the 128-bit half that holds lane 'id', then merge it back.
    const unsigned half = id / 4;
    __m128i part = _mm256_extracti128_si256(a.native(), half);
    part = _mm_insert_epi32(part, x, id % 4);
    return _mm256_inserti128_si256(a.native(), part, half);
}
#endif
| 166 | |
#if SIMDPP_USE_AVX512F
// Inserts scalar x into lane 'id' of a 16x32-bit vector.
template<unsigned id> SIMDPP_INL
uint32<16> i_insert(const uint32<16>& a, uint32_t x)
{
    // Patch the 128-bit quarter that holds lane 'id', then merge it back.
    const unsigned quarter = id / 4;
    __m128i part = _mm512_extracti32x4_epi32(a.native(), quarter);
    part = _mm_insert_epi32(part, x, id % 4);
    return _mm512_inserti32x4(a.native(), part, quarter);
}
#endif
| 177 | |
| 178 | // ----------------------------------------------------------------------------- |
| 179 | |
// Inserts scalar x into lane 'id' of a 2x64-bit vector; the other lane is
// passed through unchanged. Several targets lack a 64-bit insert (notably
// 32-bit x86 and 32-bit MIPS), so x is split into 32-bit halves there.
template<unsigned id> SIMDPP_INL
uint64x2 i_insert(const uint64x2& ca, uint64_t x)
{
    uint64<2> a = ca;
#if SIMDPP_USE_NULL
    a.el(id) = x;
    return a;
#elif SIMDPP_USE_SSE4_1
#if SIMDPP_32_BITS
    // _mm_insert_epi64 is unavailable in 32-bit mode: insert the low and
    // high 32-bit halves of x separately via the uint32 overload.
    uint32x4 a0 = (uint32x4) a;
    a0 = i_insert<id*2>(a0, uint32_t(x));
    a0 = i_insert<id*2+1>(a0, uint32_t(x >> 32));
    return (uint64x2) a0;
#else
    return _mm_insert_epi64(a.native(), x, id);
#endif
#elif SIMDPP_USE_SSE2
#if SIMDPP_32_BITS
    // Build a vector whose lane 0 holds x (from two 32-bit loads zipped
    // together), then shuffle it into the requested lane of a.
    int32x4 va = _mm_cvtsi32_si128(uint32_t(x));
    int32x4 vb = _mm_cvtsi32_si128(uint32_t(x >> 32));
    int64x2 vx = (int64x2) zip4_lo(va, vb);
    if (id == 0) {
        a = shuffle1<0,1>(vx, a);   // result = { vx[0], a[1] }
    } else {
        a = shuffle1<0,0>(a, vx);   // result = { a[0], vx[0] }
    }
    return a;
#else
    // 64-bit mode: load x directly into lane 0, then shuffle into place.
    int64x2 vx = _mm_cvtsi64_si128(x);
    if (id == 0) {
        a = shuffle1<0,1>(vx, a);
    } else {
        a = shuffle1<0,0>(a, vx);
    }
    return a;
#endif
#elif SIMDPP_USE_NEON
    return vsetq_lane_u64(x, a.native(), id);
#elif SIMDPP_USE_ALTIVEC
    // No scalar-insert intrinsic: round-trip through a memory block.
    detail::mem_block<uint64x2> ax(a);
    ax[id] = x;
    a = ax;
    return a;
#elif SIMDPP_USE_MSA
#if SIMDPP_64_BITS
    return (v2u64) __msa_insert_d((v2i64) a.native(), id, x);
#else
    // 32-bit MIPS: insert the two 32-bit halves of x separately.
    int32<4> a32;
    a32 = a;
    a32 = __msa_insert_w(a32.native(), id*2, x);
    a32 = __msa_insert_w(a32.native(), id*2+1, x >> 32);
    return (uint64<2>) a32;
#endif
#endif
}
| 235 | |
#if SIMDPP_USE_AVX2
// Inserts scalar x into lane 'id' of a 4x64-bit vector.
template<unsigned id> SIMDPP_INL
uint64<4> i_insert(const uint64<4>& a, uint64_t x)
{
    // Delegate the lane patch to the 128-bit overload, then merge back.
    const unsigned half = id / 2;
    uint64<2> part = _mm256_extracti128_si256(a.native(), half);
    part = i_insert<id % 2>(part, x);
    return _mm256_inserti128_si256(a.native(), part.native(), half);
}
#endif
| 246 | |
#if SIMDPP_USE_AVX512F
// Inserts scalar x into lane 'id' of an 8x64-bit vector.
template<unsigned id> SIMDPP_INL
uint64<8> i_insert(const uint64<8>& a, uint64_t x)
{
    // Delegate the lane patch to the 128-bit overload, then merge back.
    const unsigned quarter = id / 2;
    uint64<2> part = _mm512_extracti32x4_epi32(a.native(), quarter);
    part = i_insert<id % 2>(part, x);
    return _mm512_inserti32x4(a.native(), part.native(), quarter);
}
#endif
| 257 | |
| 258 | // ----------------------------------------------------------------------------- |
| 259 | |
| 260 | template<unsigned id> SIMDPP_INL |
| 261 | float32x4 i_insert(const float32x4& a, float x) |
| 262 | { |
| 263 | #if SIMDPP_USE_NEON_FLT_SP |
| 264 | return vsetq_lane_f32(x, a.native(), id); |
| 265 | #else |
| 266 | return float32<4>(i_insert<id>(uint32<4>(a), bit_cast<uint32_t>(x))); |
| 267 | #endif |
| 268 | } |
| 269 | |
#if SIMDPP_USE_AVX
// Inserts scalar x into lane 'id' of an 8x32-bit float vector.
template<unsigned id> SIMDPP_INL
float32<8> i_insert(const float32<8>& a, float x)
{
    // Patch the 128-bit half that holds lane 'id', then merge it back.
    const unsigned half = id / 4;
    float32<4> part = _mm256_extractf128_ps(a.native(), half);
    part = i_insert<id % 4>(part, x);
    return _mm256_insertf128_ps(a.native(), part.native(), half);
}
#endif
| 280 | |
#if SIMDPP_USE_AVX512F
// Inserts scalar x into lane 'id' of a 16x32-bit float vector.
template<unsigned id> SIMDPP_INL
float32<16> i_insert(const float32<16>& a, float x)
{
    // Patch the 128-bit quarter that holds lane 'id', then merge it back.
    const unsigned quarter = id / 4;
    float32<4> part = _mm512_extractf32x4_ps(a.native(), quarter);
    part = i_insert<id % 4>(part, x);
    return _mm512_insertf32x4(a.native(), part.native(), quarter);
}
#endif
| 291 | |
| 292 | // ----------------------------------------------------------------------------- |
| 293 | |
| 294 | template<unsigned id> SIMDPP_INL |
| 295 | float64x2 i_insert(const float64x2& a, double x) |
| 296 | { |
| 297 | return float64<2>(i_insert<id>(uint64<2>(a), bit_cast<int64_t>(x))); |
| 298 | } |
| 299 | |
#if SIMDPP_USE_AVX
// Inserts scalar x into lane 'id' of a 4x64-bit float vector.
template<unsigned id> SIMDPP_INL
float64<4> i_insert(const float64<4>& a, double x)
{
    // Patch the 128-bit half that holds lane 'id', then merge it back.
    const unsigned half = id / 2;
    float64<2> part = _mm256_extractf128_pd(a.native(), half);
    part = i_insert<id % 2>(part, x);
    return _mm256_insertf128_pd(a.native(), part.native(), half);
}
#endif
| 310 | |
#if SIMDPP_USE_AVX512F
// Inserts scalar x into lane 'id' of an 8x64-bit float vector.
template<unsigned id> SIMDPP_INL
float64<8> i_insert(const float64<8>& a, double x)
{
    // Goes through f32 casts — presumably because the 128-bit f64
    // extract/insert forms require AVX512DQ rather than plain AVX512F;
    // TODO confirm against the intrinsics reference.
    __m512 as_ps = _mm512_castpd_ps(a.native());
    const unsigned quarter = id / 2;
    float64<2> part = _mm_castps_pd(_mm512_extractf32x4_ps(as_ps, quarter));
    part = i_insert<id % 2>(part, x);
    __m512 merged = _mm512_insertf32x4(as_ps, _mm_castpd_ps(part.native()), quarter);
    return _mm512_castps_pd(merged);
}
#endif
| 321 | |
| 322 | // ----------------------------------------------------------------------------- |
| 323 | |
| 324 | template<unsigned id, class V, class E> SIMDPP_INL |
| 325 | V i_insert(const V& ca, E el) |
| 326 | { |
| 327 | V a = ca; |
| 328 | typename V::base_vector_type base = a.vec(id / V::base_length); |
| 329 | base = i_insert<id % V::base_length>(base, (typename V::element_type) el); |
| 330 | a.vec(id / V::base_length) = base; |
| 331 | return a; |
| 332 | } |
| 333 | |
| 334 | } // namespace insn |
| 335 | } // namespace detail |
| 336 | } // namespace SIMDPP_ARCH_NAMESPACE |
| 337 | } // namespace simdpp |
| 338 | |
| 339 | #endif |
| 340 | |