1/* Copyright (C) 2011-2017 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_INSERT_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_INSERT_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/core/cast.h>
17#include <simdpp/core/move_l.h>
18#include <simdpp/core/i_shift_l.h>
19#include <simdpp/core/i_sub.h>
20#include <simdpp/core/make_int.h>
21#include <simdpp/detail/insn/split.h>
22#include <simdpp/detail/mem_block.h>
23
24namespace simdpp {
25namespace SIMDPP_ARCH_NAMESPACE {
26namespace detail {
27namespace insn {
28
29template<unsigned id> SIMDPP_INL
30uint8x16 i_insert(const uint8x16& ca, uint8_t x)
31{
32 uint8<16> a = ca;
33#if SIMDPP_USE_NULL
34 a.el(id) = x;
35 return a;
36#elif SIMDPP_USE_SSE4_1
37 return _mm_insert_epi8(a.native(), x, id);
38#elif SIMDPP_USE_SSE2
39 uint16_t r = _mm_extract_epi16(a.native(), id/2);
40 if (id % 2 == 1) {
41 r = (r & 0x00ff) | (x << 8);
42 } else {
43 r = (r & 0xff00) | x;
44 }
45 a = _mm_insert_epi16(a.native(), r, id/2);
46 return a;
47#elif SIMDPP_USE_NEON
48 return vsetq_lane_u8(x, a.native(), id);
49#elif SIMDPP_USE_ALTIVEC
50 detail::mem_block<uint8x16> ax(a);
51 ax[id] = x;
52 a = ax;
53 return a;
54#elif SIMDPP_USE_MSA
55 return (v16u8) __msa_insert_b((v16i8) a.native(), id, x);
56#endif
57}
58
#if SIMDPP_USE_AVX2
// 256-bit variant: element `id` lives in 128-bit lane id / 16 at position
// id % 16. The lane is pulled out, modified and written back.
template<unsigned id> SIMDPP_INL
uint8<32> i_insert(const uint8<32>& a, uint8_t x)
{
    __m256i whole = a.native();
    __m128i lane = _mm256_extracti128_si256(whole, id / 16);
    lane = _mm_insert_epi8(lane, x, id % 16);
    return _mm256_inserti128_si256(whole, lane, id / 16);
}
#endif
69
#if SIMDPP_USE_AVX512BW
// 512-bit variant: element `id` lives in 128-bit lane id / 16 at position
// id % 16. The lane is pulled out, modified and written back.
template<unsigned id> SIMDPP_INL
uint8<64> i_insert(const uint8<64>& a, uint8_t x)
{
    __m512i whole = a.native();
    __m128i lane = _mm512_extracti32x4_epi32(whole, id / 16);
    lane = _mm_insert_epi8(lane, x, id % 16);
    return _mm512_inserti32x4(whole, lane, id / 16);
}
#endif
80
81// -----------------------------------------------------------------------------
82
83template<unsigned id> SIMDPP_INL
84uint16x8 i_insert(const uint16x8& ca, uint16_t x)
85{
86 uint16<8> a = ca;
87#if SIMDPP_USE_NULL
88 a.el(id) = x;
89 return a;
90#elif SIMDPP_USE_SSE2
91 return _mm_insert_epi16(a.native(), x, id);
92#elif SIMDPP_USE_NEON
93 return vsetq_lane_u16(x, a.native(), id);
94#elif SIMDPP_USE_ALTIVEC
95 detail::mem_block<uint16x8> ax(a);
96 ax[id] = x;
97 a = ax;
98 return a;
99#elif SIMDPP_USE_MSA
100 return (v8u16) __msa_insert_h((v8i16) a.native(), id, x);
101#endif
102}
103
#if SIMDPP_USE_AVX2
// 256-bit variant: element `id` lives in 128-bit lane id / 8 at position
// id % 8. The lane is pulled out, modified and written back.
template<unsigned id> SIMDPP_INL
uint16<16> i_insert(const uint16<16>& a, uint16_t x)
{
    __m256i whole = a.native();
    __m128i lane = _mm256_extracti128_si256(whole, id / 8);
    lane = _mm_insert_epi16(lane, x, id % 8);
    return _mm256_inserti128_si256(whole, lane, id / 8);
}
#endif
114
#if SIMDPP_USE_AVX512BW
// 512-bit variant: element `id` lives in 128-bit lane id / 8 at position
// id % 8. The lane is pulled out, modified and written back.
template<unsigned id> SIMDPP_INL
uint16<32> i_insert(const uint16<32>& a, uint16_t x)
{
    __m512i whole = a.native();
    __m128i lane = _mm512_extracti32x4_epi32(whole, id / 8);
    lane = _mm_insert_epi16(lane, x, id % 8);
    return _mm512_inserti32x4(whole, lane, id / 8);
}
#endif
125
126// -----------------------------------------------------------------------------
127
128template<unsigned id> SIMDPP_INL
129uint32x4 i_insert(const uint32x4& ca, uint32_t x)
130{
131 uint32<4> a = ca;
132#if SIMDPP_USE_NULL
133 a.el(id) = x;
134 return a;
135#elif SIMDPP_USE_SSE4_1
136 return _mm_insert_epi32(a.native(), x, id);
137#elif SIMDPP_USE_SSE2
138 uint16_t lo = x & 0xffff;
139 uint16_t hi = x >> 16;
140 uint16x8 a1 = uint16<8>(a);
141 a1 = i_insert<id*2>(a1, lo);
142 a1 = i_insert<id*2+1>(a1, hi);
143 return uint32<4>(a1);
144#elif SIMDPP_USE_NEON
145 return vsetq_lane_u32(x, a.native(), id);
146#elif SIMDPP_USE_ALTIVEC
147 detail::mem_block<uint32x4> ax(a);
148 ax[id] = x;
149 a = ax;
150 return a;
151#elif SIMDPP_USE_MSA
152 return (v4u32) __msa_insert_w((v4i32) a.native(), id, x);
153#endif
154}
155
#if SIMDPP_USE_AVX2
// 256-bit variant: element `id` lives in 128-bit lane id / 4 at position
// id % 4. The lane is pulled out, modified and written back.
template<unsigned id> SIMDPP_INL
uint32<8> i_insert(const uint32<8>& a, uint32_t x)
{
    __m256i whole = a.native();
    __m128i lane = _mm256_extracti128_si256(whole, id / 4);
    lane = _mm_insert_epi32(lane, x, id % 4);
    return _mm256_inserti128_si256(whole, lane, id / 4);
}
#endif
166
#if SIMDPP_USE_AVX512F
// 512-bit variant: element `id` lives in 128-bit lane id / 4 at position
// id % 4. The lane is pulled out, modified and written back.
template<unsigned id> SIMDPP_INL
uint32<16> i_insert(const uint32<16>& a, uint32_t x)
{
    __m512i whole = a.native();
    __m128i lane = _mm512_extracti32x4_epi32(whole, id / 4);
    lane = _mm_insert_epi32(lane, x, id % 4);
    return _mm512_inserti32x4(whole, lane, id / 4);
}
#endif
177
178// -----------------------------------------------------------------------------
179
180template<unsigned id> SIMDPP_INL
181uint64x2 i_insert(const uint64x2& ca, uint64_t x)
182{
183 uint64<2> a = ca;
184#if SIMDPP_USE_NULL
185 a.el(id) = x;
186 return a;
187#elif SIMDPP_USE_SSE4_1
188#if SIMDPP_32_BITS
189 uint32x4 a0 = (uint32x4) a;
190 a0 = i_insert<id*2>(a0, uint32_t(x));
191 a0 = i_insert<id*2+1>(a0, uint32_t(x >> 32));
192 return (uint64x2) a0;
193#else
194 return _mm_insert_epi64(a.native(), x, id);
195#endif
196#elif SIMDPP_USE_SSE2
197#if SIMDPP_32_BITS
198 int32x4 va = _mm_cvtsi32_si128(uint32_t(x));
199 int32x4 vb = _mm_cvtsi32_si128(uint32_t(x >> 32));
200 int64x2 vx = (int64x2) zip4_lo(va, vb);
201 if (id == 0) {
202 a = shuffle1<0,1>(vx, a);
203 } else {
204 a = shuffle1<0,0>(a, vx);
205 }
206 return a;
207#else
208 int64x2 vx = _mm_cvtsi64_si128(x);
209 if (id == 0) {
210 a = shuffle1<0,1>(vx, a);
211 } else {
212 a = shuffle1<0,0>(a, vx);
213 }
214 return a;
215#endif
216#elif SIMDPP_USE_NEON
217 return vsetq_lane_u64(x, a.native(), id);
218#elif SIMDPP_USE_ALTIVEC
219 detail::mem_block<uint64x2> ax(a);
220 ax[id] = x;
221 a = ax;
222 return a;
223#elif SIMDPP_USE_MSA
224#if SIMDPP_64_BITS
225 return (v2u64) __msa_insert_d((v2i64) a.native(), id, x);
226#else
227 int32<4> a32;
228 a32 = a;
229 a32 = __msa_insert_w(a32.native(), id*2, x);
230 a32 = __msa_insert_w(a32.native(), id*2+1, x >> 32);
231 return (uint64<2>) a32;
232#endif
233#endif
234}
235
#if SIMDPP_USE_AVX2
// 256-bit variant: element `id` lives in 128-bit lane id / 2 at position
// id % 2. The lane is pulled out, updated through the uint64<2> overload of
// i_insert and written back.
template<unsigned id> SIMDPP_INL
uint64<4> i_insert(const uint64<4>& a, uint64_t x)
{
    __m256i whole = a.native();
    uint64<2> lane = _mm256_extracti128_si256(whole, id / 2);
    lane = i_insert<id % 2>(lane, x);
    return _mm256_inserti128_si256(whole, lane.native(), id / 2);
}
#endif
246
#if SIMDPP_USE_AVX512F
// 512-bit variant: element `id` lives in 128-bit lane id / 2 at position
// id % 2. The lane is pulled out, updated through the uint64<2> overload of
// i_insert and written back.
template<unsigned id> SIMDPP_INL
uint64<8> i_insert(const uint64<8>& a, uint64_t x)
{
    __m512i whole = a.native();
    uint64<2> lane = _mm512_extracti32x4_epi32(whole, id / 2);
    lane = i_insert<id % 2>(lane, x);
    return _mm512_inserti32x4(whole, lane.native(), id / 2);
}
#endif
257
258// -----------------------------------------------------------------------------
259
260template<unsigned id> SIMDPP_INL
261float32x4 i_insert(const float32x4& a, float x)
262{
263#if SIMDPP_USE_NEON_FLT_SP
264 return vsetq_lane_f32(x, a.native(), id);
265#else
266 return float32<4>(i_insert<id>(uint32<4>(a), bit_cast<uint32_t>(x)));
267#endif
268}
269
#if SIMDPP_USE_AVX
// 256-bit variant: element `id` lives in 128-bit lane id / 4 at position
// id % 4. The lane is pulled out, updated through the float32<4> overload of
// i_insert and written back.
template<unsigned id> SIMDPP_INL
float32<8> i_insert(const float32<8>& a, float x)
{
    __m256 whole = a.native();
    float32<4> lane = _mm256_extractf128_ps(whole, id / 4);
    lane = i_insert<id % 4>(lane, x);
    return _mm256_insertf128_ps(whole, lane.native(), id / 4);
}
#endif
280
#if SIMDPP_USE_AVX512F
// 512-bit variant: element `id` lives in 128-bit lane id / 4 at position
// id % 4. The lane is pulled out, updated through the float32<4> overload of
// i_insert and written back.
template<unsigned id> SIMDPP_INL
float32<16> i_insert(const float32<16>& a, float x)
{
    __m512 whole = a.native();
    float32<4> lane = _mm512_extractf32x4_ps(whole, id / 4);
    lane = i_insert<id % 4>(lane, x);
    return _mm512_insertf32x4(whole, lane.native(), id / 4);
}
#endif
291
292// -----------------------------------------------------------------------------
293
294template<unsigned id> SIMDPP_INL
295float64x2 i_insert(const float64x2& a, double x)
296{
297 return float64<2>(i_insert<id>(uint64<2>(a), bit_cast<int64_t>(x)));
298}
299
#if SIMDPP_USE_AVX
// 256-bit variant: element `id` lives in 128-bit lane id / 2 at position
// id % 2. The lane is pulled out, updated through the float64<2> overload of
// i_insert and written back.
template<unsigned id> SIMDPP_INL
float64<4> i_insert(const float64<4>& a, double x)
{
    __m256d whole = a.native();
    float64<2> lane = _mm256_extractf128_pd(whole, id / 2);
    lane = i_insert<id % 2>(lane, x);
    return _mm256_insertf128_pd(whole, lane.native(), id / 2);
}
#endif
310
#if SIMDPP_USE_AVX512F
// 512-bit variant: element `id` lives in 128-bit lane id / 2 at position
// id % 2. The 128-bit extract/insert used here are the f32x4 forms, so the
// double vector is reinterpreted as single-precision for those two steps and
// cast back around them; the casts do not change any bits.
template<unsigned id> SIMDPP_INL
float64<8> i_insert(const float64<8>& a, double x)
{
    __m512 whole = _mm512_castpd_ps(a.native());
    float64<2> lane = _mm_castps_pd(_mm512_extractf32x4_ps(whole, id / 2));
    lane = i_insert<id % 2>(lane, x);
    return _mm512_castps_pd(
        _mm512_insertf32x4(whole, _mm_castpd_ps(lane.native()), id / 2));
}
#endif
321
322// -----------------------------------------------------------------------------
323
324template<unsigned id, class V, class E> SIMDPP_INL
325V i_insert(const V& ca, E el)
326{
327 V a = ca;
328 typename V::base_vector_type base = a.vec(id / V::base_length);
329 base = i_insert<id % V::base_length>(base, (typename V::element_type) el);
330 a.vec(id / V::base_length) = base;
331 return a;
332}
333
334} // namespace insn
335} // namespace detail
336} // namespace SIMDPP_ARCH_NAMESPACE
337} // namespace simdpp
338
339#endif
340