1 | /* Copyright (C) 2011-2017 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_INSERT_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_INSERT_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/core/cast.h> |
17 | #include <simdpp/core/move_l.h> |
18 | #include <simdpp/core/i_shift_l.h> |
19 | #include <simdpp/core/i_sub.h> |
20 | #include <simdpp/core/make_int.h> |
21 | #include <simdpp/detail/insn/split.h> |
22 | #include <simdpp/detail/mem_block.h> |
23 | |
24 | namespace simdpp { |
25 | namespace SIMDPP_ARCH_NAMESPACE { |
26 | namespace detail { |
27 | namespace insn { |
28 | |
template<unsigned id> SIMDPP_INL
uint8x16 i_insert(const uint8x16& ca, uint8_t x)
{
    // Returns a copy of ca with the 8-bit element at compile-time position
    // id replaced by x. Exactly one implementation is selected depending on
    // the target instruction set.
    uint8<16> a = ca;
#if SIMDPP_USE_NULL
    a.el(id) = x;
    return a;
#elif SIMDPP_USE_SSE4_1
    return _mm_insert_epi8(a.native(), x, id);
#elif SIMDPP_USE_SSE2
    // SSE2 has no 8-bit insert: extract the 16-bit lane that contains the
    // target byte, patch the relevant half, and insert the lane back.
    uint16_t r = _mm_extract_epi16(a.native(), id/2);
    if (id % 2 == 1) {
        // Odd index: replace the high byte of the 16-bit lane.
        r = (r & 0x00ff) | (x << 8);
    } else {
        // Even index: replace the low byte of the 16-bit lane.
        r = (r & 0xff00) | x;
    }
    a = _mm_insert_epi16(a.native(), r, id/2);
    return a;
#elif SIMDPP_USE_NEON
    return vsetq_lane_u8(x, a.native(), id);
#elif SIMDPP_USE_ALTIVEC
    // No direct single-element insert here: round-trip through memory.
    detail::mem_block<uint8x16> ax(a);
    ax[id] = x;
    a = ax;
    return a;
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_insert_b((v16i8) a.native(), id, x);
#endif
}
58 | |
59 | #if SIMDPP_USE_AVX2 |
60 | template<unsigned id> SIMDPP_INL |
61 | uint8<32> i_insert(const uint8<32>& a, uint8_t x) |
62 | { |
63 | __m256i val = a.native(); |
64 | __m128i val128 = _mm256_extracti128_si256(val, id / 16); |
65 | val128 = _mm_insert_epi8(val128, x, id % 16); |
66 | return _mm256_inserti128_si256(val, val128, id / 16); |
67 | } |
68 | #endif |
69 | |
70 | #if SIMDPP_USE_AVX512BW |
71 | template<unsigned id> SIMDPP_INL |
72 | uint8<64> i_insert(const uint8<64>& a, uint8_t x) |
73 | { |
74 | __m512i val = a.native(); |
75 | __m128i val128 = _mm512_extracti32x4_epi32(val, id / 16); |
76 | val128 = _mm_insert_epi8(val128, x, id % 16); |
77 | return _mm512_inserti32x4(val, val128, id / 16); |
78 | } |
79 | #endif |
80 | |
81 | // ----------------------------------------------------------------------------- |
82 | |
template<unsigned id> SIMDPP_INL
uint16x8 i_insert(const uint16x8& ca, uint16_t x)
{
    // Returns a copy of ca with the 16-bit element at compile-time position
    // id replaced by x. Exactly one implementation is selected depending on
    // the target instruction set.
    uint16<8> a = ca;
#if SIMDPP_USE_NULL
    a.el(id) = x;
    return a;
#elif SIMDPP_USE_SSE2
    return _mm_insert_epi16(a.native(), x, id);
#elif SIMDPP_USE_NEON
    return vsetq_lane_u16(x, a.native(), id);
#elif SIMDPP_USE_ALTIVEC
    // No direct single-element insert here: round-trip through memory.
    detail::mem_block<uint16x8> ax(a);
    ax[id] = x;
    a = ax;
    return a;
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_insert_h((v8i16) a.native(), id, x);
#endif
}
103 | |
104 | #if SIMDPP_USE_AVX2 |
105 | template<unsigned id> SIMDPP_INL |
106 | uint16<16> i_insert(const uint16<16>& a, uint16_t x) |
107 | { |
108 | __m256i val = a.native(); |
109 | __m128i val128 = _mm256_extracti128_si256(val, id / 8); |
110 | val128 = _mm_insert_epi16(val128, x, id % 8); |
111 | return _mm256_inserti128_si256(val, val128, id / 8); |
112 | } |
113 | #endif |
114 | |
115 | #if SIMDPP_USE_AVX512BW |
116 | template<unsigned id> SIMDPP_INL |
117 | uint16<32> i_insert(const uint16<32>& a, uint16_t x) |
118 | { |
119 | __m512i val = a.native(); |
120 | __m128i val128 = _mm512_extracti32x4_epi32(val, id / 8); |
121 | val128 = _mm_insert_epi16(val128, x, id % 8); |
122 | return _mm512_inserti32x4(val, val128, id / 8); |
123 | } |
124 | #endif |
125 | |
126 | // ----------------------------------------------------------------------------- |
127 | |
template<unsigned id> SIMDPP_INL
uint32x4 i_insert(const uint32x4& ca, uint32_t x)
{
    // Returns a copy of ca with the 32-bit element at compile-time position
    // id replaced by x. Exactly one implementation is selected depending on
    // the target instruction set.
    uint32<4> a = ca;
#if SIMDPP_USE_NULL
    a.el(id) = x;
    return a;
#elif SIMDPP_USE_SSE4_1
    return _mm_insert_epi32(a.native(), x, id);
#elif SIMDPP_USE_SSE2
    // SSE2 has no 32-bit insert: view the vector as 16-bit lanes and insert
    // the low and high halves of x separately.
    uint16_t lo = x & 0xffff;
    uint16_t hi = x >> 16;
    uint16x8 a1 = uint16<8>(a);
    a1 = i_insert<id*2>(a1, lo);
    a1 = i_insert<id*2+1>(a1, hi);
    return uint32<4>(a1);
#elif SIMDPP_USE_NEON
    return vsetq_lane_u32(x, a.native(), id);
#elif SIMDPP_USE_ALTIVEC
    // No direct single-element insert here: round-trip through memory.
    detail::mem_block<uint32x4> ax(a);
    ax[id] = x;
    a = ax;
    return a;
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_insert_w((v4i32) a.native(), id, x);
#endif
}
155 | |
156 | #if SIMDPP_USE_AVX2 |
157 | template<unsigned id> SIMDPP_INL |
158 | uint32<8> i_insert(const uint32<8>& a, uint32_t x) |
159 | { |
160 | __m256i val = a.native(); |
161 | __m128i val128 = _mm256_extracti128_si256(val, id / 4); |
162 | val128 = _mm_insert_epi32(val128, x, id % 4); |
163 | return _mm256_inserti128_si256(val, val128, id / 4); |
164 | } |
165 | #endif |
166 | |
167 | #if SIMDPP_USE_AVX512F |
168 | template<unsigned id> SIMDPP_INL |
169 | uint32<16> i_insert(const uint32<16>& a, uint32_t x) |
170 | { |
171 | __m512i val = a.native(); |
172 | __m128i val128 = _mm512_extracti32x4_epi32(val, id / 4); |
173 | val128 = _mm_insert_epi32(val128, x, id % 4); |
174 | return _mm512_inserti32x4(val, val128, id / 4); |
175 | } |
176 | #endif |
177 | |
178 | // ----------------------------------------------------------------------------- |
179 | |
template<unsigned id> SIMDPP_INL
uint64x2 i_insert(const uint64x2& ca, uint64_t x)
{
    // Returns a copy of ca with the 64-bit element at compile-time position
    // id replaced by x. Exactly one implementation is selected per target
    // ISA, with extra fallbacks for 32-bit builds that lack 64-bit
    // scalar<->vector transfers.
    uint64<2> a = ca;
#if SIMDPP_USE_NULL
    a.el(id) = x;
    return a;
#elif SIMDPP_USE_SSE4_1
#if SIMDPP_32_BITS
    // _mm_insert_epi64 is unusable on 32-bit targets: insert the two 32-bit
    // halves of x separately via the 32-bit overload.
    uint32x4 a0 = (uint32x4) a;
    a0 = i_insert<id*2>(a0, uint32_t(x));
    a0 = i_insert<id*2+1>(a0, uint32_t(x >> 32));
    return (uint64x2) a0;
#else
    return _mm_insert_epi64(a.native(), x, id);
#endif
#elif SIMDPP_USE_SSE2
#if SIMDPP_32_BITS
    // Build a vector whose low 64 bits are x from two 32-bit scalar moves
    // zipped together, then combine it with a so that x lands in slot id.
    // NOTE(review): shuffle1<s0,s1>(p,q) appears to pick one 64-bit element
    // from each operand — confirm against its definition.
    int32x4 va = _mm_cvtsi32_si128(uint32_t(x));
    int32x4 vb = _mm_cvtsi32_si128(uint32_t(x >> 32));
    int64x2 vx = (int64x2) zip4_lo(va, vb);
    if (id == 0) {
        a = shuffle1<0,1>(vx, a);
    } else {
        a = shuffle1<0,0>(a, vx);
    }
    return a;
#else
    // 64-bit mode: move x into the low element and combine with shuffle1.
    int64x2 vx = _mm_cvtsi64_si128(x);
    if (id == 0) {
        a = shuffle1<0,1>(vx, a);
    } else {
        a = shuffle1<0,0>(a, vx);
    }
    return a;
#endif
#elif SIMDPP_USE_NEON
    return vsetq_lane_u64(x, a.native(), id);
#elif SIMDPP_USE_ALTIVEC
    // No direct single-element insert here: round-trip through memory.
    detail::mem_block<uint64x2> ax(a);
    ax[id] = x;
    a = ax;
    return a;
#elif SIMDPP_USE_MSA
#if SIMDPP_64_BITS
    return (v2u64) __msa_insert_d((v2i64) a.native(), id, x);
#else
    // 32-bit MSA: insert the two 32-bit halves of x separately.
    int32<4> a32;
    a32 = a;
    a32 = __msa_insert_w(a32.native(), id*2, x);
    a32 = __msa_insert_w(a32.native(), id*2+1, x >> 32);
    return (uint64<2>) a32;
#endif
#endif
}
235 | |
236 | #if SIMDPP_USE_AVX2 |
237 | template<unsigned id> SIMDPP_INL |
238 | uint64<4> i_insert(const uint64<4>& a, uint64_t x) |
239 | { |
240 | __m256i val = a.native(); |
241 | uint64<2> val128 = _mm256_extracti128_si256(val, id / 2); |
242 | val128 = i_insert<id % 2>(val128, x); |
243 | return _mm256_inserti128_si256(val, val128.native(), id / 2); |
244 | } |
245 | #endif |
246 | |
247 | #if SIMDPP_USE_AVX512F |
248 | template<unsigned id> SIMDPP_INL |
249 | uint64<8> i_insert(const uint64<8>& a, uint64_t x) |
250 | { |
251 | __m512i val = a.native(); |
252 | uint64<2> val128 = _mm512_extracti32x4_epi32(val, id / 2); |
253 | val128 = i_insert<id % 2>(val128, x); |
254 | return _mm512_inserti32x4(val, val128.native(), id / 2); |
255 | } |
256 | #endif |
257 | |
258 | // ----------------------------------------------------------------------------- |
259 | |
260 | template<unsigned id> SIMDPP_INL |
261 | float32x4 i_insert(const float32x4& a, float x) |
262 | { |
263 | #if SIMDPP_USE_NEON_FLT_SP |
264 | return vsetq_lane_f32(x, a.native(), id); |
265 | #else |
266 | return float32<4>(i_insert<id>(uint32<4>(a), bit_cast<uint32_t>(x))); |
267 | #endif |
268 | } |
269 | |
270 | #if SIMDPP_USE_AVX |
271 | template<unsigned id> SIMDPP_INL |
272 | float32<8> i_insert(const float32<8>& a, float x) |
273 | { |
274 | __m256 val = a.native(); |
275 | float32<4> val128 = _mm256_extractf128_ps(val, id / 4); |
276 | val128 = i_insert<id % 4>(val128, x); |
277 | return _mm256_insertf128_ps(val, val128.native(), id / 4); |
278 | } |
279 | #endif |
280 | |
281 | #if SIMDPP_USE_AVX512F |
282 | template<unsigned id> SIMDPP_INL |
283 | float32<16> i_insert(const float32<16>& a, float x) |
284 | { |
285 | __m512 val = a.native(); |
286 | float32<4> val128 = _mm512_extractf32x4_ps(val, id / 4); |
287 | val128 = i_insert<id % 4>(val128, x); |
288 | return _mm512_insertf32x4(val, val128.native(), id / 4); |
289 | } |
290 | #endif |
291 | |
292 | // ----------------------------------------------------------------------------- |
293 | |
294 | template<unsigned id> SIMDPP_INL |
295 | float64x2 i_insert(const float64x2& a, double x) |
296 | { |
297 | return float64<2>(i_insert<id>(uint64<2>(a), bit_cast<int64_t>(x))); |
298 | } |
299 | |
300 | #if SIMDPP_USE_AVX |
301 | template<unsigned id> SIMDPP_INL |
302 | float64<4> i_insert(const float64<4>& a, double x) |
303 | { |
304 | __m256d val = a.native(); |
305 | float64<2> val128 = _mm256_extractf128_pd(val, id / 2); |
306 | val128 = i_insert<id % 2>(val128, x); |
307 | return _mm256_insertf128_pd(val, val128.native(), id / 2); |
308 | } |
309 | #endif |
310 | |
311 | #if SIMDPP_USE_AVX512F |
312 | template<unsigned id> SIMDPP_INL |
313 | float64<8> i_insert(const float64<8>& a, double x) |
314 | { |
315 | __m512 val = _mm512_castpd_ps(a.native()); |
316 | float64<2> val128 = _mm_castps_pd(_mm512_extractf32x4_ps(val, id / 2)); |
317 | val128 = i_insert<id % 2>(val128, x); |
318 | return _mm512_castps_pd(_mm512_insertf32x4(val, _mm_castpd_ps(val128.native()), id / 2)); |
319 | } |
320 | #endif |
321 | |
322 | // ----------------------------------------------------------------------------- |
323 | |
324 | template<unsigned id, class V, class E> SIMDPP_INL |
325 | V i_insert(const V& ca, E el) |
326 | { |
327 | V a = ca; |
328 | typename V::base_vector_type base = a.vec(id / V::base_length); |
329 | base = i_insert<id % V::base_length>(base, (typename V::element_type) el); |
330 | a.vec(id / V::base_length) = base; |
331 | return a; |
332 | } |
333 | |
334 | } // namespace insn |
335 | } // namespace detail |
336 | } // namespace SIMDPP_ARCH_NAMESPACE |
337 | } // namespace simdpp |
338 | |
339 | #endif |
340 | |