/*  Copyright (C) 2011-2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_EXTRACT_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_EXTRACT_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/cast.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/i_shift_l.h>
#include <simdpp/core/i_sub.h>
#include <simdpp/core/make_int.h>
#include <simdpp/detail/insn/split.h>
#include <simdpp/detail/mem_block.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

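// The i_extract functions return the element at compile-time position id of
// a vector, using the cheapest instruction sequence available on the enabled
// instruction set. The public entry point that forwards here is
// simdpp::extract (see simdpp/core/extract.h).
//
// A minimal usage sketch (hypothetical values; make_uint repeats its
// argument pattern across all elements):
//
//     uint8<16> v = make_uint(0, 1, 2, 3);
//     uint8_t x = i_extract<2>(v); // x == 2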
template<unsigned id> SIMDPP_INL
uint8_t i_extract(const uint8<16>& a)
{
#if SIMDPP_USE_NULL
    return a.el(id);
#elif SIMDPP_USE_SSE4_1
    // Explicit cast is needed due to bug in Clang headers (intrinsic
    // implemented as a macro with no appropriate casts) and a bug in Clang
    // (thinks explicit conversion operators have the same rank as the regular
    // ones)
    return _mm_extract_epi8(a.native(), id);
#elif SIMDPP_USE_SSE2
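    // SSE2 has no byte-granularity extract; pull out the 16-bit element that
    // contains the byte and shift the wanted half into the low 8 bits.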
    unsigned shift = (id % 2 == 1) ? 8 : 0;
    return _mm_extract_epi16(a.native(), id/2) >> shift;
#elif SIMDPP_USE_NEON
    return vgetq_lane_u8(a.native(), id);
#elif SIMDPP_USE_ALTIVEC
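    // No direct extract instruction; go through memory. vec_ste stores the
    // single element of the vector that corresponds to the target address.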
    detail::mem_block<uint8x16> ax(a);
    vec_ste(a.native(), 0, &ax[id]);
    return ax[id];
#elif SIMDPP_USE_MSA
    return __msa_copy_u_b((v16i8) a.native(), id);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned id> SIMDPP_INL
uint8_t i_extract(const uint8<32>& a)
{
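    // Extract the 128-bit half that contains the element, then the byte
    // within that half.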
    __m128i val = _mm256_extracti128_si256(a.native(), id / 16);
    return _mm_extract_epi8(val, id % 16);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned id> SIMDPP_INL
uint8_t i_extract(const uint8<64>& a)
{
    __m128i val = _mm512_extracti32x4_epi32(a.native(), id / 16);
    return _mm_extract_epi8(val, id % 16);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned id> SIMDPP_INL
int8_t i_extract(const int8<16>& a)
{
#if SIMDPP_USE_MSA
    return __msa_copy_s_b(a.native(), id);
#else
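    // The unsigned extract emits the same instructions; reuse it and let the
    // result convert back to int8_t.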
    return i_extract<id>(uint8x16(a));
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned id> SIMDPP_INL
int8_t i_extract(const int8<32>& a)
{
    __m128i val = _mm256_extracti128_si256(a.native(), id / 16);
    return _mm_extract_epi8(val, id % 16);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned id> SIMDPP_INL
int8_t i_extract(const int8<64>& a)
{
    __m128i val = _mm512_extracti32x4_epi32(a.native(), id / 16);
    return _mm_extract_epi8(val, id % 16);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned id> SIMDPP_INL
uint16_t i_extract(const uint16<8>& a)
{
#if SIMDPP_USE_NULL
    return a.el(id);
#elif SIMDPP_USE_SSE2
    return _mm_extract_epi16(a.native(), id);
#elif SIMDPP_USE_NEON
    return vgetq_lane_u16(a.native(), id);
#elif SIMDPP_USE_ALTIVEC
    detail::mem_block<uint16x8> ax(a);
    vec_ste(a.native(), 0, &ax[id]);
    return ax[id];
#elif SIMDPP_USE_MSA
    return __msa_copy_u_h((v8i16) a.native(), id);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned id> SIMDPP_INL
uint16_t i_extract(const uint16<16>& a)
{
    __m128i val = _mm256_extracti128_si256(a.native(), id / 8);
    return _mm_extract_epi16(val, id % 8);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned id> SIMDPP_INL
uint16_t i_extract(const uint16<32>& a)
{
    __m128i val = _mm512_extracti32x4_epi32(a.native(), id / 8);
    return _mm_extract_epi16(val, id % 8);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned id> SIMDPP_INL
int16_t i_extract(const int16<8>& a)
{
#if SIMDPP_USE_MSA
    return __msa_copy_s_h(a.native(), id);
#else
    return i_extract<id>(uint16x8(a));
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned id> SIMDPP_INL
int16_t i_extract(const int16<16>& a)
{
    __m128i val = _mm256_extracti128_si256(a.native(), id / 8);
    return _mm_extract_epi16(val, id % 8);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned id> SIMDPP_INL
int16_t i_extract(const int16<32>& a)
{
    __m128i val = _mm512_extracti32x4_epi32(a.native(), id / 8);
    return _mm_extract_epi16(val, id % 8);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned id> SIMDPP_INL
uint32_t i_extract(const uint32<4>& a)
{
#if SIMDPP_USE_NULL
    return a.el(id);
#elif SIMDPP_USE_SSE4_1
    return _mm_extract_epi32(a.native(), id);
#elif SIMDPP_USE_SSE2
    // when id==0, move_l is template-specialized and does nothing
    return _mm_cvtsi128_si32(move4_l<id>(a).eval().native());
#elif SIMDPP_USE_NEON
    return vgetq_lane_u32(a.native(), id);
#elif SIMDPP_USE_ALTIVEC
    detail::mem_block<uint32x4> ax(a);
    vec_ste(a.native(), 0, &ax[id]);
    return ax[id];
#elif SIMDPP_USE_MSA
    return __msa_copy_u_w((v4i32) a.native(), id);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned id> SIMDPP_INL
uint32_t i_extract(const uint32<8>& a)
{
    __m128i val = _mm256_extracti128_si256(a.native(), id / 4);
    return _mm_extract_epi32(val, id % 4);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned id> SIMDPP_INL
uint32_t i_extract(const uint32<16>& a)
{
    __m128i val = _mm512_extracti32x4_epi32(a.native(), id / 4);
    return _mm_extract_epi32(val, id % 4);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned id> SIMDPP_INL
int32_t i_extract(const int32<4>& a)
{
#if SIMDPP_USE_MSA
    return __msa_copy_s_w(a.native(), id);
#else
    return i_extract<id>(uint32x4(a));
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned id> SIMDPP_INL
int32_t i_extract(const int32<8>& a)
{
    __m128i val = _mm256_extracti128_si256(a.native(), id / 4);
    return _mm_extract_epi32(val, id % 4);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned id> SIMDPP_INL
int32_t i_extract(const int32<16>& a)
{
    __m128i val = _mm512_extracti32x4_epi32(a.native(), id / 4);
    return _mm_extract_epi32(val, id % 4);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned id> SIMDPP_INL
uint64_t i_extract(const uint64<2>& a)
{
#if SIMDPP_USE_NULL
    return a.el(id);
#elif SIMDPP_USE_SSE4_1
#if SIMDPP_32_BITS
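    // _mm_extract_epi64 is unavailable in 32-bit mode; assemble the result
    // from two 32-bit extracts.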
    uint32x4 t = uint32x4(a);
    uint64_t r = i_extract<id*2>(t);
    r |= uint64_t(i_extract<id*2+1>(t)) << 32;
    return r;
#else
    return _mm_extract_epi64(a.native(), id);
#endif
#elif SIMDPP_USE_SSE2
#if SIMDPP_32_BITS
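    // No 64-bit general-purpose registers in 32-bit mode: shift the wanted
    // element to the front of the vector and read it as two 32-bit halves.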
    uint32x4 t = uint32x4(a);
    uint64_t r = 0;
    t = move4_l<id*2>(t); // when id==0, move_l is template-specialized and does nothing
    r = i_extract<0>(t);
    t = move4_l<1>(t);
    r |= uint64_t(i_extract<0>(t)) << 32;
    return r;
#else
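    // Shift the wanted element into the lowest position, then move it to a
    // general-purpose register.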
    uint64x2 t = a;
    if (id != 0) {
        t = move2_l<id>(t);
    }
    return _mm_cvtsi128_si64(t.native());
#endif
#elif SIMDPP_USE_NEON
    return vgetq_lane_u64(a.native(), id);
#elif SIMDPP_USE_ALTIVEC
    detail::mem_block<uint64x2> ax(a);
    return ax[id];
#elif SIMDPP_USE_MSA
#if SIMDPP_64_BITS
    return __msa_copy_u_d((v2i64) a.native(), id);
#else
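    // __msa_copy_u_d is unavailable in 32-bit mode; combine the two 32-bit
    // halves instead.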
    v4i32 a32 = (v4i32) a.native();
    uint64_t lo = __msa_copy_u_w(a32, id*2);
    uint64_t hi = __msa_copy_u_w(a32, id*2+1);
    return lo | (hi << 32);
#endif
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned id> SIMDPP_INL
uint64_t i_extract(const uint64<4>& a)
{
    uint64<2> val = _mm256_extracti128_si256(a.native(), id / 2);
    return i_extract<id % 2>(val);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned id> SIMDPP_INL
uint64_t i_extract(const uint64<8>& a)
{
    uint64<2> val = _mm512_extracti32x4_epi32(a.native(), id / 2);
    return i_extract<id % 2>(val);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned id> SIMDPP_INL
int64_t i_extract(const int64<2>& a)
{
#if SIMDPP_USE_MSA
#if SIMDPP_64_BITS
    return __msa_copy_s_d(a.native(), id);
#else
    v4i32 a32 = (v4i32) a.native();
    int64_t lo = __msa_copy_s_w(a32, id*2);
    int64_t hi = __msa_copy_s_w(a32, id*2+1);
    return lo | (hi << 32);
#endif
#else
    return i_extract<id>(uint64x2(a));
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned id> SIMDPP_INL
int64_t i_extract(const int64<4>& a)
{
    uint64<2> val = _mm256_extracti128_si256(a.native(), id / 2);
    return i_extract<id % 2>(val);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned id> SIMDPP_INL
int64_t i_extract(const int64<8>& a)
{
    uint64<2> val = _mm512_extracti32x4_epi32(a.native(), id / 2);
    return i_extract<id % 2>(val);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned id> SIMDPP_INL
float i_extract(const float32<4>& a)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    return a.el(id);
#elif SIMDPP_USE_SSE2
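    // There is no float extract; extract the bit pattern through the integer
    // path and reinterpret it as float.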
    return bit_cast<float>(i_extract<id>(int32x4(a)));
#elif SIMDPP_USE_NEON
    return vgetq_lane_f32(a.native(), id);
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    detail::mem_block<float32x4> ax(a);
    return ax[id];
#endif
}

#if SIMDPP_USE_AVX
template<unsigned id> SIMDPP_INL
float i_extract(const float32<8>& a)
{
    __m128 val = _mm256_extractf128_ps(a.native(), id / 4);
    return bit_cast<float>(_mm_extract_epi32(_mm_castps_si128(val), id % 4));
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned id> SIMDPP_INL
float i_extract(const float32<16>& a)
{
    __m128 val = _mm512_extractf32x4_ps(a.native(), id / 4);
    return bit_cast<float>(_mm_extract_epi32(_mm_castps_si128(val), id % 4));
}
#endif

// -----------------------------------------------------------------------------

template<unsigned id> SIMDPP_INL
double i_extract(const float64<2>& a)
{
#if SIMDPP_USE_NULL
    return a.el(id);
#elif SIMDPP_USE_SSE2
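    // As with float32, extract the bit pattern through the integer path and
    // reinterpret it as double.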
    return bit_cast<double>(i_extract<id>(int64x2(a)));
#elif SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    detail::mem_block<float64x2> ax(a);
    return ax[id];
#elif SIMDPP_USE_NEON64
    return vgetq_lane_f64(a.native(), id);
#endif
}

#if SIMDPP_USE_AVX
template<unsigned id> SIMDPP_INL
double i_extract(const float64<4>& a)
{
    __m128d val = _mm256_extractf128_pd(a.native(), id / 2);
    return bit_cast<double>(i_extract<id % 2>((uint64<2>)_mm_castpd_si128(val)));
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned id> SIMDPP_INL
double i_extract(const float64<8>& a)
{
    __m128 val = _mm512_extractf32x4_ps(_mm512_castpd_ps(a.native()), id / 2);
    return bit_cast<double>(i_extract<id % 2>((uint64<2>)_mm_castps_si128(val)));
}
#endif

// -----------------------------------------------------------------------------

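// Generic case: for vectors wider than a single native vector, select the
// sub-vector that holds the element, then extract within it. For example, in
// a hypothetical uint32<8> made of two uint32<4> halves, element 6 lives in
// vec(6 / 4) == vec(1) at position 6 % 4 == 2.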
template<unsigned id, class V> SIMDPP_INL
typename V::element_type i_extract(const V& a)
{
    typename V::base_vector_type base = a.vec(id / V::base_length);
    return i_extract<id % V::base_length>(base);
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif