1/* Copyright (C) 2011-2017 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_EXTRACT_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_EXTRACT_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/core/cast.h>
17#include <simdpp/core/move_l.h>
18#include <simdpp/core/i_shift_l.h>
19#include <simdpp/core/i_sub.h>
20#include <simdpp/core/make_int.h>
21#include <simdpp/detail/insn/split.h>
22#include <simdpp/detail/mem_block.h>
23
24namespace simdpp {
25namespace SIMDPP_ARCH_NAMESPACE {
26namespace detail {
27namespace insn {
28
// Extracts the id-th 8-bit element from a 128-bit vector.
template<unsigned id> SIMDPP_INL
uint8_t i_extract(const uint8<16>& a)
{
#if SIMDPP_USE_NULL
    return a.el(id);
#elif SIMDPP_USE_SSE4_1
    // Explicit cast is needed due to bug in Clang headers (intrinsic
    // implemented as a macro with no appropriate casts) and a bug in Clang
    // (thinks explicit conversion operators have the same rank as the regular
    // ones)
    return _mm_extract_epi8(a.native(), id);
#elif SIMDPP_USE_SSE2
    // SSE2 has no byte extract: read the 16-bit lane containing the byte
    // and shift the wanted byte down; the uint8_t return truncates the rest
    unsigned shift = (id % 2 == 1) ? 8 : 0;
    return _mm_extract_epi16(a.native(), id/2) >> shift;
#elif SIMDPP_USE_NEON
    return vgetq_lane_u8(a.native(), id);
#elif SIMDPP_USE_ALTIVEC
    // go through memory: store the element with vec_ste, then load it back
    detail::mem_block<uint8x16> ax(a);
    vec_ste(a.native(), 0, &ax[id]);
    return ax[id];
#elif SIMDPP_USE_MSA
    return __msa_copy_u_b((v16i8) a.native(), id);
#endif
}
53
#if SIMDPP_USE_AVX2
// Extracts the id-th 8-bit element from a 256-bit vector.
template<unsigned id> SIMDPP_INL
uint8_t i_extract(const uint8<32>& a)
{
    // Select the 128-bit half holding the element, then extract within it.
    return _mm_extract_epi8(_mm256_extracti128_si256(a.native(), id / 16),
                            id % 16);
}
#endif
62
#if SIMDPP_USE_AVX512BW
// Extracts the id-th 8-bit element from a 512-bit vector.
template<unsigned id> SIMDPP_INL
uint8_t i_extract(const uint8<64>& a)
{
    // Reduce to the 128-bit lane holding the element, then extract within it.
    return _mm_extract_epi8(_mm512_extracti32x4_epi32(a.native(), id / 16),
                            id % 16);
}
#endif
71
72// -----------------------------------------------------------------------------
73
// Extracts the id-th 8-bit element from a 128-bit signed vector.
template<unsigned id> SIMDPP_INL
int8_t i_extract(const int8<16>& a)
{
#if SIMDPP_USE_MSA
    // MSA provides a dedicated signed element copy
    return __msa_copy_s_b(a.native(), id);
#else
    // bit pattern is identical to the unsigned case; reuse it
    return i_extract<id>(uint8x16(a));
#endif
}
83
#if SIMDPP_USE_AVX2
// Extracts the id-th 8-bit element from a 256-bit signed vector.
template<unsigned id> SIMDPP_INL
int8_t i_extract(const int8<32>& a)
{
    // Select the 128-bit half holding the element, then extract within it.
    return _mm_extract_epi8(_mm256_extracti128_si256(a.native(), id / 16),
                            id % 16);
}
#endif
92
#if SIMDPP_USE_AVX512BW
// Extracts the id-th 8-bit element from a 512-bit signed vector.
template<unsigned id> SIMDPP_INL
int8_t i_extract(const int8<64>& a)
{
    // Reduce to the 128-bit lane holding the element, then extract within it.
    return _mm_extract_epi8(_mm512_extracti32x4_epi32(a.native(), id / 16),
                            id % 16);
}
#endif
101
102// -----------------------------------------------------------------------------
103
// Extracts the id-th 16-bit element from a 128-bit vector.
template<unsigned id> SIMDPP_INL
uint16_t i_extract(const uint16<8>& a)
{
#if SIMDPP_USE_NULL
    return a.el(id);
#elif SIMDPP_USE_SSE2
    return _mm_extract_epi16(a.native(), id);
#elif SIMDPP_USE_NEON
    return vgetq_lane_u16(a.native(), id);
#elif SIMDPP_USE_ALTIVEC
    // go through memory: store the element with vec_ste, then load it back
    detail::mem_block<uint16x8> ax(a);
    vec_ste(a.native(), 0, &ax[id]);
    return ax[id];
#elif SIMDPP_USE_MSA
    return __msa_copy_u_h((v8i16) a.native(), id);
#endif
}
121
#if SIMDPP_USE_AVX2
// Extracts the id-th 16-bit element from a 256-bit vector.
template<unsigned id> SIMDPP_INL
uint16_t i_extract(const uint16<16>& a)
{
    // Select the 128-bit half holding the element, then extract within it.
    return _mm_extract_epi16(_mm256_extracti128_si256(a.native(), id / 8),
                             id % 8);
}
#endif
130
#if SIMDPP_USE_AVX512BW
// Extracts the id-th 16-bit element from a 512-bit vector.
template<unsigned id> SIMDPP_INL
uint16_t i_extract(const uint16<32>& a)
{
    // Reduce to the 128-bit lane holding the element, then extract within it.
    return _mm_extract_epi16(_mm512_extracti32x4_epi32(a.native(), id / 8),
                             id % 8);
}
#endif
139
140// -----------------------------------------------------------------------------
141
// Extracts the id-th 16-bit element from a 128-bit signed vector.
template<unsigned id> SIMDPP_INL
int16_t i_extract(const int16<8>& a)
{
#if SIMDPP_USE_MSA
    // MSA provides a dedicated signed element copy
    return __msa_copy_s_h(a.native(), id);
#else
    // bit pattern is identical to the unsigned case; reuse it
    return i_extract<id>(uint16x8(a));
#endif
}
151
#if SIMDPP_USE_AVX2
// Extracts the id-th 16-bit element from a 256-bit signed vector.
template<unsigned id> SIMDPP_INL
int16_t i_extract(const int16<16>& a)
{
    // Select the 128-bit half holding the element, then extract within it.
    return _mm_extract_epi16(_mm256_extracti128_si256(a.native(), id / 8),
                             id % 8);
}
#endif
160
#if SIMDPP_USE_AVX512BW
// Extracts the id-th 16-bit element from a 512-bit signed vector.
template<unsigned id> SIMDPP_INL
int16_t i_extract(const int16<32>& a)
{
    // Reduce to the 128-bit lane holding the element, then extract within it.
    return _mm_extract_epi16(_mm512_extracti32x4_epi32(a.native(), id / 8),
                             id % 8);
}
#endif
169
170// -----------------------------------------------------------------------------
171
// Extracts the id-th 32-bit element from a 128-bit vector.
template<unsigned id> SIMDPP_INL
uint32_t i_extract(const uint32<4>& a)
{
#if SIMDPP_USE_NULL
    return a.el(id);
#elif SIMDPP_USE_SSE4_1
    return _mm_extract_epi32(a.native(), id);
#elif SIMDPP_USE_SSE2
    // shift the wanted element into position 0 and read the low 32 bits;
    // when id==0, move_l is template-specialized and does nothing
    return _mm_cvtsi128_si32(move4_l<id>(a).eval().native());
#elif SIMDPP_USE_NEON
    return vgetq_lane_u32(a.native(), id);
#elif SIMDPP_USE_ALTIVEC
    // go through memory: store the element with vec_ste, then load it back
    detail::mem_block<uint32x4> ax(a);
    vec_ste(a.native(), 0, &ax[id]);
    return ax[id];
#elif SIMDPP_USE_MSA
    return __msa_copy_u_w((v4i32) a.native(), id);
#endif
}
192
#if SIMDPP_USE_AVX2
// Extracts the id-th 32-bit element from a 256-bit vector.
template<unsigned id> SIMDPP_INL
uint32_t i_extract(const uint32<8>& a)
{
    // Select the 128-bit half holding the element, then extract within it.
    return _mm_extract_epi32(_mm256_extracti128_si256(a.native(), id / 4),
                             id % 4);
}
#endif
201
#if SIMDPP_USE_AVX512F
// Extracts the id-th 32-bit element from a 512-bit vector.
template<unsigned id> SIMDPP_INL
uint32_t i_extract(const uint32<16>& a)
{
    // Reduce to the 128-bit lane holding the element, then extract within it.
    return _mm_extract_epi32(_mm512_extracti32x4_epi32(a.native(), id / 4),
                             id % 4);
}
#endif
210
211// -----------------------------------------------------------------------------
212
// Extracts the id-th 32-bit element from a 128-bit signed vector.
template<unsigned id> SIMDPP_INL
int32_t i_extract(const int32<4>& a)
{
#if SIMDPP_USE_MSA
    // MSA provides a dedicated signed element copy
    return __msa_copy_s_w(a.native(), id);
#else
    // bit pattern is identical to the unsigned case; reuse it
    return i_extract<id>(uint32x4(a));
#endif
}
222
#if SIMDPP_USE_AVX2
// Extracts the id-th 32-bit element from a 256-bit signed vector.
template<unsigned id> SIMDPP_INL
int32_t i_extract(const int32<8>& a)
{
    // Select the 128-bit half holding the element, then extract within it.
    return _mm_extract_epi32(_mm256_extracti128_si256(a.native(), id / 4),
                             id % 4);
}
#endif
231
#if SIMDPP_USE_AVX512F
// Extracts the id-th 32-bit element from a 512-bit signed vector.
template<unsigned id> SIMDPP_INL
int32_t i_extract(const int32<16>& a)
{
    // Reduce to the 128-bit lane holding the element, then extract within it.
    return _mm_extract_epi32(_mm512_extracti32x4_epi32(a.native(), id / 4),
                             id % 4);
}
#endif
240
241// -----------------------------------------------------------------------------
242
// Extracts the id-th 64-bit element from a 128-bit vector.
template<unsigned id> SIMDPP_INL
uint64_t i_extract(const uint64<2>& a)
{
#if SIMDPP_USE_NULL
    return a.el(id);
#elif SIMDPP_USE_SSE4_1
#if SIMDPP_32_BITS
    // _mm_extract_epi64 is not available on 32-bit targets: assemble the
    // result from two 32-bit extracts
    uint32x4 t = uint32x4(a);
    uint64_t r = i_extract<id*2>(t);
    r |= uint64_t(i_extract<id*2+1>(t)) << 32;
    return r;
#else
    return _mm_extract_epi64(a.native(), id);
#endif
#elif SIMDPP_USE_SSE2
#if SIMDPP_32_BITS
    // 32-bit SSE2: shift each 32-bit half down to position 0 in turn and
    // combine them into a 64-bit result
    uint32x4 t = uint32x4(a);
    uint64_t r = 0;
    t = move4_l<id*2>(t); // when id==0, move_l is template-specialized and does nothing
    r = i_extract<0>(t);
    t = move4_l<1>(t);
    r |= uint64_t(i_extract<0>(t)) << 32;
    return r;
#else
    // shift the wanted element into position 0, then read the low 64 bits
    uint64x2 t = a;
    if (id != 0) {
        t = move2_l<id>(t);
    }
    return _mm_cvtsi128_si64(t.native());
#endif
#elif SIMDPP_USE_NEON
    return vgetq_lane_u64(a.native(), id);
#elif SIMDPP_USE_ALTIVEC
    // no 64-bit element extract: go through memory
    detail::mem_block<uint64x2> ax(a);
    return ax[id];
#elif SIMDPP_USE_MSA
#if SIMDPP_64_BITS
    return __msa_copy_u_d((v2i64) a.native(), id);
#else
    // 32-bit MSA: combine the two zero-extended 32-bit halves
    v4i32 a32 = (v4i32) a.native();
    uint64_t lo = __msa_copy_u_w(a32, id*2);
    uint64_t hi = __msa_copy_u_w(a32, id*2+1);
    return lo | (hi << 32);
#endif
#endif
}
289
#if SIMDPP_USE_AVX2
// Extracts the id-th 64-bit element from a 256-bit vector.
template<unsigned id> SIMDPP_INL
uint64_t i_extract(const uint64<4>& a)
{
    // Select the 128-bit half holding the element, then extract within it.
    uint64<2> half = _mm256_extracti128_si256(a.native(), id / 2);
    return i_extract<id % 2>(half);
}
#endif
298
#if SIMDPP_USE_AVX512F
// Extracts the id-th 64-bit element from a 512-bit vector.
template<unsigned id> SIMDPP_INL
uint64_t i_extract(const uint64<8>& a)
{
    // Reduce to the 128-bit lane holding the element, then extract within it.
    uint64<2> lane = _mm512_extracti32x4_epi32(a.native(), id / 2);
    return i_extract<id % 2>(lane);
}
#endif
307
308// -----------------------------------------------------------------------------
309
310template<unsigned id> SIMDPP_INL
311int64_t i_extract(const int64<2>& a)
312{
313#if SIMDPP_USE_MSA
314#if SIMDPP_64_BITS
315 return __msa_copy_s_d(a, id);
316#else
317 v4i32 a32 = (v4i32) a.native();
318 int64_t lo = __msa_copy_s_w(a32, id*2);
319 int64_t hi = __msa_copy_s_w(a32, id*2+1);
320 return lo | (hi << 32);
321#endif
322#else
323 return i_extract<id>(uint64x2(a));
324#endif
325}
326
#if SIMDPP_USE_AVX2
// Extracts the id-th 64-bit element from a 256-bit signed vector.
template<unsigned id> SIMDPP_INL
int64_t i_extract(const int64<4>& a)
{
    // Select the 128-bit half holding the element, then extract within it.
    uint64<2> half = _mm256_extracti128_si256(a.native(), id / 2);
    return i_extract<id % 2>(half);
}
#endif
335
#if SIMDPP_USE_AVX512F
// Extracts the id-th 64-bit element from a 512-bit signed vector.
template<unsigned id> SIMDPP_INL
int64_t i_extract(const int64<8>& a)
{
    // Reduce to the 128-bit lane holding the element, then extract within it.
    uint64<2> lane = _mm512_extracti32x4_epi32(a.native(), id / 2);
    return i_extract<id % 2>(lane);
}
#endif
344
345// -----------------------------------------------------------------------------
346
// Extracts the id-th single-precision element from a 128-bit vector.
template<unsigned id> SIMDPP_INL
float i_extract(const float32<4>& a)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    return a.el(id);
#elif SIMDPP_USE_SSE2
    // extract the raw bits via the integer path and reinterpret as float
    return bit_cast<float>(i_extract<id>(int32x4(a)));
#elif SIMDPP_USE_NEON
    return vgetq_lane_f32(a.native(), id);
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // go through memory
    detail::mem_block<float32x4> ax(a);
    return ax[id];
#endif
}
361
#if SIMDPP_USE_AVX
// Extracts the id-th single-precision element from a 256-bit vector.
template<unsigned id> SIMDPP_INL
float i_extract(const float32<8>& a)
{
    // Select the 128-bit half, view it as integers, extract the element's
    // raw bits and reinterpret them as float.
    __m128i bits = _mm_castps_si128(_mm256_extractf128_ps(a.native(), id / 4));
    return bit_cast<float>(_mm_extract_epi32(bits, id % 4));
}
#endif
370
#if SIMDPP_USE_AVX512F
// Extracts the id-th single-precision element from a 512-bit vector.
template<unsigned id> SIMDPP_INL
float i_extract(const float32<16>& a)
{
    // Reduce to the 128-bit lane, view it as integers, extract the element's
    // raw bits and reinterpret them as float.
    __m128i bits = _mm_castps_si128(_mm512_extractf32x4_ps(a.native(), id / 4));
    return bit_cast<float>(_mm_extract_epi32(bits, id % 4));
}
#endif
379
380// -----------------------------------------------------------------------------
381
// Extracts the id-th double-precision element from a 128-bit vector.
template<unsigned id> SIMDPP_INL
double i_extract(const float64<2>& a)
{
#if SIMDPP_USE_NULL
    return a.el(id);
#elif SIMDPP_USE_SSE2
    // extract the raw bits via the integer path and reinterpret as double
    return bit_cast<double>(i_extract<id>(int64x2(a)));
#elif SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // go through memory
    detail::mem_block<float64x2> ax(a);
    return ax[id];
#elif SIMDPP_USE_NEON64
    return vgetq_lane_f64(a.native(), id);
#endif
}
396
#if SIMDPP_USE_AVX
// Extracts the id-th double-precision element from a 256-bit vector.
template<unsigned id> SIMDPP_INL
double i_extract(const float64<4>& a)
{
    // Select the 128-bit half, extract the element's raw 64 bits via the
    // integer path and reinterpret them as double.
    uint64<2> bits = _mm_castpd_si128(_mm256_extractf128_pd(a.native(), id / 2));
    return bit_cast<double>(i_extract<id % 2>(bits));
}
#endif
405
#if SIMDPP_USE_AVX512F
// Extracts the id-th double-precision element from a 512-bit vector.
template<unsigned id> SIMDPP_INL
double i_extract(const float64<8>& a)
{
    // Reduce to the 128-bit lane, extract the element's raw 64 bits via the
    // integer path and reinterpret them as double.
    uint64<2> bits = _mm_castps_si128(
        _mm512_extractf32x4_ps(_mm512_castpd_ps(a.native()), id / 2));
    return bit_cast<double>(i_extract<id % 2>(bits));
}
#endif
414
415// -----------------------------------------------------------------------------
416
417template<unsigned id, class V> SIMDPP_INL
418typename V::element_type i_extract(const V& a)
419{
420 typename V::base_vector_type base = a.vec(id / V::base_length);
421 return i_extract<id % V::base_length>(base);
422}
423
424} // namespace insn
425} // namespace detail
426} // namespace SIMDPP_ARCH_NAMESPACE
427} // namespace simdpp
428
429#endif
430