/*  Copyright (C) 2013-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_BROADCAST_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_BROADCAST_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif
#include <simdpp/types.h>
#include <simdpp/core/bit_or.h>
#include <simdpp/core/move_l.h> // for move16_l / move8_l used in the AVX2 paths
#include <simdpp/core/permute2.h>
#include <simdpp/core/permute4.h>
#include <simdpp/core/permute_bytes16.h>
#include <simdpp/core/zip_lo.h>
#include <simdpp/core/zip_hi.h>
#include <simdpp/detail/null/shuffle.h>
#include <simdpp/detail/shuffle/shuffle_mask.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

// Forward declarations: several i_splat16 implementations below are expressed
// in terms of i_splat8.
template<unsigned s> SIMDPP_INL
uint16x8 i_splat8(const uint16x8& a);
#if SIMDPP_USE_AVX2
template<unsigned s> SIMDPP_INL
uint16x16 i_splat8(const uint16x16& a);
#endif
#if SIMDPP_USE_AVX512BW
template<unsigned s> SIMDPP_INL
uint16<32> i_splat8(const uint16<32>& a);
#endif

// -----------------------------------------------------------------------------

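// i_splat16: broadcasts the 8-bit element at position s of each 16-element
// (128-bit) block to all positions of that block.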
template<unsigned s> SIMDPP_INL
uint8x16 i_splat16(const uint8x16& ca)
{
    uint8<16> a = ca;
    static_assert(s < 16, "Access out of bounds");
#if SIMDPP_USE_NULL
    return detail::null::splat<s>(a);
#elif SIMDPP_USE_AVX2
    a = move16_l<s>(a);
    return _mm_broadcastb_epi8(a.native());
#elif SIMDPP_USE_SSSE3
    uint8x16 mask = make_shuffle_bytes16_mask<s,s,s,s,s,s,s,s,
                                              s,s,s,s,s,s,s,s>(mask);
    return permute_bytes16(a, mask);
#elif SIMDPP_USE_SSE2
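    // SSE2 has no byte shuffle: duplicate the selected byte into both halves
    // of its 16-bit lane using shifts and OR, then broadcast that 16-bit lane.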
    __m128i n1, n2;

    if (s % 2 == 1) {
        n1 = _mm_srli_epi16(a.native(), 8);
        n2 = _mm_slli_epi16(n1, 8);
    } else {
        n1 = _mm_slli_epi16(a.native(), 8);
        n2 = _mm_srli_epi16(n1, 8);
    }
    uint16x8 b = _mm_or_si128(n1, n2);
    return (uint8<16>) i_splat8<s/2>(b);
#elif SIMDPP_USE_NEON64
    return vdupq_laneq_u8(a.native(), s);
#elif SIMDPP_USE_NEON
    // the lane index is clamped for the same reason as described in i_splat4 below
    if (s < 8) {
        uint8x8_t z = vget_low_u8(a.native());
        return (uint8x16_t) vdupq_lane_u8(z, (s < 8 ? s : 0));
    } else {
        uint8x8_t z = vget_high_u8(a.native());
        return (uint8x16_t) vdupq_lane_u8(z, (s < 8 ? 0 : s-8));
    }
#elif SIMDPP_USE_ALTIVEC
    return vec_splat(a.native(), s);
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_splat_b((v16i8) a.native(), s);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned s> SIMDPP_INL
uint8x32 i_splat16(const uint8x32& a)
{
    static_assert(s < 16, "Access out of bounds");
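    // Interleaving the vector with itself widens each byte into a 16-bit
    // element; the selected byte lands in 16-bit element s%8 of each 128-bit lane.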
    uint16x16 b;
    b = s < 8 ? zip16_lo(a, a) : zip16_hi(a, a);
    return (uint8x32) i_splat8<s%8>(b);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned s> SIMDPP_INL
uint8<64> i_splat16(const uint8<64>& a)
{
    static_assert(s < 16, "Access out of bounds");
    uint16<32> b;
    b = s < 8 ? zip16_lo(a, a) : zip16_hi(a, a);
    return (uint8<64>) i_splat8<s%8>(b);
}
#endif

template<unsigned s, unsigned N> SIMDPP_INL
uint8<N> i_splat16(const uint8<N>& a)
{
    static_assert(s < 16, "Access out of bounds");
    SIMDPP_VEC_ARRAY_IMPL1(uint8<N>, i_splat16<s>, a);
}

// -----------------------------------------------------------------------------

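// i_splat8: broadcasts the 16-bit element at position s of each 8-element
// (128-bit) block to all positions of that block.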
template<unsigned s> SIMDPP_INL
uint16x8 i_splat8(const uint16x8& a)
{
    static_assert(s < 8, "Access out of bounds");
#if SIMDPP_USE_NULL
    return detail::null::splat<s>(a);
#elif SIMDPP_USE_AVX2
    uint16<8> b = move8_l<s>(a);
    return _mm_broadcastw_epi16(b.native());
#elif SIMDPP_USE_SSSE3
    uint16x8 mask = make_shuffle_bytes16_mask<s,s,s,s,s,s,s,s>(mask);
    return permute_bytes16(a, mask);
#elif SIMDPP_USE_SSE2
    // s2 is needed because a static_assert would fire in the branch we don't take
    uint64x2 b;
    if (s < 4) {
        const unsigned s2 = s < 4 ? s : s-4;
        b = _mm_shufflelo_epi16(a.native(), _MM_SHUFFLE(s2,s2,s2,s2));
        return (uint16<8>) permute2<0,0>(b);
    } else {
        const unsigned s2 = s < 4 ? s : s-4;
        b = _mm_shufflehi_epi16(a.native(), _MM_SHUFFLE(s2,s2,s2,s2));
        return (uint16<8>) permute2<1,1>(b);
    }
#elif SIMDPP_USE_NEON64
    return vdupq_laneq_u16(a.native(), s);
#elif SIMDPP_USE_NEON
    if (s < 4) {
        uint16x4_t z = vget_low_u16(a.native());
        return (uint16x8_t) vdupq_lane_u16(z, (s < 4 ? s : 0));
    } else {
        uint16x4_t z = vget_high_u16(a.native());
        return (uint16x8_t) vdupq_lane_u16(z, (s < 4 ? 0 : s-4));
    }
#elif SIMDPP_USE_ALTIVEC
    return vec_splat(a.native(), s);
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_splat_h((v8i16) a.native(), s);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned s> SIMDPP_INL
uint16x16 i_splat8(const uint16x16& a)
{
    static_assert(s < 8, "Access out of bounds");
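    // Duplicate the element within its 64-bit half using shufflelo/shufflehi,
    // then copy that half across each 128-bit lane with permute2.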
    if (s < 4) {
        const unsigned q = (s < 4) ? s : 0;
        uint64x4 h = _mm256_shufflelo_epi16(a.native(), SIMDPP_SHUFFLE_MASK_4x4(q, q, q, q));
        h = permute2<0,0>(h);
        return uint16x16(h);
    } else {
        const unsigned q = (s < 4) ? 0 : s - 4;
        uint64x4 h = _mm256_shufflehi_epi16(a.native(), SIMDPP_SHUFFLE_MASK_4x4(q, q, q, q));
        h = permute2<1,1>(h);
        return uint16x16(h);
    }
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned s> SIMDPP_INL
uint16<32> i_splat8(const uint16<32>& a)
{
    static_assert(s < 8, "Access out of bounds");
    uint64<8> r;
    if (s < 4) {
        const unsigned q = (s < 4) ? s : 0;
        r = _mm512_shufflelo_epi16(a.native(), SIMDPP_SHUFFLE_MASK_4x4(q, q, q, q));
        r = permute2<0,0>(r);
    } else {
        const unsigned q = (s < 4) ? 0 : s - 4;
        r = _mm512_shufflehi_epi16(a.native(), SIMDPP_SHUFFLE_MASK_4x4(q, q, q, q));
        r = permute2<1,1>(r);
    }
    return uint16<32>(r);
}
#endif

template<unsigned s, unsigned N> SIMDPP_INL
uint16<N> i_splat8(const uint16<N>& a)
{
    static_assert(s < 8, "Access out of bounds");
    SIMDPP_VEC_ARRAY_IMPL1(uint16<N>, i_splat8<s>, a);
}

// -----------------------------------------------------------------------------

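// i_splat4: broadcasts the 32-bit element at position s of each 4-element
// (128-bit) block to all positions of that block.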
template<unsigned s> SIMDPP_INL
uint32x4 i_splat4(const uint32x4& a)
{
    static_assert(s < 4, "Access out of bounds");
#if SIMDPP_USE_NULL
    return detail::null::splat<s>(a);
#elif SIMDPP_USE_SSE2
    return permute4<s,s,s,s>(a);
#elif SIMDPP_USE_NEON64
    return vdupq_laneq_u32(a.native(), s);
#elif SIMDPP_USE_NEON
    if (s < 2) {
        uint32x2_t z = vget_low_u32(a.native());
        // Clang implements vdupq_lane_u32 as a macro, thus we must never
        // supply it with s>=2, even if we know the branch will never be executed
        return (uint32x4_t) vdupq_lane_u32(z, (s < 2 ? s : 0));
    } else {
        uint32x2_t z = vget_high_u32(a.native());
        return (uint32x4_t) vdupq_lane_u32(z, (s < 2 ? 0 : s-2));
    }
#elif SIMDPP_USE_ALTIVEC
    return vec_splat(a.native(), s);
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_splat_w((v4i32) a.native(), s);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned s> SIMDPP_INL
uint32x8 i_splat4(const uint32x8& a)
{
    static_assert(s < 4, "Access out of bounds");
    return permute4<s,s,s,s>(a);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned s> SIMDPP_INL
uint32<16> i_splat4(const uint32<16>& a)
{
    static_assert(s < 4, "Access out of bounds");
    return permute4<s,s,s,s>(a);
}
#endif

template<unsigned s, unsigned N> SIMDPP_INL
uint32<N> i_splat4(const uint32<N>& a)
{
    static_assert(s < 4, "Access out of bounds");
    SIMDPP_VEC_ARRAY_IMPL1(uint32<N>, i_splat4<s>, a);
}

// -----------------------------------------------------------------------------

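// i_splat2: broadcasts the 64-bit element at position s of each 2-element
// (128-bit) block to all positions of that block.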
template<unsigned s> SIMDPP_INL
uint64x2 i_splat2(const uint64x2& a)
{
    static_assert(s < 2, "Access out of bounds");
#if SIMDPP_USE_SSE2
    if (s == 0) {
        return permute2<0,0>(a);
    } else {
        return permute2<1,1>(a);
    }
#elif SIMDPP_USE_NEON64
    return vdupq_laneq_u64(a.native(), s);
#elif SIMDPP_USE_NEON
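    // select the requested 64-bit half, then duplicate it across the vector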
    uint64x1_t z;
    if (s == 0) {
        z = vget_low_u64(a.native());
    } else {
        z = vget_high_u64(a.native());
    }
    return (uint64x2_t) vdupq_lane_u64(z, 0);
#elif SIMDPP_USE_VSX_207
    return vec_splat(a.native(), s);
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_splat_d((v2i64) a.native(), s);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return detail::null::splat<s>(a);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned s> SIMDPP_INL
uint64x4 i_splat2(const uint64x4& a)
{
    static_assert(s < 2, "Access out of bounds");
    return permute2<s,s>(a);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned s> SIMDPP_INL
uint64<8> i_splat2(const uint64<8>& a)
{
    static_assert(s < 2, "Access out of bounds");
    return permute2<s,s>(a);
}
#endif

template<unsigned s, unsigned N> SIMDPP_INL
uint64<N> i_splat2(const uint64<N>& a)
{
    static_assert(s < 2, "Access out of bounds");
    SIMDPP_VEC_ARRAY_IMPL1(uint64<N>, i_splat2<s>, a);
}

// -----------------------------------------------------------------------------

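// i_splat4 (float32): same semantics as the 32-bit integer version above,
// applied per 128-bit block of a float32 vector.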
template<unsigned s> SIMDPP_INL
float32x4 i_splat4(const float32x4& a)
{
    static_assert(s < 4, "Access out of bounds");
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    return detail::null::splat<s>(a);
#elif SIMDPP_USE_SSE2
    return permute4<s,s,s,s>(a);
#elif SIMDPP_USE_NEON64
    return vdupq_laneq_f32(a.native(), s);
#elif SIMDPP_USE_NEON
    if (s < 2) {
        float32x2_t z = vget_low_f32(a.native());
        // Clang implements vdupq_lane_f32 as a macro, thus we must never
        // supply it with s>=2, even if we know the branch will never be executed
        return (float32x4_t) vdupq_lane_f32(z, (s < 2 ? s : 0));
    } else {
        float32x2_t z = vget_high_f32(a.native());
        return (float32x4_t) vdupq_lane_f32(z, (s < 2 ? 0 : s-2));
    }
#elif SIMDPP_USE_MSA
    return (v4f32) __msa_splat_w((v4i32) a.native(), s);
#elif SIMDPP_USE_ALTIVEC
    return vec_splat(a.native(), s);
#endif
}

#if SIMDPP_USE_AVX
template<unsigned s> SIMDPP_INL
float32x8 i_splat4(const float32x8& a)
{
    static_assert(s < 4, "Access out of bounds");
    return permute4<s,s,s,s>(a);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned s> SIMDPP_INL
float32<16> i_splat4(const float32<16>& a)
{
    static_assert(s < 4, "Access out of bounds");
    return permute4<s,s,s,s>(a);
}
#endif

template<unsigned s, unsigned N> SIMDPP_INL
float32<N> i_splat4(const float32<N>& a)
{
    static_assert(s < 4, "Access out of bounds");
    SIMDPP_VEC_ARRAY_IMPL1(float32<N>, i_splat4<s>, a);
}

// -----------------------------------------------------------------------------

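// i_splat2 (float64): same semantics as the 64-bit integer version above,
// applied per 128-bit block of a float64 vector.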
template<unsigned s> SIMDPP_INL
float64x2 i_splat2(const float64x2& a)
{
    static_assert(s < 2, "Access out of bounds");
#if SIMDPP_USE_SSE2
    return permute2<s,s>(a);
#elif SIMDPP_USE_NEON64
    return vdupq_laneq_f64(a.native(), s);
#elif SIMDPP_USE_VSX_206
    return vec_splat(a.native(), s);
#elif SIMDPP_USE_MSA
    return (v2f64) __msa_splat_d((v2i64) a.native(), s);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC || SIMDPP_USE_NEON
    return detail::null::splat<s>(a);
#endif
}

#if SIMDPP_USE_AVX
template<unsigned s> SIMDPP_INL
float64x4 i_splat2(const float64x4& a)
{
    static_assert(s < 2, "Access out of bounds");
    return permute2<s,s>(a);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned s> SIMDPP_INL
float64<8> i_splat2(const float64<8>& a)
{
    static_assert(s < 2, "Access out of bounds");
    return permute2<s,s>(a);
}
#endif

template<unsigned s, unsigned N> SIMDPP_INL
float64<N> i_splat2(const float64<N>& a)
{
    static_assert(s < 2, "Access out of bounds");
    SIMDPP_VEC_ARRAY_IMPL1(float64<N>, i_splat2<s>, a);
}


} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif