1/* Copyright (C) 2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_SET_SPLAT_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_SET_SPLAT_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/core/load.h>
17#include <simdpp/core/zip_lo.h>
18#include <simdpp/detail/altivec/load1.h>
19
20namespace simdpp {
21namespace SIMDPP_ARCH_NAMESPACE {
22namespace detail {
23namespace insn {
24
// Forward declaration: the SSE2 paths of the 8-bit and 16-bit overloads below
// broadcast by packing the value into a 32-bit word and splatting that, so the
// uint32x4 overload must be visible before they are defined.
static SIMDPP_INL
void i_set_splat(uint32x4&, uint32_t);
27
// Broadcasts the 8-bit value @a v0 into every lane of @a v.
static SIMDPP_INL
void i_set_splat(uint8x16& v, uint8_t v0)
{
#if SIMDPP_USE_NULL
    v = detail::null::make_vec<uint8x16>(v0);
#elif SIMDPP_USE_AVX2
    uint32_t u0 = v0;
    v = _mm_cvtsi32_si128(u0);
    v = _mm_broadcastb_epi8(v.native());
#elif SIMDPP_USE_SSE2
    // Replicate the byte into all four bytes of a 32-bit word, then splat
    // that word using the uint32x4 overload declared above.
    uint32_t u0;
    u0 = v0 * 0x01010101;
    uint32x4 u;
    i_set_splat(u, u0);
    v = u;
#elif SIMDPP_USE_NEON
    v = vdupq_n_u8(v0);
#elif SIMDPP_USE_ALTIVEC
    // Store the value to aligned memory, load it into lane 0, then splat.
    SIMDPP_ALIGN(16) uint8_t rv[16];
    rv[0] = v0;
    v = altivec::load1(v, rv);
    v = splat<0>(v);
#elif SIMDPP_USE_MSA
    v = (v16u8) __msa_fill_b(v0);
#endif
}
54
#if SIMDPP_USE_AVX2
// 256-bit version: move the value into lane 0 of a 128-bit register and
// broadcast it across all 32 lanes in one instruction.
static SIMDPP_INL
void i_set_splat(uint8x32& v, uint8_t v0)
{
    uint8x16 a = _mm_cvtsi32_si128(v0);
    v = _mm256_broadcastb_epi8(a.native());
}
#endif
63
#if SIMDPP_USE_AVX512BW
// 512-bit version: broadcast lane 0 of a 128-bit register across all 64 lanes.
SIMDPP_INL void i_set_splat(uint8<64>& v, uint8_t v0)
{
    uint8x16 a = _mm_cvtsi32_si128(v0);
    v = _mm512_broadcastb_epi8(a.native());
}
#endif
71
72template<unsigned N> SIMDPP_INL
73void i_set_splat(uint8<N>& v, uint8_t v0)
74{
75 uint8v tv;
76 i_set_splat(tv, v0);
77 for (unsigned i = 0; i < v.vec_length; ++i) {
78 v.vec(i) = tv;
79 }
80}
81
82// -----------------------------------------------------------------------------
83
// Broadcasts the 16-bit value @a v0 into every lane of @a v.
static SIMDPP_INL
void i_set_splat(uint16x8& v, uint16_t v0)
{
#if SIMDPP_USE_NULL
    v = detail::null::make_vec<uint16x8>(v0);
#elif SIMDPP_USE_AVX2
    uint32_t u0 = v0;
    v = _mm_cvtsi32_si128(u0);
    v = _mm_broadcastw_epi16(v.native());
#elif SIMDPP_USE_SSE2
    // Duplicate the value into both halves of a 32-bit word, then splat
    // that word using the uint32x4 overload declared above.
    uint32_t u0;
    u0 = v0 | v0 << 16;
    uint32x4 u;
    i_set_splat(u, u0);
    v = u;
#elif SIMDPP_USE_NEON
    v = vdupq_n_u16(v0);
#elif SIMDPP_USE_ALTIVEC
    // Store the value to aligned memory, load it into lane 0, then splat.
    SIMDPP_ALIGN(16) uint16_t rv[8];
    rv[0] = v0;
    v = altivec::load1(v, rv);
    v = splat<0>(v);
#elif SIMDPP_USE_MSA
    v = (v8u16) __msa_fill_h(v0);
#endif
}
110
#if SIMDPP_USE_AVX2
// 256-bit version: move the value into lane 0 of a 128-bit register and
// broadcast it across all 16 lanes in one instruction.
static SIMDPP_INL
void i_set_splat(uint16x16& v, uint16_t v0)
{
    uint16x8 a = _mm_cvtsi32_si128(v0);
    v = _mm256_broadcastw_epi16(a.native());
}
#endif
119
#if SIMDPP_USE_AVX512BW
// 512-bit version: broadcast lane 0 of a 128-bit register across all 32 lanes.
SIMDPP_INL void i_set_splat(uint16<32>& v, uint16_t v0)
{
    uint16x8 a = _mm_cvtsi32_si128(v0);
    v = _mm512_broadcastw_epi16(a.native());
}
#endif
127
128template<unsigned N> SIMDPP_INL
129void i_set_splat(uint16<N>& v, uint16_t v0)
130{
131 uint16v tv;
132 i_set_splat(tv, v0);
133 for (unsigned i = 0; i < v.vec_length; ++i) {
134 v.vec(i) = tv;
135 }
136}
137
138// -----------------------------------------------------------------------------
139
// Broadcasts the 32-bit value @a v0 into every lane of @a v.
static SIMDPP_INL
void i_set_splat(uint32x4& v, uint32_t v0)
{
#if SIMDPP_USE_NULL
    v = detail::null::make_vec<uint32x4>(v0);
#elif SIMDPP_USE_AVX2
    v = _mm_cvtsi32_si128(v0);
    v = _mm_broadcastd_epi32(v.native());
#elif SIMDPP_USE_SSE2
    // Move the value into lane 0 and replicate it into the other lanes
    // with a shuffle.
    v = _mm_cvtsi32_si128(v0);
    v = permute4<0,0,0,0>(v);
#elif SIMDPP_USE_NEON
    v = vdupq_n_u32(v0);
#elif SIMDPP_USE_ALTIVEC
    // Store the value to aligned memory, load it into lane 0, then splat.
    SIMDPP_ALIGN(16) uint32_t rv[4];
    rv[0] = v0;
    v = altivec::load1(v, rv);
    v = splat<0>(v);
#elif SIMDPP_USE_MSA
    v = (v4u32) __msa_fill_w(v0);
#endif
}
162
#if SIMDPP_USE_AVX2
// 256-bit version: move the value into lane 0 of a 128-bit register and
// broadcast it across all 8 lanes in one instruction.
static SIMDPP_INL
void i_set_splat(uint32x8& v, uint32_t v0)
{
    uint32x4 a = _mm_cvtsi32_si128(v0);
    v = _mm256_broadcastd_epi32(a.native());
}
#endif
171
#if SIMDPP_USE_AVX512F
// 512-bit version: AVX-512 provides a direct scalar set1.
static SIMDPP_INL
void i_set_splat(uint32<16>& v, uint32_t v0)
{
    v = _mm512_set1_epi32(v0);
}
#endif
179
180template<unsigned N> SIMDPP_INL
181void i_set_splat(uint32<N>& v, uint32_t v0)
182{
183 uint32v tv;
184 i_set_splat(tv, v0);
185 for (unsigned i = 0; i < v.vec_length; ++i) {
186 v.vec(i) = tv;
187 }
188}
189
190// -----------------------------------------------------------------------------
191
192static SIMDPP_INL
193void i_set_splat(uint64x2& v, uint64_t v0)
194{
195#if SIMDPP_USE_SSE2
196#if SIMDPP_32_BITS
197 uint32x4 va = _mm_cvtsi32_si128(uint32_t(v0));
198 uint32x4 vb = _mm_cvtsi32_si128(uint32_t(v0 >> 32));
199 v = zip4_lo(va, vb);
200 v = permute2<0,0>(v);
201#else
202 v = _mm_cvtsi64_si128(v0);
203 v = permute2<0,0>(v);
204#endif
205#elif SIMDPP_USE_NEON
206 v = vdupq_n_u64(v0);
207#elif SIMDPP_USE_VSX_207
208 SIMDPP_ALIGN(16) uint64_t rv[2];
209 rv[0] = v0;
210 v = vec_ld(0, reinterpret_cast<const __vector uint64_t*>(rv));
211 v = splat<0>(v);
212#elif SIMDPP_USE_MSA
213#if SIMDPP_64_BITS
214 v = (v2u64) __msa_fill_d(v0.native());
215#else
216 uint32_t v0lo = v0;
217 uint32_t v0hi = v0 >> 32;
218#pragma GCC diagnostic push
219#pragma GCC diagnostic ignored "-Wuninitialized"
220 v4i32 vr;
221 vr = __msa_insert_w(vr, 0, v0lo);
222 vr = __msa_insert_w(vr, 1, v0hi);
223#pragma GCC diagnostic pop
224 v = (int32<4>) vr;
225 v = (v2u64) __msa_splat_d((v2i64) v.native(), 0);
226#endif
227#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
228 v = detail::null::make_vec<uint64x2>(v0);
229#endif
230}
231
#if SIMDPP_USE_AVX2
// 256-bit version.
static SIMDPP_INL
void i_set_splat(uint64x4& v, uint64_t v0)
{
#if SIMDPP_32_BITS
    // No 64-bit GPR->XMM move on 32-bit targets: move the two halves
    // separately and interleave them into lane 0 before broadcasting.
    uint32x4 va = _mm_cvtsi32_si128(uint32_t(v0));
    uint32x4 vb = _mm_cvtsi32_si128(uint32_t(v0 >> 32));
    uint64x2 a = (uint64x2) zip4_lo(va, vb);
    v = _mm256_broadcastq_epi64(a.native());
#else
    uint64x2 a = _mm_cvtsi64_si128(v0);
    v = _mm256_broadcastq_epi64(a.native());
#endif
}
#endif
247
#if SIMDPP_USE_AVX512F
// 512-bit version: AVX-512 provides a direct scalar set1.
static SIMDPP_INL
void i_set_splat(uint64<8>& v, uint64_t v0)
{
    v = _mm512_set1_epi64(v0);
}
#endif
255
256template<unsigned N> SIMDPP_INL
257void i_set_splat(uint64<N>& v, uint64_t v0)
258{
259 uint64v tv;
260 i_set_splat(tv, v0);
261 for (unsigned i = 0; i < v.vec_length; ++i) {
262 v.vec(i) = tv;
263 }
264}
265
266// -----------------------------------------------------------------------------
267
// Broadcasts the single-precision value @a v0 into every lane of @a v.
static SIMDPP_INL
void i_set_splat(float32x4& v, float v0)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    v = detail::null::make_vec<float32x4>(v0);
#elif SIMDPP_USE_SSE2
    v = _mm_set1_ps(v0); // likely in a SSE register anyway
#elif SIMDPP_USE_NEON
    v = vdupq_n_f32(v0);
#elif SIMDPP_USE_ALTIVEC
    // Store the value to aligned memory, load it into lane 0, then splat.
    SIMDPP_ALIGN(16) float rv[4];
    rv[0] = v0;
    v = altivec::load1(v, rv);
    v = splat<0>(v);
#elif SIMDPP_USE_MSA
    // Load the value from aligned memory into lane 0, then splat lane 0.
    SIMDPP_ALIGN(16) float rv[4];
    rv[0] = v0;
    v = (v4f32) __msa_ld_w(rv, 0);
    v = (v4f32) __msa_splat_w((v4i32) v.native(), 0);
#endif
}
289
#if SIMDPP_USE_AVX
// 256-bit version: AVX can broadcast a scalar straight from memory.
static SIMDPP_INL
void i_set_splat(float32x8& v, float v0)
{
    v = _mm256_broadcast_ss(&v0);
}
#endif
297
#if SIMDPP_USE_AVX512F
// 512-bit version: splat a 128-bit vector first, then replicate that
// 4-float group into all four 128-bit lanes.
static SIMDPP_INL
void i_set_splat(float32<16>& v, float v0)
{
    float32<4> a;
    i_set_splat(a, v0);
    v = _mm512_broadcast_f32x4(a.native());
}
#endif
307
308template<unsigned N> SIMDPP_INL
309void i_set_splat(float32<N>& v, float v0)
310{
311#ifdef __GNUC__
312#pragma GCC diagnostic push
313#pragma GCC diagnostic ignored "-Wuninitialized"
314#endif
315 // GCC thinks tv is not initialized
316 float32v tv;
317 i_set_splat(tv, v0);
318 for (unsigned i = 0; i < v.vec_length; ++i) {
319 v.vec(i) = tv;
320 }
321#ifdef __GNUC__
322#pragma GCC diagnostic pop
323#endif
324}
325
326// -----------------------------------------------------------------------------
327
// Broadcasts the double-precision value @a v0 into every lane of @a v.
static SIMDPP_INL
void i_set_splat(float64x2& v, double v0)
{
#if SIMDPP_USE_SSE2
    v = _mm_set1_pd(v0); // likely in a SSE register anyway
#elif SIMDPP_USE_NEON64
    v = vdupq_n_f64(v0);
#elif SIMDPP_USE_VSX_206
    // Store the value to aligned memory, load it into lane 0, then splat.
    SIMDPP_ALIGN(16) double rv[2];
    rv[0] = v0;
    v = vec_ld(0, reinterpret_cast<const __vector double*>(rv));
    v = splat<0>(v);
#elif SIMDPP_USE_MSA
    // Load the value from aligned memory into lane 0, then splat lane 0.
    SIMDPP_ALIGN(16) double rv[2];
    rv[0] = v0;
    v = (v2f64) __msa_ld_d(rv, 0);
    v = (v2f64) __msa_splat_d((v2i64) v.native(), 0);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
    // No native f64 vectors on these targets; fall back to scalar emulation.
    v = detail::null::make_vec<float64x2>(v0);
#endif
}
349
#if SIMDPP_USE_AVX
// 256-bit version: AVX can broadcast a scalar straight from memory.
static SIMDPP_INL
void i_set_splat(float64x4& v, double v0)
{
    v = _mm256_broadcast_sd(&v0);
}
#endif
357
#if SIMDPP_USE_AVX512F
// 512-bit version: splat a 256-bit vector first, then replicate that
// 4-double group into both 256-bit lanes.
static SIMDPP_INL
void i_set_splat(float64<8>& v, double v0)
{
    float64<4> v1;
    i_set_splat(v1, v0);
    v = _mm512_broadcast_f64x4(v1.native());
}
#endif
367
368template<unsigned N> SIMDPP_INL
369void i_set_splat(float64<N>& v, double v0)
370{
371 float64v tv;
372 i_set_splat(tv, v0);
373 for (unsigned i = 0; i < v.vec_length; ++i) {
374 v.vec(i) = tv;
375 }
376}
377
378// -----------------------------------------------------------------------------
379
380template<class V, class VE> SIMDPP_INL
381V i_splat_any(const VE& x)
382{
383#ifdef __GNUC__
384#pragma GCC diagnostic push
385#pragma GCC diagnostic ignored "-Wuninitialized"
386#endif
387 // GCC thinks r is not initialized
388 typename detail::remove_sign<V>::type r;
389 insn::i_set_splat(r, x);
390 return V(r);
391#ifdef __GNUC__
392#pragma GCC diagnostic pop
393#endif
394}
395
396} // namespace insn
397
398template<class V, class VE> SIMDPP_INL
399void construct_eval(V& v, const expr_vec_set_splat<VE>& e)
400{
401 v = insn::i_splat_any<V>(e.a);
402}
403
404template<class V, class VE> SIMDPP_INL
405V splat_impl(const VE& x)
406{
407 static_assert(is_vector<V>::value && !is_mask<V>::value,
408 "V must be a non-mask vector");
409 return insn::i_splat_any<V>(x);
410}
411
412} // namespace detail
413} // namespace SIMDPP_ARCH_NAMESPACE
414} // namespace simdpp
415
416#endif
417
418