1 | /* Copyright (C) 2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_SET_SPLAT_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_SET_SPLAT_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/core/load.h> |
17 | #include <simdpp/core/zip_lo.h> |
18 | #include <simdpp/detail/altivec/load1.h> |
19 | |
20 | namespace simdpp { |
21 | namespace SIMDPP_ARCH_NAMESPACE { |
22 | namespace detail { |
23 | namespace insn { |
24 | |
// Forward declaration: the SSE2 paths of the 8-bit and 16-bit overloads
// below implement their splat in terms of this 32-bit overload, which is
// defined later in this file.
static SIMDPP_INL
void i_set_splat(uint32x4&, uint32_t);
27 | |
// Broadcasts the 8-bit value v0 to all 16 lanes of v.
static SIMDPP_INL
void i_set_splat(uint8x16& v, uint8_t v0)
{
#if SIMDPP_USE_NULL
    v = detail::null::make_vec<uint8x16>(v0);
#elif SIMDPP_USE_AVX2
    // Move the byte into lane 0, then use the AVX2 byte-broadcast.
    uint32_t u0 = v0;
    v = _mm_cvtsi32_si128(u0);
    v = _mm_broadcastb_epi8(v.native());
#elif SIMDPP_USE_SSE2
    // Pre-AVX2 SSE has no byte broadcast: replicate the byte into all
    // four bytes of a 32-bit word, then splat that word via the 32-bit
    // overload (forward-declared above).
    uint32_t u0;
    u0 = v0 * 0x01010101;
    uint32x4 u;
    i_set_splat(u, u0);
    v = u;
#elif SIMDPP_USE_NEON
    v = vdupq_n_u8(v0);
#elif SIMDPP_USE_ALTIVEC
    // Load v0 into lane 0, then broadcast lane 0. Only rv[0] needs
    // initialization: all other lanes are overwritten by splat<0>.
    SIMDPP_ALIGN(16) uint8_t rv[16];
    rv[0] = v0;
    v = altivec::load1(v, rv);
    v = splat<0>(v);
#elif SIMDPP_USE_MSA
    v = (v16u8) __msa_fill_b(v0);
#endif
}
54 | |
#if SIMDPP_USE_AVX2
// Broadcasts the 8-bit value v0 to all 32 lanes of the 256-bit vector v.
static SIMDPP_INL
void i_set_splat(uint8x32& v, uint8_t v0)
{
    // Move the byte into lane 0 of a 128-bit vector, then broadcast it
    // across the full 256-bit register.
    uint8x16 a = _mm_cvtsi32_si128(v0);
    v = _mm256_broadcastb_epi8(a.native());
}
#endif
63 | |
#if SIMDPP_USE_AVX512BW
// Broadcasts the 8-bit value v0 to all 64 lanes of the 512-bit vector v.
SIMDPP_INL void i_set_splat(uint8<64>& v, uint8_t v0)
{
    uint8x16 a = _mm_cvtsi32_si128(v0);
    v = _mm512_broadcastb_epi8(a.native());
}
#endif
71 | |
72 | template<unsigned N> SIMDPP_INL |
73 | void i_set_splat(uint8<N>& v, uint8_t v0) |
74 | { |
75 | uint8v tv; |
76 | i_set_splat(tv, v0); |
77 | for (unsigned i = 0; i < v.vec_length; ++i) { |
78 | v.vec(i) = tv; |
79 | } |
80 | } |
81 | |
82 | // ----------------------------------------------------------------------------- |
83 | |
// Broadcasts the 16-bit value v0 to all 8 lanes of v.
static SIMDPP_INL
void i_set_splat(uint16x8& v, uint16_t v0)
{
#if SIMDPP_USE_NULL
    v = detail::null::make_vec<uint16x8>(v0);
#elif SIMDPP_USE_AVX2
    // Move the value into lane 0, then use the AVX2 word-broadcast.
    uint32_t u0 = v0;
    v = _mm_cvtsi32_si128(u0);
    v = _mm_broadcastw_epi16(v.native());
#elif SIMDPP_USE_SSE2
    // Pre-AVX2 SSE has no 16-bit broadcast: replicate the value into
    // both halves of a 32-bit word and splat it via the 32-bit overload
    // (forward-declared above).
    uint32_t u0;
    u0 = v0 | v0 << 16;
    uint32x4 u;
    i_set_splat(u, u0);
    v = u;
#elif SIMDPP_USE_NEON
    v = vdupq_n_u16(v0);
#elif SIMDPP_USE_ALTIVEC
    // Load v0 into lane 0, then broadcast lane 0. Only rv[0] needs
    // initialization: all other lanes are overwritten by splat<0>.
    SIMDPP_ALIGN(16) uint16_t rv[8];
    rv[0] = v0;
    v = altivec::load1(v, rv);
    v = splat<0>(v);
#elif SIMDPP_USE_MSA
    v = (v8u16) __msa_fill_h(v0);
#endif
}
110 | |
#if SIMDPP_USE_AVX2
// Broadcasts the 16-bit value v0 to all 16 lanes of the 256-bit vector v.
static SIMDPP_INL
void i_set_splat(uint16x16& v, uint16_t v0)
{
    uint16x8 a = _mm_cvtsi32_si128(v0);
    v = _mm256_broadcastw_epi16(a.native());
}
#endif
119 | |
#if SIMDPP_USE_AVX512BW
// Broadcasts the 16-bit value v0 to all 32 lanes of the 512-bit vector v.
SIMDPP_INL void i_set_splat(uint16<32>& v, uint16_t v0)
{
    uint16x8 a = _mm_cvtsi32_si128(v0);
    v = _mm512_broadcastw_epi16(a.native());
}
#endif
127 | |
128 | template<unsigned N> SIMDPP_INL |
129 | void i_set_splat(uint16<N>& v, uint16_t v0) |
130 | { |
131 | uint16v tv; |
132 | i_set_splat(tv, v0); |
133 | for (unsigned i = 0; i < v.vec_length; ++i) { |
134 | v.vec(i) = tv; |
135 | } |
136 | } |
137 | |
138 | // ----------------------------------------------------------------------------- |
139 | |
// Broadcasts the 32-bit value v0 to all 4 lanes of v.
static SIMDPP_INL
void i_set_splat(uint32x4& v, uint32_t v0)
{
#if SIMDPP_USE_NULL
    v = detail::null::make_vec<uint32x4>(v0);
#elif SIMDPP_USE_AVX2
    v = _mm_cvtsi32_si128(v0);
    v = _mm_broadcastd_epi32(v.native());
#elif SIMDPP_USE_SSE2
    // Move the value into lane 0, then shuffle lane 0 into all lanes.
    v = _mm_cvtsi32_si128(v0);
    v = permute4<0,0,0,0>(v);
#elif SIMDPP_USE_NEON
    v = vdupq_n_u32(v0);
#elif SIMDPP_USE_ALTIVEC
    // Load v0 into lane 0, then broadcast lane 0. Only rv[0] needs
    // initialization: all other lanes are overwritten by splat<0>.
    SIMDPP_ALIGN(16) uint32_t rv[4];
    rv[0] = v0;
    v = altivec::load1(v, rv);
    v = splat<0>(v);
#elif SIMDPP_USE_MSA
    v = (v4u32) __msa_fill_w(v0);
#endif
}
162 | |
#if SIMDPP_USE_AVX2
// Broadcasts the 32-bit value v0 to all 8 lanes of the 256-bit vector v.
static SIMDPP_INL
void i_set_splat(uint32x8& v, uint32_t v0)
{
    uint32x4 a = _mm_cvtsi32_si128(v0);
    v = _mm256_broadcastd_epi32(a.native());
}
#endif
171 | |
#if SIMDPP_USE_AVX512F
// Broadcasts the 32-bit value v0 to all 16 lanes of the 512-bit vector v.
static SIMDPP_INL
void i_set_splat(uint32<16>& v, uint32_t v0)
{
    v = _mm512_set1_epi32(v0);
}
#endif
179 | |
180 | template<unsigned N> SIMDPP_INL |
181 | void i_set_splat(uint32<N>& v, uint32_t v0) |
182 | { |
183 | uint32v tv; |
184 | i_set_splat(tv, v0); |
185 | for (unsigned i = 0; i < v.vec_length; ++i) { |
186 | v.vec(i) = tv; |
187 | } |
188 | } |
189 | |
190 | // ----------------------------------------------------------------------------- |
191 | |
192 | static SIMDPP_INL |
193 | void i_set_splat(uint64x2& v, uint64_t v0) |
194 | { |
195 | #if SIMDPP_USE_SSE2 |
196 | #if SIMDPP_32_BITS |
197 | uint32x4 va = _mm_cvtsi32_si128(uint32_t(v0)); |
198 | uint32x4 vb = _mm_cvtsi32_si128(uint32_t(v0 >> 32)); |
199 | v = zip4_lo(va, vb); |
200 | v = permute2<0,0>(v); |
201 | #else |
202 | v = _mm_cvtsi64_si128(v0); |
203 | v = permute2<0,0>(v); |
204 | #endif |
205 | #elif SIMDPP_USE_NEON |
206 | v = vdupq_n_u64(v0); |
207 | #elif SIMDPP_USE_VSX_207 |
208 | SIMDPP_ALIGN(16) uint64_t rv[2]; |
209 | rv[0] = v0; |
210 | v = vec_ld(0, reinterpret_cast<const __vector uint64_t*>(rv)); |
211 | v = splat<0>(v); |
212 | #elif SIMDPP_USE_MSA |
213 | #if SIMDPP_64_BITS |
214 | v = (v2u64) __msa_fill_d(v0.native()); |
215 | #else |
216 | uint32_t v0lo = v0; |
217 | uint32_t v0hi = v0 >> 32; |
218 | #pragma GCC diagnostic push |
219 | #pragma GCC diagnostic ignored "-Wuninitialized" |
220 | v4i32 vr; |
221 | vr = __msa_insert_w(vr, 0, v0lo); |
222 | vr = __msa_insert_w(vr, 1, v0hi); |
223 | #pragma GCC diagnostic pop |
224 | v = (int32<4>) vr; |
225 | v = (v2u64) __msa_splat_d((v2i64) v.native(), 0); |
226 | #endif |
227 | #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC |
228 | v = detail::null::make_vec<uint64x2>(v0); |
229 | #endif |
230 | } |
231 | |
#if SIMDPP_USE_AVX2
// Broadcasts the 64-bit value v0 to all 4 lanes of the 256-bit vector v.
static SIMDPP_INL
void i_set_splat(uint64x4& v, uint64_t v0)
{
#if SIMDPP_32_BITS
    // 32-bit builds lack _mm_cvtsi64_si128: assemble the 64-bit value in
    // lane 0 from its two halves, then broadcast it.
    uint32x4 va = _mm_cvtsi32_si128(uint32_t(v0));
    uint32x4 vb = _mm_cvtsi32_si128(uint32_t(v0 >> 32));
    uint64x2 a = (uint64x2) zip4_lo(va, vb);
    v = _mm256_broadcastq_epi64(a.native());
#else
    uint64x2 a = _mm_cvtsi64_si128(v0);
    v = _mm256_broadcastq_epi64(a.native());
#endif
}
#endif
247 | |
#if SIMDPP_USE_AVX512F
// Broadcasts the 64-bit value v0 to all 8 lanes of the 512-bit vector v.
static SIMDPP_INL
void i_set_splat(uint64<8>& v, uint64_t v0)
{
    v = _mm512_set1_epi64(v0);
}
#endif
255 | |
256 | template<unsigned N> SIMDPP_INL |
257 | void i_set_splat(uint64<N>& v, uint64_t v0) |
258 | { |
259 | uint64v tv; |
260 | i_set_splat(tv, v0); |
261 | for (unsigned i = 0; i < v.vec_length; ++i) { |
262 | v.vec(i) = tv; |
263 | } |
264 | } |
265 | |
266 | // ----------------------------------------------------------------------------- |
267 | |
// Broadcasts the float value v0 to all 4 lanes of v.
static SIMDPP_INL
void i_set_splat(float32x4& v, float v0)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    v = detail::null::make_vec<float32x4>(v0);
#elif SIMDPP_USE_SSE2
    v = _mm_set1_ps(v0); // likely in a SSE register anyway
#elif SIMDPP_USE_NEON
    v = vdupq_n_f32(v0);
#elif SIMDPP_USE_ALTIVEC
    // Load v0 into lane 0, then broadcast lane 0. Only rv[0] needs
    // initialization: all other lanes are overwritten by splat<0>.
    SIMDPP_ALIGN(16) float rv[4];
    rv[0] = v0;
    v = altivec::load1(v, rv);
    v = splat<0>(v);
#elif SIMDPP_USE_MSA
    // Load from memory into lane 0, then broadcast lane 0.
    SIMDPP_ALIGN(16) float rv[4];
    rv[0] = v0;
    v = (v4f32) __msa_ld_w(rv, 0);
    v = (v4f32) __msa_splat_w((v4i32) v.native(), 0);
#endif
}
289 | |
#if SIMDPP_USE_AVX
// Broadcasts the float value v0 to all 8 lanes of the 256-bit vector v.
static SIMDPP_INL
void i_set_splat(float32x8& v, float v0)
{
    v = _mm256_broadcast_ss(&v0);
}
#endif
297 | |
#if SIMDPP_USE_AVX512F
// Broadcasts the float value v0 to all 16 lanes of the 512-bit vector v.
static SIMDPP_INL
void i_set_splat(float32<16>& v, float v0)
{
    // Splat into a 128-bit vector first, then replicate that 128-bit
    // group across the 512-bit register.
    float32<4> a;
    i_set_splat(a, v0);
    v = _mm512_broadcast_f32x4(a.native());
}
#endif
307 | |
308 | template<unsigned N> SIMDPP_INL |
309 | void i_set_splat(float32<N>& v, float v0) |
310 | { |
311 | #ifdef __GNUC__ |
312 | #pragma GCC diagnostic push |
313 | #pragma GCC diagnostic ignored "-Wuninitialized" |
314 | #endif |
315 | // GCC thinks tv is not initialized |
316 | float32v tv; |
317 | i_set_splat(tv, v0); |
318 | for (unsigned i = 0; i < v.vec_length; ++i) { |
319 | v.vec(i) = tv; |
320 | } |
321 | #ifdef __GNUC__ |
322 | #pragma GCC diagnostic pop |
323 | #endif |
324 | } |
325 | |
326 | // ----------------------------------------------------------------------------- |
327 | |
// Broadcasts the double value v0 to both lanes of v.
static SIMDPP_INL
void i_set_splat(float64x2& v, double v0)
{
#if SIMDPP_USE_SSE2
    v = _mm_set1_pd(v0); // likely in a SSE register anyway
#elif SIMDPP_USE_NEON64
    v = vdupq_n_f64(v0);
#elif SIMDPP_USE_VSX_206
    // Load v0 into lane 0 of the vector, then broadcast lane 0. Only
    // rv[0] needs initialization: lane 1 is overwritten by splat<0>.
    SIMDPP_ALIGN(16) double rv[2];
    rv[0] = v0;
    v = vec_ld(0, reinterpret_cast<const __vector double*>(rv));
    v = splat<0>(v);
#elif SIMDPP_USE_MSA
    // Load from memory into lane 0, then broadcast lane 0.
    SIMDPP_ALIGN(16) double rv[2];
    rv[0] = v0;
    v = (v2f64) __msa_ld_d(rv, 0);
    v = (v2f64) __msa_splat_d((v2i64) v.native(), 0);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
    // No double-precision vector support: fall back to scalar emulation.
    v = detail::null::make_vec<float64x2>(v0);
#endif
}
349 | |
#if SIMDPP_USE_AVX
// Broadcasts the double value v0 to all 4 lanes of the 256-bit vector v.
static SIMDPP_INL
void i_set_splat(float64x4& v, double v0)
{
    v = _mm256_broadcast_sd(&v0);
}
#endif
357 | |
#if SIMDPP_USE_AVX512F
// Broadcasts the double value v0 to all 8 lanes of the 512-bit vector v.
static SIMDPP_INL
void i_set_splat(float64<8>& v, double v0)
{
    // Splat into a 256-bit vector first, then replicate that 256-bit
    // group across the 512-bit register.
    float64<4> v1;
    i_set_splat(v1, v0);
    v = _mm512_broadcast_f64x4(v1.native());
}
#endif
367 | |
368 | template<unsigned N> SIMDPP_INL |
369 | void i_set_splat(float64<N>& v, double v0) |
370 | { |
371 | float64v tv; |
372 | i_set_splat(tv, v0); |
373 | for (unsigned i = 0; i < v.vec_length; ++i) { |
374 | v.vec(i) = tv; |
375 | } |
376 | } |
377 | |
378 | // ----------------------------------------------------------------------------- |
379 | |
// Splats the scalar x into a vector of type V. The splat is computed in
// the unsigned equivalent of V and then converted, so one set of
// i_set_splat overloads serves both signed and unsigned vectors.
template<class V, class VE> SIMDPP_INL
V i_splat_any(const VE& x)
{
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    // GCC thinks r is not initialized
    typename detail::remove_sign<V>::type r;
    insn::i_set_splat(r, x);
    return V(r);
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
}
395 | |
396 | } // namespace insn |
397 | |
// Evaluates a vec_set_splat expression node by splatting its scalar
// operand into the destination vector.
template<class V, class VE> SIMDPP_INL
void construct_eval(V& v, const expr_vec_set_splat<VE>& e)
{
    v = insn::i_splat_any<V>(e.a);
}
403 | |
// Public entry point for splatting scalar x into a vector of type V.
// Rejects mask types at compile time; masks cannot be splat this way.
template<class V, class VE> SIMDPP_INL
V splat_impl(const VE& x)
{
    static_assert(is_vector<V>::value && !is_mask<V>::value,
                  "V must be a non-mask vector" );
    return insn::i_splat_any<V>(x);
}
411 | |
412 | } // namespace detail |
413 | } // namespace SIMDPP_ARCH_NAMESPACE |
414 | } // namespace simdpp |
415 | |
416 | #endif |
417 | |
418 | |