1 | /* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_MAKE_CONST_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_MAKE_CONST_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/detail/null/set.h> |
17 | #include <simdpp/detail/traits.h> |
18 | |
19 | #if _MSC_VER |
20 | #pragma warning(push) |
21 | #pragma warning(disable: 4244) |
22 | #endif |
23 | |
24 | namespace simdpp { |
25 | namespace SIMDPP_ARCH_NAMESPACE { |
26 | namespace detail { |
27 | namespace insn { |
28 | |
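// i_make_const() fills the native vector 'v' with the constants stored in
// the expression 'e', starting at element 'off'. e.val(i) wraps around
// modulo the number of stored values, so an expression holding fewer
// values than the vector has lanes repeats its pattern (the generic
// multi-vector loop at the bottom of this file depends on this). User-level
// wrappers such as make_uint() and make_float() produce these expressions.
// The overloads below pick the cheapest way to materialize the constants on
// each instruction set; the small-N specializations broadcast a packed
// pattern instead of loading a full-width array.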
29 | #if SIMDPP_USE_NEON_FLT_SP |
30 | template<class VE> SIMDPP_INL |
void i_make_const(float32<4>& v, const expr_vec_make_const<VE,1>& e, unsigned off)
{
    float rv = e.val(off+0);
    v = vld1q_dup_f32(&rv);
35 | } |
36 | |
37 | template<class VE> SIMDPP_INL |
38 | void i_make_const(float32<4>& v, const expr_vec_make_const<VE,2>& e, unsigned off) |
39 | { |
40 | float SIMDPP_ALIGN(8) data[2] = { |
41 | (float) e.val(off+0), |
42 | (float) e.val(off+1) |
43 | }; |
44 | float32x2_t half = vld1_f32(data); |
45 | v = vcombine_f32(half, half); |
46 | } |
47 | #endif |
48 | |
49 | template<class VE, unsigned N> SIMDPP_INL |
50 | void i_make_const(float32<4>& v, const expr_vec_make_const<VE,N>& e, unsigned off) |
51 | { |
52 | #if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP |
53 | v = detail::null::make_vec<float32<4>, float>(e.val(off+0), e.val(off+1), |
54 | e.val(off+2), e.val(off+3)); |
55 | #elif SIMDPP_USE_SSE2 |
56 | v = _mm_set_ps(e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0)); |
57 | #elif SIMDPP_USE_NEON |
58 | float SIMDPP_ALIGN(16) data[4] = { |
59 | (float) e.val(off+0), |
60 | (float) e.val(off+1), |
61 | (float) e.val(off+2), |
62 | (float) e.val(off+3) |
63 | }; |
64 | v = vld1q_f32(data); |
65 | #elif SIMDPP_USE_ALTIVEC |
66 | v = (__vector float){ float(e.val(off+0)), float(e.val(off+1)), |
67 | float(e.val(off+2)), float(e.val(off+3)) }; |
68 | #elif SIMDPP_USE_MSA |
69 | v = (v4f32){ float(e.val(off+0)), float(e.val(off+1)), |
70 | float(e.val(off+2)), float(e.val(off+3)) }; |
71 | #endif |
72 | } |
73 | |
74 | #if SIMDPP_USE_AVX |
75 | template<class VE, unsigned N> SIMDPP_INL |
76 | void i_make_const(float32<8>& v, const expr_vec_make_const<VE,N>& e, unsigned off) |
77 | { |
78 | v = _mm256_set_ps(e.val(off+7), e.val(off+6), e.val(off+5), e.val(off+4), |
79 | e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0)); |
80 | } |
81 | |
82 | #endif |
83 | |
84 | #if SIMDPP_USE_AVX512F |
85 | template<class VE, unsigned N> SIMDPP_INL |
86 | void i_make_const(float32<16>& v, const expr_vec_make_const<VE,N>& e, unsigned off) |
87 | { |
88 | v = _mm512_set_ps(e.val(off+15), e.val(off+14), e.val(off+13), e.val(off+12), |
89 | e.val(off+11), e.val(off+10), e.val(off+9), e.val(off+8), |
90 | e.val(off+7), e.val(off+6), e.val(off+5), e.val(off+4), |
91 | e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0)); |
92 | } |
93 | #endif |
94 | |
95 | // ----------------------------------------------------------------------------- |
96 | |
97 | template<class VE, unsigned N> SIMDPP_INL |
98 | void i_make_const(float64<2>& v, const expr_vec_make_const<VE,N>& e, unsigned off) |
99 | { |
100 | #if SIMDPP_USE_SSE2 |
101 | v = _mm_set_pd(e.val(off+1), e.val(off+0)); |
102 | #elif SIMDPP_USE_NEON64 |
103 | double SIMDPP_ALIGN(16) data[2] = { |
104 | (double) e.val(off+0), |
105 | (double) e.val(off+1) |
106 | }; |
107 | v = vld1q_f64(data); |
108 | #elif SIMDPP_USE_VSX_206 |
109 | __vector double r = { double(e.val(off+0)), double(e.val(off+1)) }; |
110 | v = r; |
111 | #elif SIMDPP_USE_MSA |
112 | v = (v2f64){ double(e.val(off+0)), double(e.val(off+1)) }; |
113 | #elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC |
114 | v = detail::null::make_vec<float64<2>, double>(e.val(off+0), e.val(off+1)); |
115 | #endif |
116 | } |
117 | |
118 | #if SIMDPP_USE_AVX |
119 | template<class VE, unsigned N> SIMDPP_INL |
120 | void i_make_const(float64<4>& v, const expr_vec_make_const<VE,N>& e, unsigned off) |
121 | { |
122 | v = _mm256_set_pd(e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0)); |
123 | } |
124 | #endif |
125 | |
126 | #if SIMDPP_USE_AVX512F |
127 | template<class VE, unsigned N> SIMDPP_INL |
128 | void i_make_const(float64<8>& v, const expr_vec_make_const<VE,N>& e, unsigned off) |
129 | { |
130 | v = _mm512_set_pd(e.val(off+7), e.val(off+6), e.val(off+5), e.val(off+4), |
131 | e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0)); |
132 | } |
133 | #endif |
134 | |
135 | // ----------------------------------------------------------------------------- |
136 | |
137 | #if SIMDPP_USE_NEON |
138 | template<class VE> SIMDPP_INL |
139 | void i_make_const(uint8<16>& v, const expr_vec_make_const<VE,1>& e, unsigned off) |
140 | { |
141 | uint8_t rv = e.val(off+0); |
142 | v = vld1q_dup_u8(&rv); |
143 | } |
144 | |
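// A repeating pattern of 2 or 4 bytes is packed into a single 16-bit or
// 32-bit value (lowest-indexed byte in the least significant position,
// which relies on a little-endian element layout) and broadcast to all
// lanes with vld1q_dup.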
145 | template<class VE> SIMDPP_INL |
146 | void i_make_const(uint8<16>& v, const expr_vec_make_const<VE,2>& e, unsigned off) |
147 | { |
148 | uint16_t rv = (e.val(off+0) & 0xff) | (e.val(off+1) & 0xff) << 8; |
149 | v = (uint16<8>) vld1q_dup_u16(&rv); |
150 | } |
151 | |
152 | template<class VE> SIMDPP_INL |
153 | void i_make_const(uint8<16>& v, const expr_vec_make_const<VE,4>& e, unsigned off) |
154 | { |
    uint32_t rv = (uint32_t(e.val(off+0)) & 0xff) | (uint32_t(e.val(off+1)) & 0xff) << 8 |
                  (uint32_t(e.val(off+2)) & 0xff) << 16 | (uint32_t(e.val(off+3)) & 0xff) << 24;
157 | v = (uint32<4>) vld1q_dup_u32(&rv); |
158 | } |
159 | |
160 | template<class VE> SIMDPP_INL |
161 | void i_make_const(uint8<16>& v, const expr_vec_make_const<VE,8>& e, unsigned off) |
162 | { |
163 | uint8_t SIMDPP_ALIGN(8) data[8] = { |
164 | (uint8_t) e.val(off+0), (uint8_t) e.val(off+1), |
165 | (uint8_t) e.val(off+2), (uint8_t) e.val(off+3), |
166 | (uint8_t) e.val(off+4), (uint8_t) e.val(off+5), |
167 | (uint8_t) e.val(off+6), (uint8_t) e.val(off+7) |
168 | }; |
169 | uint8x8_t half = vld1_u8(data); |
170 | v = vcombine_u8(half, half); |
171 | } |
172 | #endif |
173 | |
174 | template<class VE, unsigned N> SIMDPP_INL |
175 | void i_make_const(uint8<16>& v, const expr_vec_make_const<VE,N>& e, unsigned off) |
176 | { |
177 | #if SIMDPP_USE_NULL |
178 | v = detail::null::make_vec<uint8<16>, uint8_t>( |
179 | e.val(off+0), e.val(off+1), e.val(off+2), e.val(off+3), |
180 | e.val(off+4), e.val(off+5), e.val(off+6), e.val(off+7), |
181 | e.val(off+8), e.val(off+9), e.val(off+10), e.val(off+11), |
182 | e.val(off+12), e.val(off+13), e.val(off+14), e.val(off+15)); |
183 | #elif SIMDPP_USE_SSE2 |
184 | v = _mm_set_epi8(e.val(off+15), e.val(off+14), e.val(off+13), e.val(off+12), |
185 | e.val(off+11), e.val(off+10), e.val(off+9), e.val(off+8), |
186 | e.val(off+7), e.val(off+6), e.val(off+5), e.val(off+4), |
187 | e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0)); |
188 | #elif SIMDPP_USE_NEON |
189 | uint8_t SIMDPP_ALIGN(16) data[16] = { |
190 | (uint8_t) e.val(off+0), (uint8_t) e.val(off+1), |
191 | (uint8_t) e.val(off+2), (uint8_t) e.val(off+3), |
192 | (uint8_t) e.val(off+4), (uint8_t) e.val(off+5), |
193 | (uint8_t) e.val(off+6), (uint8_t) e.val(off+7), |
194 | (uint8_t) e.val(off+8), (uint8_t) e.val(off+9), |
195 | (uint8_t) e.val(off+10), (uint8_t) e.val(off+11), |
196 | (uint8_t) e.val(off+12), (uint8_t) e.val(off+13), |
197 | (uint8_t) e.val(off+14), (uint8_t) e.val(off+15) |
198 | }; |
199 | v = vld1q_u8(data); |
200 | #elif SIMDPP_USE_ALTIVEC |
201 | v = (__vector uint8_t){ |
202 | uint8_t(e.val(off+0)), uint8_t(e.val(off+1)), uint8_t(e.val(off+2)), uint8_t(e.val(off+3)), |
203 | uint8_t(e.val(off+4)), uint8_t(e.val(off+5)), uint8_t(e.val(off+6)), uint8_t(e.val(off+7)), |
204 | uint8_t(e.val(off+8)), uint8_t(e.val(off+9)), uint8_t(e.val(off+10)), uint8_t(e.val(off+11)), |
205 | uint8_t(e.val(off+12)), uint8_t(e.val(off+13)), uint8_t(e.val(off+14)), uint8_t(e.val(off+15)) |
206 | }; |
207 | #elif SIMDPP_USE_MSA |
208 | v = (v16u8){ |
209 | uint8_t(e.val(off+0)), uint8_t(e.val(off+1)), uint8_t(e.val(off+2)), uint8_t(e.val(off+3)), |
210 | uint8_t(e.val(off+4)), uint8_t(e.val(off+5)), uint8_t(e.val(off+6)), uint8_t(e.val(off+7)), |
211 | uint8_t(e.val(off+8)), uint8_t(e.val(off+9)), uint8_t(e.val(off+10)), uint8_t(e.val(off+11)), |
212 | uint8_t(e.val(off+12)), uint8_t(e.val(off+13)), uint8_t(e.val(off+14)), uint8_t(e.val(off+15)) |
213 | }; |
214 | #endif |
215 | } |
216 | |
217 | #if SIMDPP_USE_AVX2 |
218 | template<class VE, unsigned N> SIMDPP_INL |
219 | void i_make_const(uint8<32>& v, const expr_vec_make_const<VE,N>& e, unsigned off) |
220 | { |
221 | v = _mm256_set_epi8(e.val(off+31), e.val(off+30), e.val(off+29), e.val(off+28), |
222 | e.val(off+27), e.val(off+26), e.val(off+25), e.val(off+24), |
223 | e.val(off+23), e.val(off+22), e.val(off+21), e.val(off+20), |
224 | e.val(off+19), e.val(off+18), e.val(off+17), e.val(off+16), |
225 | e.val(off+15), e.val(off+14), e.val(off+13), e.val(off+12), |
226 | e.val(off+11), e.val(off+10), e.val(off+9), e.val(off+8), |
227 | e.val(off+7), e.val(off+6), e.val(off+5), e.val(off+4), |
228 | e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0)); |
229 | } |
230 | #endif |
231 | |
232 | #if SIMDPP_USE_AVX512BW |
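// Bytes are packed into 32-bit lanes (lowest-indexed element in the least
// significant byte) and the vector is built with _mm512_set_epi32,
// presumably because _mm512_set_epi8 is not available on all supported
// compilers.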
SIMDPP_INL uint32_t make_uint32_uint8(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4)
{
    // Cast to uint32_t before shifting: the arguments would otherwise be
    // promoted to signed int, and shifting into bit 31 would overflow it.
    return uint32_t(a1) | (uint32_t(a2) << 8) | (uint32_t(a3) << 16) | (uint32_t(a4) << 24);
}
237 | |
238 | template<class VE, unsigned N> SIMDPP_INL |
239 | void i_make_const(uint8<64>& v, const expr_vec_make_const<VE,N>& e, unsigned off) |
240 | { |
241 | v = _mm512_set_epi32( |
242 | make_uint32_uint8(e.val(off+60), e.val(off+61), e.val(off+62), e.val(off+63)), |
243 | make_uint32_uint8(e.val(off+56), e.val(off+57), e.val(off+58), e.val(off+59)), |
244 | make_uint32_uint8(e.val(off+52), e.val(off+53), e.val(off+54), e.val(off+55)), |
245 | make_uint32_uint8(e.val(off+48), e.val(off+49), e.val(off+50), e.val(off+51)), |
246 | make_uint32_uint8(e.val(off+44), e.val(off+45), e.val(off+46), e.val(off+47)), |
247 | make_uint32_uint8(e.val(off+40), e.val(off+41), e.val(off+42), e.val(off+43)), |
248 | make_uint32_uint8(e.val(off+36), e.val(off+37), e.val(off+38), e.val(off+39)), |
249 | make_uint32_uint8(e.val(off+32), e.val(off+33), e.val(off+34), e.val(off+35)), |
250 | make_uint32_uint8(e.val(off+28), e.val(off+29), e.val(off+30), e.val(off+31)), |
251 | make_uint32_uint8(e.val(off+24), e.val(off+25), e.val(off+26), e.val(off+27)), |
252 | make_uint32_uint8(e.val(off+20), e.val(off+21), e.val(off+22), e.val(off+23)), |
253 | make_uint32_uint8(e.val(off+16), e.val(off+17), e.val(off+18), e.val(off+19)), |
254 | make_uint32_uint8(e.val(off+12), e.val(off+13), e.val(off+14), e.val(off+15)), |
255 | make_uint32_uint8(e.val(off+8), e.val(off+9), e.val(off+10), e.val(off+11)), |
256 | make_uint32_uint8(e.val(off+4), e.val(off+5), e.val(off+6), e.val(off+7)), |
257 | make_uint32_uint8(e.val(off+0), e.val(off+1), e.val(off+2), e.val(off+3)) |
258 | ); |
259 | } |
#endif

263 | // ----------------------------------------------------------------------------- |
264 | |
265 | #if SIMDPP_USE_NEON |
266 | template<class VE> SIMDPP_INL |
267 | void i_make_const(uint16<8>& v, const expr_vec_make_const<VE,1>& e, unsigned off) |
268 | { |
269 | uint16_t rv = e.val(off+0); |
270 | v = vld1q_dup_u16(&rv); |
271 | } |
272 | |
273 | template<class VE> SIMDPP_INL |
274 | void i_make_const(uint16<8>& v, const expr_vec_make_const<VE,2>& e, unsigned off) |
275 | { |
    uint32_t rv = (uint32_t(e.val(off+0)) & 0xffff) | (uint32_t(e.val(off+1)) & 0xffff) << 16;
277 | v = (uint32<4>) vld1q_dup_u32(&rv); |
278 | } |
279 | |
280 | template<class VE> SIMDPP_INL |
281 | void i_make_const(uint16<8>& v, const expr_vec_make_const<VE,4>& e, unsigned off) |
282 | { |
283 | uint16_t SIMDPP_ALIGN(8) data[4] = { |
284 | (uint16_t) e.val(off+0), |
285 | (uint16_t) e.val(off+1), |
286 | (uint16_t) e.val(off+2), |
287 | (uint16_t) e.val(off+3) |
288 | }; |
289 | uint16x4_t half = vld1_u16(data); |
290 | v = vcombine_u16(half, half); |
291 | } |
#endif

295 | template<class VE, unsigned N> SIMDPP_INL |
296 | void i_make_const(uint16<8>& v, const expr_vec_make_const<VE,N>& e, unsigned off) |
297 | { |
298 | #if SIMDPP_USE_NULL |
299 | v = detail::null::make_vec<uint16<8>, uint16_t>(e.val(off+0), e.val(off+1), e.val(off+2), e.val(off+3), |
300 | e.val(off+4), e.val(off+5), e.val(off+6), e.val(off+7)); |
301 | #elif SIMDPP_USE_SSE2 |
302 | v = _mm_set_epi16(e.val(off+7), e.val(off+6), e.val(off+5), e.val(off+4), |
303 | e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0)); |
304 | #elif SIMDPP_USE_NEON |
305 | uint16_t SIMDPP_ALIGN(16) data[8] = { |
306 | (uint16_t) e.val(off+0), (uint16_t) e.val(off+1), |
307 | (uint16_t) e.val(off+2), (uint16_t) e.val(off+3), |
308 | (uint16_t) e.val(off+4), (uint16_t) e.val(off+5), |
309 | (uint16_t) e.val(off+6), (uint16_t) e.val(off+7) |
310 | }; |
311 | v = vld1q_u16(data); |
312 | #elif SIMDPP_USE_ALTIVEC |
313 | v = (__vector uint16_t){ |
314 | uint16_t(e.val(off+0)), uint16_t(e.val(off+1)), uint16_t(e.val(off+2)), uint16_t(e.val(off+3)), |
315 | uint16_t(e.val(off+4)), uint16_t(e.val(off+5)), uint16_t(e.val(off+6)), uint16_t(e.val(off+7)) |
316 | }; |
317 | #elif SIMDPP_USE_MSA |
318 | v = (v8u16){ |
319 | uint16_t(e.val(off+0)), uint16_t(e.val(off+1)), uint16_t(e.val(off+2)), uint16_t(e.val(off+3)), |
320 | uint16_t(e.val(off+4)), uint16_t(e.val(off+5)), uint16_t(e.val(off+6)), uint16_t(e.val(off+7)) |
321 | }; |
322 | #endif |
323 | } |
324 | |
325 | #if SIMDPP_USE_AVX2 |
326 | template<class VE, unsigned N> SIMDPP_INL |
327 | void i_make_const(uint16<16>& v, const expr_vec_make_const<VE,N>& e, unsigned off) |
328 | { |
329 | v = _mm256_set_epi16(e.val(off+15), e.val(off+14), e.val(off+13), e.val(off+12), |
330 | e.val(off+11), e.val(off+10), e.val(off+9), e.val(off+8), |
331 | e.val(off+7), e.val(off+6), e.val(off+5), e.val(off+4), |
332 | e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0)); |
333 | } |
334 | #endif |
335 | |
336 | #if SIMDPP_USE_AVX512BW |
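// As in the 8-bit case above, pairs of 16-bit elements are packed into
// 32-bit lanes so that the vector can be built with _mm512_set_epi32.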
SIMDPP_INL uint32_t make_uint32_uint16(uint16_t a1, uint16_t a2)
{
    // Cast before shifting, for the same reason as in make_uint32_uint8().
    return uint32_t(a1) | (uint32_t(a2) << 16);
}
341 | |
342 | template<class VE, unsigned N> SIMDPP_INL |
343 | void i_make_const(uint16<32>& v, const expr_vec_make_const<VE,N>& e, unsigned off) |
344 | { |
345 | v = _mm512_set_epi32( |
346 | make_uint32_uint16(e.val(off+30), e.val(off+31)), |
347 | make_uint32_uint16(e.val(off+28), e.val(off+29)), |
348 | make_uint32_uint16(e.val(off+26), e.val(off+27)), |
349 | make_uint32_uint16(e.val(off+24), e.val(off+25)), |
350 | make_uint32_uint16(e.val(off+22), e.val(off+23)), |
351 | make_uint32_uint16(e.val(off+20), e.val(off+21)), |
352 | make_uint32_uint16(e.val(off+18), e.val(off+19)), |
353 | make_uint32_uint16(e.val(off+16), e.val(off+17)), |
354 | make_uint32_uint16(e.val(off+14), e.val(off+15)), |
355 | make_uint32_uint16(e.val(off+12), e.val(off+13)), |
356 | make_uint32_uint16(e.val(off+10), e.val(off+11)), |
357 | make_uint32_uint16(e.val(off+8), e.val(off+9)), |
358 | make_uint32_uint16(e.val(off+6), e.val(off+7)), |
359 | make_uint32_uint16(e.val(off+4), e.val(off+5)), |
360 | make_uint32_uint16(e.val(off+2), e.val(off+3)), |
361 | make_uint32_uint16(e.val(off+0), e.val(off+1))); |
362 | } |
363 | #endif |
364 | |
365 | // ----------------------------------------------------------------------------- |
366 | |
367 | #if SIMDPP_USE_NEON |
368 | template<class VE> SIMDPP_INL |
369 | void i_make_const(uint32<4>& v, const expr_vec_make_const<VE,1>& e, unsigned off) |
370 | { |
371 | uint32_t rv = e.val(off+0); |
372 | v = vld1q_dup_u32(&rv); |
373 | } |
374 | |
375 | template<class VE> SIMDPP_INL |
376 | void i_make_const(uint32<4>& v, const expr_vec_make_const<VE,2>& e, unsigned off) |
377 | { |
378 | uint32_t SIMDPP_ALIGN(8) data[2] = { |
379 | (uint32_t) e.val(off+0), |
380 | (uint32_t) e.val(off+1) |
381 | }; |
382 | uint32x2_t half = vld1_u32(data); |
383 | v = vcombine_u32(half, half); |
384 | } |
385 | #endif |
386 | |
387 | template<class VE, unsigned N> SIMDPP_INL |
388 | void i_make_const(uint32<4>& v, const expr_vec_make_const<VE,N>& e, unsigned off) |
389 | { |
390 | #if SIMDPP_USE_NULL |
391 | v = detail::null::make_vec<uint32<4>, uint32_t>(e.val(off+0), e.val(off+1), e.val(off+2), e.val(off+3)); |
392 | #elif SIMDPP_USE_SSE2 |
393 | v = _mm_set_epi32(e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0)); |
394 | #elif SIMDPP_USE_NEON |
395 | uint32_t SIMDPP_ALIGN(16) data[4] = { |
396 | (uint32_t) e.val(off+0), (uint32_t) e.val(off+1), |
397 | (uint32_t) e.val(off+2), (uint32_t) e.val(off+3) |
398 | }; |
399 | v = vld1q_u32(data); |
400 | #elif SIMDPP_USE_ALTIVEC |
401 | v = (__vector uint32_t) { uint32_t(e.val(off+0)), uint32_t(e.val(off+1)), |
402 | uint32_t(e.val(off+2)), uint32_t(e.val(off+3)) }; |
403 | #elif SIMDPP_USE_MSA |
404 | v = (v4u32) { uint32_t(e.val(off+0)), uint32_t(e.val(off+1)), |
405 | uint32_t(e.val(off+2)), uint32_t(e.val(off+3)) }; |
406 | #endif |
407 | } |
408 | |
409 | #if SIMDPP_USE_AVX2 |
410 | template<class VE, unsigned N> SIMDPP_INL |
411 | void i_make_const(uint32<8>& v, const expr_vec_make_const<VE,N>& e, unsigned off) |
412 | { |
413 | v = _mm256_set_epi32(e.val(off+7), e.val(off+6), e.val(off+5), e.val(off+4), |
414 | e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0)); |
415 | } |
416 | #endif |
417 | |
418 | #if SIMDPP_USE_AVX512F |
419 | template<class VE, unsigned N> SIMDPP_INL |
420 | void i_make_const(uint32<16>& v, const expr_vec_make_const<VE,N>& e, unsigned off) |
421 | { |
422 | v = _mm512_set_epi32(e.val(off+15), e.val(off+14), e.val(off+13), e.val(off+12), |
423 | e.val(off+11), e.val(off+10), e.val(off+9), e.val(off+8), |
424 | e.val(off+7), e.val(off+6), e.val(off+5), e.val(off+4), |
425 | e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0)); |
426 | } |
427 | #endif |
428 | |
429 | // ----------------------------------------------------------------------------- |
430 | |
431 | #if SIMDPP_USE_NEON |
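// A single repeating 64-bit value is built with vcreate_u64 and duplicated
// into both halves of the 128-bit vector with vcombine_u64.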
432 | template<class VE> SIMDPP_INL |
433 | void i_make_const(uint64<2>& v, const expr_vec_make_const<VE,1>& e, unsigned off) |
434 | { |
435 | uint64x1_t r0 = vcreate_u64(uint64_t(e.val(off+0))); |
436 | v = vcombine_u64(r0, r0); |
437 | } |
438 | #endif |
439 | |
440 | template<class VE, unsigned N> SIMDPP_INL |
441 | void i_make_const(uint64<2>& v, const expr_vec_make_const<VE,N>& e, unsigned off) |
442 | { |
443 | #if SIMDPP_USE_SSE2 |
444 | #if SIMDPP_32_BITS && _MSC_VER |
445 | // MSVC does not support _mm_set_epi64x in 32-bit mode |
446 | uint64_t v1 = e.val(off+1); |
447 | uint64_t v0 = e.val(off+0); |
448 | v = _mm_set_epi32(v1 >> 32, v1 & 0xffffffff, v0 >> 32, v0 & 0xffffffff); |
449 | #else |
450 | v = _mm_set_epi64x(e.val(off+1), e.val(off+0)); |
451 | #endif |
452 | #elif SIMDPP_USE_NEON |
453 | uint64_t SIMDPP_ALIGN(16) data[2] = { |
454 | (uint64_t) e.val(off+0), |
455 | (uint64_t) e.val(off+1) |
456 | }; |
457 | v = vld1q_u64(data); |
458 | #elif SIMDPP_USE_VSX_207 |
459 | __vector uint64_t r = { (uint64_t)e.val(off+0), (uint64_t)e.val(off+1) }; |
460 | v = r; |
461 | #elif SIMDPP_USE_MSA |
462 | v = (v2u64) { uint64_t(e.val(off+0)), uint64_t(e.val(off+1)) }; |
463 | #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC |
464 | v = detail::null::make_vec<uint64<2>, uint64_t>(e.val(off+0), e.val(off+1)); |
465 | #endif |
466 | } |
467 | |
468 | #if SIMDPP_USE_AVX2 |
469 | template<class VE, unsigned N> SIMDPP_INL |
470 | void i_make_const(uint64<4>& v, const expr_vec_make_const<VE,N>& e, unsigned off) |
471 | { |
472 | #if SIMDPP_32_BITS && _MSC_VER |
473 | // MSVC does not support _mm256_set_epi64x in 32-bit mode |
474 | uint64_t v3 = e.val(off+3); |
475 | uint64_t v2 = e.val(off+2); |
476 | uint64_t v1 = e.val(off+1); |
477 | uint64_t v0 = e.val(off+0); |
478 | v = _mm256_set_epi32(v3 >> 32, v3 & 0xffffffff, v2 >> 32, v2 & 0xffffffff, |
479 | v1 >> 32, v1 & 0xffffffff, v0 >> 32, v0 & 0xffffffff); |
480 | #else |
481 | v = _mm256_set_epi64x(e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0)); |
482 | #endif |
483 | } |
484 | #endif |
485 | |
486 | #if SIMDPP_USE_AVX512F |
487 | template<class VE, unsigned N> SIMDPP_INL |
488 | void i_make_const(uint64<8>& v, const expr_vec_make_const<VE,N>& e, unsigned off) |
489 | { |
490 | v = _mm512_set_epi64(e.val(off+7), e.val(off+6), e.val(off+5), e.val(off+4), |
491 | e.val(off+3), e.val(off+2), e.val(off+1), e.val(off+0)); |
492 | } |
493 | #endif |
494 | |
495 | // ----------------------------------------------------------------------------- |
496 | |
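// Generic case: fill each native sub-vector in turn. For example, on SSE2
// a uint32<8> consists of two uint32<4> sub-vectors with base_length == 4:
// vec(0) receives elements off+0..off+3 and vec(1) elements off+4..off+7.
// Combined with the modulo wrap-around of val(), this makes a short
// constant pattern repeat across all sub-vectors.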
497 | template<class V, class VE, unsigned NE> SIMDPP_INL |
498 | void i_make_const(V& v, const expr_vec_make_const<VE,NE>& e, unsigned off) |
499 | { |
500 | for (unsigned i = 0; i < v.vec_length; ++i) { |
501 | i_make_const(v.vec(i), e, off + v.base_length * i); |
502 | } |
503 | } |
504 | |
505 | // ----------------------------------------------------------------------------- |
506 | |
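// The constants are materialized in the unsigned counterpart of V (via
// remove_sign), so signed and unsigned vectors of the same width share one
// set of i_make_const overloads; the result is then converted to V.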
507 | template<class V, class VE, unsigned N> SIMDPP_INL |
508 | V i_make_const_any(const expr_vec_make_const<VE,N>& e) |
509 | { |
510 | typename detail::remove_sign<V>::type r; |
511 | i_make_const(r, e, 0); |
512 | return V(r); |
513 | } |
514 | |
515 | // ----------------------------------------------------------------------------- |
516 | } // namespace insn |
517 | |
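// Entry point invoked when a vector is constructed or assigned from a
// make-const expression.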
518 | template<class V, class VE, unsigned N> SIMDPP_INL |
519 | void construct_eval(V& v, const expr_vec_make_const<VE, N>& e) |
520 | { |
521 | v = insn::i_make_const_any<V>(e); |
522 | } |
523 | |
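// An all-ones vector is a broadcast of a single all-ones 64-bit constant:
// every element ends up with all bits set regardless of its width. The
// value is built in the unsigned counterpart of V and then reinterpreted
// as V.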
524 | template<class V> SIMDPP_INL |
525 | void construct_eval(V& v, const expr_vec_make_ones& e) |
526 | { |
527 | (void) e; |
528 | expr_vec_make_const<uint64_t,1> e2; |
529 | e2.a[0] = (uint64_t)-1; |
530 | typename V::uint_vector_type u; |
531 | insn::i_make_const(u, e2, 0); |
532 | v = u; |
533 | } |
534 | |
535 | } // namespace detail |
536 | } // namespace SIMDPP_ARCH_NAMESPACE |
537 | } // namespace simdpp |
538 | |
539 | #if _MSC_VER |
540 | #pragma warning(pop) |
541 | #endif |
542 | |
#endif