1 | /*************************************************************************** |
2 | * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * |
3 | * Martin Renou * |
4 | * Copyright (c) QuantStack * |
5 | * Copyright (c) Serge Guelton * |
6 | * * |
7 | * Distributed under the terms of the BSD 3-Clause License. * |
8 | * * |
9 | * The full license is in the file LICENSE, distributed with this software. * |
10 | ****************************************************************************/ |
11 | |
12 | #ifndef XSIMD_NEON64_HPP |
13 | #define XSIMD_NEON64_HPP |
14 | |
15 | #include <complex> |
16 | #include <cstddef> |
17 | #include <tuple> |
18 | |
19 | #include "../types/xsimd_neon64_register.hpp" |
20 | #include "../types/xsimd_utils.hpp" |
21 | |
22 | namespace xsimd |
23 | { |
24 | template <class batch_type, bool... Values> |
25 | struct batch_bool_constant; |
26 | |
27 | namespace kernel |
28 | { |
29 | using namespace types; |
30 | |
31 | /******* |
32 | * all * |
33 | *******/ |
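        // A batch_bool lane is either all ones or all zeros, so `all` holds
        // exactly when the across-lanes minimum (vminvq_u32) is still all ones.
        // The 8-, 16- and 64-bit variants reinterpret the mask as uint32x4_t,
        // which preserves the all-ones/all-zeros pattern, and reuse that check.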
34 | |
35 | template <class A, class T, detail::enable_sized_t<T, 4> = 0> |
36 | inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept |
37 | { |
38 | return vminvq_u32(arg) == ~0U; |
39 | } |
40 | |
41 | template <class A, class T, detail::enable_sized_t<T, 1> = 0> |
42 | inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept |
43 | { |
44 | return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u8(arg)), neon64 {}); |
45 | } |
46 | |
47 | template <class A, class T, detail::enable_sized_t<T, 2> = 0> |
48 | inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept |
49 | { |
50 | return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u16(arg)), neon64 {}); |
51 | } |
52 | |
53 | template <class A, class T, detail::enable_sized_t<T, 8> = 0> |
54 | inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept |
55 | { |
56 | return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u64(arg)), neon64 {}); |
57 | } |
58 | |
59 | /******* |
60 | * any * |
61 | *******/ |
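        // Dual of `all`: the across-lanes maximum (vmaxvq_u32) is non-zero
        // exactly when at least one lane of the mask is set.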
62 | |
63 | template <class A, class T, detail::enable_sized_t<T, 4> = 0> |
64 | inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept |
65 | { |
66 | return vmaxvq_u32(arg) != 0; |
67 | } |
68 | |
69 | template <class A, class T, detail::enable_sized_t<T, 1> = 0> |
70 | inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept |
71 | { |
72 | return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u8(arg)), neon64 {}); |
73 | } |
74 | |
75 | template <class A, class T, detail::enable_sized_t<T, 2> = 0> |
76 | inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept |
77 | { |
78 | return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u16(arg)), neon64 {}); |
79 | } |
80 | |
81 | template <class A, class T, detail::enable_sized_t<T, 8> = 0> |
82 | inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept |
83 | { |
84 | return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u64(arg)), neon64 {}); |
85 | } |
86 | |
87 | /************* |
88 | * broadcast * |
89 | *************/ |
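        // The generic overload below simply forwards to the 32-bit NEON kernel;
        // only the double overload is A64-specific. Illustrative effect:
        // broadcasting 1.5 yields the register { 1.5, 1.5 } via vdupq_n_f64.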
90 | |
91 | // Required to avoid ambiguous call |
92 | template <class A, class T> |
93 | inline batch<T, A> broadcast(T val, requires_arch<neon64>) noexcept |
94 | { |
95 | return broadcast<neon64>(val, neon {}); |
96 | } |
97 | |
98 | template <class A> |
99 | inline batch<double, A> broadcast(double val, requires_arch<neon64>) noexcept |
100 | { |
            return vdupq_n_f64(val);
102 | } |
103 | |
104 | /******* |
105 | * set * |
106 | *******/ |
107 | |
108 | template <class A> |
109 | inline batch<double, A> set(batch<double, A> const&, requires_arch<neon64>, double d0, double d1) noexcept |
110 | { |
111 | return float64x2_t { d0, d1 }; |
112 | } |
113 | |
114 | template <class A> |
115 | inline batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<neon64>, bool b0, bool b1) noexcept |
116 | { |
117 | using register_type = typename batch_bool<double, A>::register_type; |
118 | using unsigned_type = as_unsigned_integer_t<double>; |
119 | return register_type { static_cast<unsigned_type>(b0 ? -1LL : 0LL), |
120 | static_cast<unsigned_type>(b1 ? -1LL : 0LL) }; |
121 | } |
122 | |
123 | /************* |
124 | * from_bool * |
125 | *************/ |
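        // The mask is ANDed with the bit pattern of 1.0, so true lanes keep the
        // representation of 1.0 and false lanes become +0.0 once reinterpreted
        // back to double.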
126 | |
127 | template <class A> |
128 | inline batch<double, A> from_bool(batch_bool<double, A> const& arg, requires_arch<neon64>) noexcept |
129 | { |
            return vreinterpretq_f64_u64(vandq_u64(arg, vreinterpretq_u64_f64(vdupq_n_f64(1.))));
131 | } |
132 | |
133 | /******** |
134 | * load * |
135 | ********/ |
136 | |
137 | template <class A> |
138 | inline batch<double, A> load_aligned(double const* src, convert<double>, requires_arch<neon64>) noexcept |
139 | { |
140 | return vld1q_f64(src); |
141 | } |
142 | |
143 | template <class A> |
144 | inline batch<double, A> load_unaligned(double const* src, convert<double>, requires_arch<neon64>) noexcept |
145 | { |
146 | return load_aligned<A>(src, convert<double>(), A {}); |
147 | } |
148 | |
149 | /********* |
150 | * store * |
151 | *********/ |
152 | |
153 | template <class A> |
154 | inline void store_aligned(double* dst, batch<double, A> const& src, requires_arch<neon64>) noexcept |
155 | { |
156 | vst1q_f64(dst, src); |
157 | } |
158 | |
159 | template <class A> |
160 | inline void store_unaligned(double* dst, batch<double, A> const& src, requires_arch<neon64>) noexcept |
161 | { |
162 | return store_aligned<A>(dst, src, A {}); |
163 | } |
164 | |
165 | /**************** |
166 | * load_complex * |
167 | ****************/ |
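        // vld2q_f64 loads two std::complex<double> (four doubles) and
        // de-interleaves them: even elements (the real parts) end up in val[0],
        // odd elements (the imaginary parts) in val[1].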
168 | |
169 | template <class A> |
170 | inline batch<std::complex<double>, A> load_complex_aligned(std::complex<double> const* mem, convert<std::complex<double>>, requires_arch<neon64>) noexcept |
171 | { |
172 | using real_batch = batch<double, A>; |
173 | const double* buf = reinterpret_cast<const double*>(mem); |
174 | float64x2x2_t tmp = vld2q_f64(buf); |
175 | real_batch real = tmp.val[0], |
176 | imag = tmp.val[1]; |
177 | return batch<std::complex<double>, A> { real, imag }; |
178 | } |
179 | |
180 | template <class A> |
181 | inline batch<std::complex<double>, A> load_complex_unaligned(std::complex<double> const* mem, convert<std::complex<double>> cvt, requires_arch<neon64>) noexcept |
182 | { |
183 | return load_complex_aligned<A>(mem, cvt, A {}); |
184 | } |
185 | |
186 | /***************** |
187 | * store_complex * |
188 | *****************/ |
189 | |
190 | template <class A> |
191 | inline void store_complex_aligned(std::complex<double>* dst, batch<std::complex<double>, A> const& src, requires_arch<neon64>) noexcept |
192 | { |
193 | float64x2x2_t tmp; |
194 | tmp.val[0] = src.real(); |
195 | tmp.val[1] = src.imag(); |
196 | double* buf = reinterpret_cast<double*>(dst); |
197 | vst2q_f64(buf, tmp); |
198 | } |
199 | |
200 | template <class A> |
201 | inline void store_complex_unaligned(std::complex<double>* dst, batch<std::complex<double>, A> const& src, requires_arch<neon64>) noexcept |
202 | { |
203 | store_complex_aligned(dst, src, A {}); |
204 | } |
205 | |
206 | /******* |
207 | * neg * |
208 | *******/ |
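        // Unsigned 64-bit negation reinterprets to signed, negates (two's
        // complement, i.e. modulo 2^64) and reinterprets back; signed 64-bit and
        // double lanes map directly to vnegq.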
209 | |
210 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
211 | inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
212 | { |
213 | return vreinterpretq_u64_s64(vnegq_s64(vreinterpretq_s64_u64(rhs))); |
214 | } |
215 | |
216 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
217 | inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
218 | { |
219 | return vnegq_s64(rhs); |
220 | } |
221 | |
222 | template <class A> |
223 | inline batch<double, A> neg(batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
224 | { |
225 | return vnegq_f64(rhs); |
226 | } |
227 | |
228 | /******* |
229 | * add * |
230 | *******/ |
231 | |
232 | template <class A> |
233 | inline batch<double, A> add(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
234 | { |
235 | return vaddq_f64(lhs, rhs); |
236 | } |
237 | |
238 | /******** |
239 | * sadd * |
240 | ********/ |
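        // Saturated addition of IEEE doubles cannot wrap (overflow goes to
        // +/-infinity), so it reduces to ordinary addition. The same reasoning
        // applies to ssub below.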
241 | |
242 | template <class A> |
243 | inline batch<double, A> sadd(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
244 | { |
245 | return add(lhs, rhs, neon64 {}); |
246 | } |
247 | |
248 | /******* |
249 | * sub * |
250 | *******/ |
251 | |
252 | template <class A> |
253 | inline batch<double, A> sub(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
254 | { |
255 | return vsubq_f64(lhs, rhs); |
256 | } |
257 | |
258 | /******** |
259 | * ssub * |
260 | ********/ |
261 | |
262 | template <class A> |
263 | inline batch<double, A> ssub(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
264 | { |
265 | return sub(lhs, rhs, neon64 {}); |
266 | } |
267 | |
268 | /******* |
269 | * mul * |
270 | *******/ |
271 | |
272 | template <class A> |
273 | inline batch<double, A> mul(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
274 | { |
275 | return vmulq_f64(lhs, rhs); |
276 | } |
277 | |
278 | /******* |
279 | * div * |
280 | *******/ |
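        // When XSIMD_FAST_INTEGER_DIVISION is defined, 64-bit integer division
        // goes through a double round-trip. This is fast but can be inexact once
        // values no longer fit in the 53-bit mantissa of a double, which is why
        // it is opt-in.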
281 | |
282 | #if defined(XSIMD_FAST_INTEGER_DIVISION) |
283 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
284 | inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
285 | { |
286 | return vcvtq_u64_f64(vcvtq_f64_u64(lhs) / vcvtq_f64_u64(rhs)); |
287 | } |
288 | |
289 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
290 | inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
291 | { |
292 | return vcvtq_s64_f64(vcvtq_f64_s64(lhs) / vcvtq_f64_s64(rhs)); |
293 | } |
294 | #endif |
295 | template <class A> |
296 | inline batch<double, A> div(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
297 | { |
298 | return vdivq_f64(lhs, rhs); |
299 | } |
300 | |
301 | /****** |
302 | * eq * |
303 | ******/ |
304 | |
305 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
306 | inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
307 | { |
308 | return vceqq_u64(lhs, rhs); |
309 | } |
310 | |
311 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
312 | inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
313 | { |
314 | return vceqq_s64(lhs, rhs); |
315 | } |
316 | |
317 | template <class A> |
318 | inline batch_bool<double, A> eq(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
319 | { |
320 | return vceqq_f64(lhs, rhs); |
321 | } |
322 | |
323 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
324 | inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon64>) noexcept |
325 | { |
326 | return vceqq_u64(lhs, rhs); |
327 | } |
328 | |
329 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
330 | inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon64>) noexcept |
331 | { |
332 | return vceqq_u64(lhs, rhs); |
333 | } |
334 | |
335 | template <class A> |
336 | inline batch_bool<double, A> eq(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept |
337 | { |
338 | return vceqq_u64(lhs, rhs); |
339 | } |
340 | |
341 | /************* |
342 | * fast_cast * |
343 | *************/ |
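        // A64 provides direct conversions between 64-bit integer and double
        // lanes (scvtf/ucvtf and fcvtzs/fcvtzu), which 32-bit NEON lacks.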
344 | namespace detail |
345 | { |
346 | template <class A> |
347 | inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<neon64>) noexcept |
348 | { |
349 | return vcvtq_f64_s64(x); |
350 | } |
351 | |
352 | template <class A> |
353 | inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<neon64>) noexcept |
354 | { |
355 | return vcvtq_f64_u64(x); |
356 | } |
357 | |
358 | template <class A> |
359 | inline batch<int64_t, A> fast_cast(batch<double, A> const& x, batch<int64_t, A> const&, requires_arch<neon64>) noexcept |
360 | { |
361 | return vcvtq_s64_f64(x); |
362 | } |
363 | |
364 | template <class A> |
365 | inline batch<uint64_t, A> fast_cast(batch<double, A> const& x, batch<uint64_t, A> const&, requires_arch<neon64>) noexcept |
366 | { |
367 | return vcvtq_u64_f64(x); |
368 | } |
369 | |
370 | } |
371 | |
372 | /****** |
373 | * lt * |
374 | ******/ |
375 | |
376 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
377 | inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
378 | { |
379 | return vcltq_u64(lhs, rhs); |
380 | } |
381 | |
382 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
383 | inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
384 | { |
385 | return vcltq_s64(lhs, rhs); |
386 | } |
387 | |
388 | template <class A> |
389 | inline batch_bool<double, A> lt(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
390 | { |
391 | return vcltq_f64(lhs, rhs); |
392 | } |
393 | |
394 | /****** |
395 | * le * |
396 | ******/ |
397 | |
398 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
399 | inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
400 | { |
401 | return vcleq_u64(lhs, rhs); |
402 | } |
403 | |
404 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
405 | inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
406 | { |
407 | return vcleq_s64(lhs, rhs); |
408 | } |
409 | |
410 | template <class A> |
411 | inline batch_bool<double, A> le(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
412 | { |
413 | return vcleq_f64(lhs, rhs); |
414 | } |
415 | |
416 | /****** |
417 | * gt * |
418 | ******/ |
419 | |
420 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
421 | inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
422 | { |
423 | return vcgtq_u64(lhs, rhs); |
424 | } |
425 | |
426 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
427 | inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
428 | { |
429 | return vcgtq_s64(lhs, rhs); |
430 | } |
431 | |
432 | template <class A> |
433 | inline batch_bool<double, A> gt(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
434 | { |
435 | return vcgtq_f64(lhs, rhs); |
436 | } |
437 | |
438 | /****** |
439 | * ge * |
440 | ******/ |
441 | |
442 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
443 | inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
444 | { |
445 | return vcgeq_u64(lhs, rhs); |
446 | } |
447 | |
448 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
449 | inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
450 | { |
451 | return vcgeq_s64(lhs, rhs); |
452 | } |
453 | |
454 | template <class A> |
455 | inline batch_bool<double, A> ge(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
456 | { |
457 | return vcgeq_f64(lhs, rhs); |
458 | } |
459 | |
460 | /******************* |
461 | * batch_bool_cast * |
462 | *******************/ |
463 | |
464 | template <class A, class T_out, class T_in> |
465 | inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon64>) noexcept |
466 | { |
467 | using register_type = typename batch_bool<T_out, A>::register_type; |
468 | return register_type(self); |
469 | } |
470 | |
471 | /*************** |
472 | * bitwise_and * |
473 | ***************/ |
474 | |
475 | template <class A> |
476 | inline batch<double, A> bitwise_and(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
477 | { |
478 | return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(lhs), |
479 | vreinterpretq_u64_f64(rhs))); |
480 | } |
481 | |
482 | template <class A> |
483 | inline batch_bool<double, A> bitwise_and(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept |
484 | { |
485 | return vandq_u64(lhs, rhs); |
486 | } |
487 | |
488 | /************** |
489 | * bitwise_or * |
490 | **************/ |
491 | |
492 | template <class A> |
493 | inline batch<double, A> bitwise_or(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
494 | { |
495 | return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(lhs), |
496 | vreinterpretq_u64_f64(rhs))); |
497 | } |
498 | |
499 | template <class A> |
500 | inline batch_bool<double, A> bitwise_or(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept |
501 | { |
502 | return vorrq_u64(lhs, rhs); |
503 | } |
504 | |
505 | /*************** |
506 | * bitwise_xor * |
507 | ***************/ |
508 | |
509 | template <class A> |
510 | inline batch<double, A> bitwise_xor(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
511 | { |
512 | return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(lhs), |
513 | vreinterpretq_u64_f64(rhs))); |
514 | } |
515 | |
516 | template <class A> |
517 | inline batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept |
518 | { |
519 | return veorq_u64(lhs, rhs); |
520 | } |
521 | |
522 | /******* |
523 | * neq * |
524 | *******/ |
525 | |
526 | template <class A> |
527 | inline batch_bool<double, A> neq(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept |
528 | { |
529 | return bitwise_xor(lhs, rhs, A {}); |
530 | } |
531 | |
532 | /*************** |
533 | * bitwise_not * |
534 | ***************/ |
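        // There is no vmvnq_u64, so the 64-bit NOT is performed on the
        // uint32x4_t view of the register.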
535 | |
536 | template <class A> |
537 | inline batch<double, A> bitwise_not(batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
538 | { |
539 | return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_f64(rhs))); |
540 | } |
541 | |
542 | template <class A> |
543 | inline batch_bool<double, A> bitwise_not(batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept |
544 | { |
            return detail::bitwise_not_u64(rhs);
546 | } |
547 | |
548 | /****************** |
549 | * bitwise_andnot * |
550 | ******************/ |
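        // vbicq_u64(a, b) computes a & ~b, i.e. lhs with the bits of rhs cleared.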
551 | |
552 | template <class A> |
553 | inline batch<double, A> bitwise_andnot(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
554 | { |
555 | return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(lhs), |
556 | vreinterpretq_u64_f64(rhs))); |
557 | } |
558 | |
559 | template <class A> |
560 | inline batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept |
561 | { |
562 | return vbicq_u64(lhs, rhs); |
563 | } |
564 | |
565 | /******* |
566 | * min * |
567 | *******/ |
568 | |
569 | template <class A> |
570 | inline batch<double, A> min(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
571 | { |
572 | return vminq_f64(lhs, rhs); |
573 | } |
574 | |
575 | /******* |
576 | * max * |
577 | *******/ |
578 | |
579 | template <class A> |
580 | inline batch<double, A> max(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
581 | { |
582 | return vmaxq_f64(lhs, rhs); |
583 | } |
584 | |
585 | /******* |
586 | * abs * |
587 | *******/ |
588 | |
589 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
590 | inline batch<T, A> abs(batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
591 | { |
592 | return rhs; |
593 | } |
594 | |
595 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
596 | inline batch<T, A> abs(batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
597 | { |
598 | return vabsq_s64(rhs); |
599 | } |
600 | |
601 | template <class A> |
602 | inline batch<double, A> abs(batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
603 | { |
604 | return vabsq_f64(rhs); |
605 | } |
606 | |
607 | template <class A> |
608 | inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self, |
609 | requires_arch<neon64>) noexcept |
610 | { |
611 | return vcvtnq_s32_f32(self); |
612 | } |
613 | |
614 | #if !defined(__GNUC__) |
615 | template <class A> |
616 | inline batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self, |
617 | requires_arch<neon64>) noexcept |
618 | { |
619 | return vcvtnq_s64_f64(self); |
620 | } |
621 | #endif |
622 | |
623 | /************** |
624 | * reciprocal * |
625 | **************/ |
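        // vrecpeq_f64 returns only a low-precision estimate; callers needing a
        // full-precision reciprocal are expected to refine it (e.g. with
        // vrecpsq_f64 Newton-Raphson steps) or simply divide.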
626 | |
627 | template <class A> |
628 | inline batch<double, A> |
629 | reciprocal(const batch<double, A>& x, |
630 | kernel::requires_arch<neon64>) noexcept |
631 | { |
632 | return vrecpeq_f64(x); |
633 | } |
634 | |
635 | /******** |
636 | * rsqrt * |
637 | ********/ |
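        // Likewise an estimate only: vrsqrteq_f64 gives an approximate 1/sqrt(x)
        // that can be refined with vrsqrtsq_f64 iterations if needed.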
638 | |
639 | template <class A> |
640 | inline batch<double, A> rsqrt(batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
641 | { |
642 | return vrsqrteq_f64(rhs); |
643 | } |
644 | |
645 | /******** |
646 | * sqrt * |
647 | ********/ |
648 | |
649 | template <class A> |
650 | inline batch<double, A> sqrt(batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
651 | { |
652 | return vsqrtq_f64(rhs); |
653 | } |
654 | |
655 | /******************** |
656 | * Fused operations * |
657 | ********************/ |
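        // vfmaq_f64(acc, x, y) computes acc + x * y, so fma maps directly and
        // fms(x, y, z) = x * y - z is expressed as fma with a negated accumulator.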
658 | |
659 | #ifdef __ARM_FEATURE_FMA |
660 | template <class A> |
661 | inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<neon64>) noexcept |
662 | { |
663 | return vfmaq_f64(z, x, y); |
664 | } |
665 | |
666 | template <class A> |
667 | inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<neon64>) noexcept |
668 | { |
669 | return vfmaq_f64(-z, x, y); |
670 | } |
671 | #endif |
672 | |
673 | /********* |
674 | * haddp * |
675 | *********/ |
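        // vpaddq_f64 adds adjacent pairs: the result is
        // { row[0][0] + row[0][1], row[1][0] + row[1][1] }.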
676 | |
677 | template <class A> |
678 | inline batch<double, A> haddp(const batch<double, A>* row, requires_arch<neon64>) noexcept |
679 | { |
680 | return vpaddq_f64(row[0], row[1]); |
681 | } |
682 | |
683 | /********** |
684 | * insert * |
685 | **********/ |
686 | |
687 | template <class A, size_t I> |
688 | inline batch<double, A> insert(batch<double, A> const& self, double val, index<I>, requires_arch<neon64>) noexcept |
689 | { |
690 | return vsetq_lane_f64(val, self, I); |
691 | } |
692 | |
693 | /****************** |
694 | * reducer macros * |
695 | ******************/ |
696 | |
697 | // Wrap reducer intrinsics so we can pass them as function pointers |
        // - OP: intrinsics name prefix, e.g., vaddvq
699 | |
700 | #define WRAP_REDUCER_INT_EXCLUDING_64(OP) \ |
701 | namespace wrap \ |
702 | { \ |
703 | inline uint8_t OP##_u8(uint8x16_t a) noexcept \ |
704 | { \ |
705 | return ::OP##_u8(a); \ |
706 | } \ |
707 | inline int8_t OP##_s8(int8x16_t a) noexcept \ |
708 | { \ |
709 | return ::OP##_s8(a); \ |
710 | } \ |
711 | inline uint16_t OP##_u16(uint16x8_t a) noexcept \ |
712 | { \ |
713 | return ::OP##_u16(a); \ |
714 | } \ |
715 | inline int16_t OP##_s16(int16x8_t a) noexcept \ |
716 | { \ |
717 | return ::OP##_s16(a); \ |
718 | } \ |
719 | inline uint32_t OP##_u32(uint32x4_t a) noexcept \ |
720 | { \ |
721 | return ::OP##_u32(a); \ |
722 | } \ |
723 | inline int32_t OP##_s32(int32x4_t a) noexcept \ |
724 | { \ |
725 | return ::OP##_s32(a); \ |
726 | } \ |
727 | } |
728 | |
729 | #define WRAP_REDUCER_INT(OP) \ |
730 | WRAP_REDUCER_INT_EXCLUDING_64(OP) \ |
731 | namespace wrap \ |
732 | { \ |
733 | inline uint64_t OP##_u64(uint64x2_t a) noexcept \ |
734 | { \ |
735 | return ::OP##_u64(a); \ |
736 | } \ |
737 | inline int64_t OP##_s64(int64x2_t a) noexcept \ |
738 | { \ |
739 | return ::OP##_s64(a); \ |
740 | } \ |
741 | } |
742 | |
743 | #define WRAP_REDUCER_FLOAT(OP) \ |
744 | namespace wrap \ |
745 | { \ |
746 | inline float OP##_f32(float32x4_t a) noexcept \ |
747 | { \ |
748 | return ::OP##_f32(a); \ |
749 | } \ |
750 | inline double OP##_f64(float64x2_t a) noexcept \ |
751 | { \ |
752 | return ::OP##_f64(a); \ |
753 | } \ |
754 | } |
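        // For instance, WRAP_REDUCER_FLOAT(vaddvq) defines wrap::vaddvq_f32 and
        // wrap::vaddvq_f64, each forwarding to the corresponding intrinsic; the
        // wrappers give the intrinsics an addressable function that can be
        // stored in the dispatch tuples below.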
755 | |
756 | namespace detail |
757 | { |
758 | template <class R> |
759 | struct reducer_return_type_impl; |
760 | |
761 | template <> |
762 | struct reducer_return_type_impl<uint8x16_t> |
763 | { |
764 | using type = uint8_t; |
765 | }; |
766 | |
767 | template <> |
768 | struct reducer_return_type_impl<int8x16_t> |
769 | { |
770 | using type = int8_t; |
771 | }; |
772 | |
773 | template <> |
774 | struct reducer_return_type_impl<uint16x8_t> |
775 | { |
776 | using type = uint16_t; |
777 | }; |
778 | |
779 | template <> |
780 | struct reducer_return_type_impl<int16x8_t> |
781 | { |
782 | using type = int16_t; |
783 | }; |
784 | |
785 | template <> |
786 | struct reducer_return_type_impl<uint32x4_t> |
787 | { |
788 | using type = uint32_t; |
789 | }; |
790 | |
791 | template <> |
792 | struct reducer_return_type_impl<int32x4_t> |
793 | { |
794 | using type = int32_t; |
795 | }; |
796 | |
797 | template <> |
798 | struct reducer_return_type_impl<uint64x2_t> |
799 | { |
800 | using type = uint64_t; |
801 | }; |
802 | |
803 | template <> |
804 | struct reducer_return_type_impl<int64x2_t> |
805 | { |
806 | using type = int64_t; |
807 | }; |
808 | |
809 | template <> |
810 | struct reducer_return_type_impl<float32x4_t> |
811 | { |
812 | using type = float; |
813 | }; |
814 | |
815 | template <> |
816 | struct reducer_return_type_impl<float64x2_t> |
817 | { |
818 | using type = double; |
819 | }; |
820 | |
821 | template <class R> |
822 | using reducer_return_type = typename reducer_return_type_impl<R>::type; |
823 | |
824 | template <class... T> |
825 | struct neon_reducer_dispatcher_impl : neon_dispatcher_base<reducer_return_type, T...> |
826 | { |
827 | }; |
828 | |
829 | using neon_reducer_dispatcher = neon_reducer_dispatcher_impl<uint8x16_t, int8x16_t, |
830 | uint16x8_t, int16x8_t, |
831 | uint32x4_t, int32x4_t, |
832 | uint64x2_t, int64x2_t, |
833 | float32x4_t, float64x2_t>; |
834 | template <class T> |
835 | using enable_neon64_type_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value, |
836 | int>::type; |
837 | } |
838 | |
839 | /************** |
840 | * reduce_add * |
841 | **************/ |
842 | |
843 | WRAP_REDUCER_INT(vaddvq) |
844 | WRAP_REDUCER_FLOAT(vaddvq) |
845 | |
846 | template <class A, class T, detail::enable_neon64_type_t<T> = 0> |
847 | inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon64>) noexcept |
848 | { |
849 | using register_type = typename batch<T, A>::register_type; |
850 | const detail::neon_reducer_dispatcher::unary dispatcher = { |
                std::make_tuple(wrap::vaddvq_u8, wrap::vaddvq_s8, wrap::vaddvq_u16, wrap::vaddvq_s16,
                                wrap::vaddvq_u32, wrap::vaddvq_s32, wrap::vaddvq_u64, wrap::vaddvq_s64,
                                wrap::vaddvq_f32, wrap::vaddvq_f64)
854 | }; |
855 | return dispatcher.apply(register_type(arg)); |
856 | } |
857 | |
858 | /************** |
859 | * reduce_max * |
860 | **************/ |
861 | |
862 | WRAP_REDUCER_INT_EXCLUDING_64(vmaxvq) |
863 | WRAP_REDUCER_FLOAT(vmaxvq) |
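        // There is no across-lanes max/min intrinsic for 64-bit lanes, so the
        // wrappers below emulate vmaxvq_u64/vmaxvq_s64 (and vminvq_* further
        // down) with a scalar std::max/std::min of the two extracted lanes.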
864 | |
865 | namespace wrap |
866 | { |
867 | inline uint64_t vmaxvq_u64(uint64x2_t a) noexcept |
868 | { |
869 | return std::max(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1)); |
870 | } |
871 | |
872 | inline int64_t vmaxvq_s64(int64x2_t a) noexcept |
873 | { |
874 | return std::max(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1)); |
875 | } |
876 | } |
877 | |
878 | template <class A, class T, detail::enable_neon64_type_t<T> = 0> |
879 | inline typename batch<T, A>::value_type reduce_max(batch<T, A> const& arg, requires_arch<neon64>) noexcept |
880 | { |
881 | using register_type = typename batch<T, A>::register_type; |
882 | const detail::neon_reducer_dispatcher::unary dispatcher = { |
                std::make_tuple(wrap::vmaxvq_u8, wrap::vmaxvq_s8, wrap::vmaxvq_u16, wrap::vmaxvq_s16,
                                wrap::vmaxvq_u32, wrap::vmaxvq_s32, wrap::vmaxvq_u64, wrap::vmaxvq_s64,
                                wrap::vmaxvq_f32, wrap::vmaxvq_f64)
886 | }; |
887 | return dispatcher.apply(register_type(arg)); |
888 | } |
889 | |
890 | /************** |
891 | * reduce_min * |
892 | **************/ |
893 | |
894 | WRAP_REDUCER_INT_EXCLUDING_64(vminvq) |
895 | WRAP_REDUCER_FLOAT(vminvq) |
896 | |
897 | namespace wrap |
898 | { |
899 | inline uint64_t vminvq_u64(uint64x2_t a) noexcept |
900 | { |
901 | return std::min(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1)); |
902 | } |
903 | |
904 | inline int64_t vminvq_s64(int64x2_t a) noexcept |
905 | { |
906 | return std::min(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1)); |
907 | } |
908 | } |
909 | |
910 | template <class A, class T, detail::enable_neon64_type_t<T> = 0> |
911 | inline typename batch<T, A>::value_type reduce_min(batch<T, A> const& arg, requires_arch<neon64>) noexcept |
912 | { |
913 | using register_type = typename batch<T, A>::register_type; |
914 | const detail::neon_reducer_dispatcher::unary dispatcher = { |
                std::make_tuple(wrap::vminvq_u8, wrap::vminvq_s8, wrap::vminvq_u16, wrap::vminvq_s16,
                                wrap::vminvq_u32, wrap::vminvq_s32, wrap::vminvq_u64, wrap::vminvq_s64,
                                wrap::vminvq_f32, wrap::vminvq_f64)
918 | }; |
919 | return dispatcher.apply(register_type(arg)); |
920 | } |
921 | |
922 | #undef WRAP_REDUCER_INT_EXCLUDING_64 |
923 | #undef WRAP_REDUCER_INT |
924 | #undef WRAP_REDUCER_FLOAT |
925 | |
926 | /********** |
927 | * select * |
928 | **********/ |
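        // vbslq_f64 is a bitwise select: result bits come from a where the
        // condition mask bits are set and from b elsewhere. Since batch_bool
        // lanes are all ones or all zeros, this selects whole lanes.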
929 | |
930 | template <class A> |
931 | inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& a, batch<double, A> const& b, requires_arch<neon64>) noexcept |
932 | { |
933 | return vbslq_f64(cond, a, b); |
934 | } |
935 | |
936 | template <class A, bool... b> |
937 | inline batch<double, A> select(batch_bool_constant<batch<double, A>, b...> const&, |
938 | batch<double, A> const& true_br, |
939 | batch<double, A> const& false_br, |
940 | requires_arch<neon64>) noexcept |
941 | { |
942 | return select(batch_bool<double, A> { b... }, true_br, false_br, neon64 {}); |
943 | } |
944 | /********** |
945 | * zip_lo * |
946 | **********/ |
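        // With two-lane vectors, zip_lo interleaves the low halves and returns
        // { lhs[0], rhs[0] }; zip_hi below returns { lhs[1], rhs[1] }.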
947 | |
948 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
949 | inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
950 | { |
951 | return vzip1q_u64(lhs, rhs); |
952 | } |
953 | |
954 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
955 | inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
956 | { |
957 | return vzip1q_s64(lhs, rhs); |
958 | } |
959 | |
960 | template <class A> |
961 | inline batch<double, A> zip_lo(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
962 | { |
963 | return vzip1q_f64(lhs, rhs); |
964 | } |
965 | |
966 | /********** |
967 | * zip_hi * |
968 | **********/ |
969 | |
970 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
971 | inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
972 | { |
973 | return vzip2q_u64(lhs, rhs); |
974 | } |
975 | |
976 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
977 | inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
978 | { |
979 | return vzip2q_s64(lhs, rhs); |
980 | } |
981 | |
982 | template <class A> |
983 | inline batch<double, A> zip_hi(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
984 | { |
985 | return vzip2q_f64(lhs, rhs); |
986 | } |
987 | |
988 | /**************** |
989 | * extract_pair * |
990 | ****************/ |
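        // extract_pair(lhs, rhs, n) returns vextq_f64(rhs, lhs, n), i.e. the
        // two-lane window starting at lane n of the concatenation of rhs and
        // lhs. vextq requires an immediate, so the helper below matches the
        // runtime n against each compile-time index of an index_sequence.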
991 | |
992 | namespace detail |
993 | { |
994 | template <class A, size_t I, size_t... Is> |
            inline batch<double, A> extract_pair(batch<double, A> const& lhs, batch<double, A> const& rhs, std::size_t n,
996 | ::xsimd::detail::index_sequence<I, Is...>) noexcept |
997 | { |
998 | if (n == I) |
999 | { |
1000 | return vextq_f64(rhs, lhs, I); |
1001 | } |
1002 | else |
1003 | { |
1004 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1005 | } |
1006 | } |
1007 | } |
1008 | |
1009 | template <class A> |
        inline batch<double, A> extract_pair(batch<double, A> const& lhs, batch<double, A> const& rhs, std::size_t n, requires_arch<neon64>) noexcept
1011 | { |
1012 | constexpr std::size_t size = batch<double, A>::size; |
            assert(n < size && "index in bounds");
1014 | return detail::extract_pair(lhs, rhs, n, ::xsimd::detail::make_index_sequence<size>()); |
1015 | } |
1016 | |
1017 | /****************** |
1018 | * bitwise_rshift * |
1019 | ******************/ |
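        // NEON has no variable right-shift instruction; shifting left (vshlq) by
        // a negated count performs the right shift, logical for unsigned lanes
        // and arithmetic for signed ones.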
1020 | |
1021 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
1022 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon64>) noexcept |
1023 | { |
1024 | return bitwise_rshift<A>(lhs, n, neon {}); |
1025 | } |
1026 | |
1027 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
1028 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon64>) noexcept |
1029 | { |
1030 | return vshlq_u64(lhs, vnegq_s64(rhs)); |
1031 | } |
1032 | |
1033 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
1034 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon64>) noexcept |
1035 | { |
1036 | return bitwise_rshift<A>(lhs, n, neon {}); |
1037 | } |
1038 | |
1039 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
1040 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
1041 | { |
1042 | return vshlq_s64(lhs, vnegq_s64(rhs)); |
1043 | } |
1044 | |
1045 | /**************** |
1046 | * bitwise_cast * |
1047 | ****************/ |
1048 | |
1049 | #define WRAP_CAST(SUFFIX, TYPE) \ |
1050 | namespace wrap \ |
1051 | { \ |
1052 | inline float64x2_t vreinterpretq_f64_##SUFFIX(TYPE a) noexcept \ |
1053 | { \ |
1054 | return ::vreinterpretq_f64_##SUFFIX(a); \ |
1055 | } \ |
1056 | inline TYPE vreinterpretq_##SUFFIX##_f64(float64x2_t a) noexcept \ |
1057 | { \ |
1058 | return ::vreinterpretq_##SUFFIX##_f64(a); \ |
1059 | } \ |
1060 | } |
1061 | |
1062 | WRAP_CAST(u8, uint8x16_t) |
1063 | WRAP_CAST(s8, int8x16_t) |
1064 | WRAP_CAST(u16, uint16x8_t) |
1065 | WRAP_CAST(s16, int16x8_t) |
1066 | WRAP_CAST(u32, uint32x4_t) |
1067 | WRAP_CAST(s32, int32x4_t) |
1068 | WRAP_CAST(u64, uint64x2_t) |
1069 | WRAP_CAST(s64, int64x2_t) |
1070 | WRAP_CAST(f32, float32x4_t) |
1071 | |
1072 | #undef WRAP_CAST |
1073 | |
1074 | template <class A, class T> |
1075 | inline batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept |
1076 | { |
1077 | using caster_type = detail::bitwise_caster_impl<float64x2_t, |
1078 | uint8x16_t, int8x16_t, |
1079 | uint16x8_t, int16x8_t, |
1080 | uint32x4_t, int32x4_t, |
1081 | uint64x2_t, int64x2_t, |
1082 | float32x4_t>; |
1083 | const caster_type caster = { |
                std::make_tuple(wrap::vreinterpretq_f64_u8, wrap::vreinterpretq_f64_s8, wrap::vreinterpretq_f64_u16, wrap::vreinterpretq_f64_s16,
                                wrap::vreinterpretq_f64_u32, wrap::vreinterpretq_f64_s32, wrap::vreinterpretq_f64_u64, wrap::vreinterpretq_f64_s64,
                                wrap::vreinterpretq_f64_f32)
1087 | }; |
1088 | using register_type = typename batch<T, A>::register_type; |
1089 | return caster.apply(register_type(arg)); |
1090 | } |
1091 | |
1092 | namespace detail |
1093 | { |
1094 | template <class S, class... R> |
1095 | struct bitwise_caster_neon64 |
1096 | { |
1097 | using container_type = std::tuple<R (*)(S)...>; |
1098 | container_type m_func; |
1099 | |
1100 | template <class V> |
1101 | V apply(float64x2_t rhs) const |
1102 | { |
1103 | using func_type = V (*)(float64x2_t); |
1104 | auto func = xsimd::detail::get<func_type>(m_func); |
1105 | return func(rhs); |
1106 | } |
1107 | }; |
1108 | } |
1109 | |
1110 | template <class A, class R> |
1111 | inline batch<R, A> bitwise_cast(batch<double, A> const& arg, batch<R, A> const&, requires_arch<neon64>) noexcept |
1112 | { |
1113 | using caster_type = detail::bitwise_caster_neon64<float64x2_t, |
1114 | uint8x16_t, int8x16_t, |
1115 | uint16x8_t, int16x8_t, |
1116 | uint32x4_t, int32x4_t, |
1117 | uint64x2_t, int64x2_t, |
1118 | float32x4_t>; |
1119 | const caster_type caster = { |
                std::make_tuple(wrap::vreinterpretq_u8_f64, wrap::vreinterpretq_s8_f64, wrap::vreinterpretq_u16_f64, wrap::vreinterpretq_s16_f64,
                                wrap::vreinterpretq_u32_f64, wrap::vreinterpretq_s32_f64, wrap::vreinterpretq_u64_f64, wrap::vreinterpretq_s64_f64,
                                wrap::vreinterpretq_f32_f64)
1123 | }; |
1124 | using src_register_type = typename batch<double, A>::register_type; |
1125 | using dst_register_type = typename batch<R, A>::register_type; |
1126 | return caster.apply<dst_register_type>(src_register_type(arg)); |
1127 | } |
1128 | |
1129 | template <class A> |
1130 | inline batch<double, A> bitwise_cast(batch<double, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept |
1131 | { |
1132 | return arg; |
1133 | } |
1134 | |
1135 | /********* |
1136 | * isnan * |
1137 | *********/ |
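        // NaN is the only value that compares unequal to itself, so the lanes of
        // !(arg == arg) are set exactly where arg is NaN.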
1138 | |
1139 | template <class A> |
1140 | inline batch_bool<double, A> isnan(batch<double, A> const& arg, requires_arch<neon64>) noexcept |
1141 | { |
1142 | return !(arg == arg); |
1143 | } |
1144 | } |
1145 | |
1146 | template <class batch_type, typename batch_type::value_type... Values> |
1147 | struct batch_constant; |
1148 | |
1149 | namespace kernel |
1150 | { |
1151 | /*********** |
1152 | * swizzle * |
1153 | ***********/ |
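        // All the swizzle overloads below ultimately go through the byte-level
        // table lookup vqtbl1q_u8: burst_index expands each wide-lane index into
        // the indices of its constituent bytes (e.g. a uint16 index j becomes
        // 2*j, 2*j + 1), and the result is reinterpreted back to the original
        // element type.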
1154 | |
1155 | namespace detail |
1156 | { |
1157 | using ::xsimd::batch_constant; |
1158 | using ::xsimd::detail::integer_sequence; |
1159 | using ::xsimd::detail::make_integer_sequence; |
1160 | |
1161 | template <class CB1, class CB2, class IS> |
1162 | struct index_burst_impl; |
1163 | |
1164 | template <class B1, class B2, typename B2::value_type... V, |
1165 | typename B2::value_type... incr> |
1166 | struct index_burst_impl<batch_constant<B1>, batch_constant<B2, V...>, |
1167 | integer_sequence<typename B2::value_type, incr...>> |
1168 | { |
1169 | using type = batch_constant<B2, V...>; |
1170 | }; |
1171 | |
1172 | template <class B1, typename B1::value_type V0, typename B1::value_type... V1, |
1173 | class B2, typename B2::value_type... V2, |
1174 | typename B2::value_type... incr> |
1175 | struct index_burst_impl<batch_constant<B1, V0, V1...>, batch_constant<B2, V2...>, |
1176 | integer_sequence<typename B2::value_type, incr...>> |
1177 | { |
1178 | using value_type = typename B2::value_type; |
1179 | using next_input = batch_constant<B1, V1...>; |
1180 | using next_output = batch_constant<B2, V2..., (V0 + incr)...>; |
1181 | using type = typename index_burst_impl<next_input, next_output, integer_sequence<value_type, incr...>>::type; |
1182 | }; |
1183 | |
1184 | template <class B, class T> |
1185 | struct index_burst; |
1186 | |
1187 | template <class B, typename B::value_type... V, class T> |
1188 | struct index_burst<batch_constant<B, V...>, T> |
1189 | { |
1190 | static constexpr size_t mul = sizeof(typename B::value_type) / sizeof(T); |
1191 | using input = batch_constant<B, (mul * V)...>; |
1192 | using output = batch_constant<batch<T, typename B::arch_type>>; |
1193 | using type = typename index_burst_impl<input, output, make_integer_sequence<T, mul>>::type; |
1194 | }; |
1195 | |
1196 | template <class B, class T> |
1197 | using index_burst_t = typename index_burst<B, T>::type; |
1198 | |
1199 | template <class T, class B> |
1200 | inline index_burst_t<B, T> burst_index(B) |
1201 | { |
1202 | return index_burst_t<B, T>(); |
1203 | } |
1204 | } |
1205 | |
1206 | template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7, |
1207 | uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15> |
1208 | inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, |
1209 | batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx, |
1210 | requires_arch<neon64>) noexcept |
1211 | { |
1212 | return vqtbl1q_u8(self, batch<uint8_t, A>(idx)); |
1213 | } |
1214 | |
1215 | template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7, |
1216 | uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15> |
1217 | inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, |
1218 | batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx, |
1219 | requires_arch<neon64>) noexcept |
1220 | { |
1221 | return vqtbl1q_s8(self, batch<uint8_t, A>(idx)); |
1222 | } |
1223 | |
1224 | template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7> |
1225 | inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, |
1226 | batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> idx, |
1227 | requires_arch<neon64>) noexcept |
1228 | { |
1229 | using batch_type = batch<uint8_t, A>; |
1230 | return vreinterpretq_u16_u8(swizzle<A>(batch_type(vreinterpretq_u8_u16(self)), detail::burst_index<uint8_t>(idx), A())); |
1231 | } |
1232 | |
1233 | template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7> |
1234 | inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, |
1235 | batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> idx, |
1236 | requires_arch<neon64>) noexcept |
1237 | { |
1238 | using batch_type = batch<int8_t, A>; |
1239 | return vreinterpretq_s16_s8(swizzle<A>(batch_type(vreinterpretq_s8_s16(self)), detail::burst_index<uint8_t>(idx), A())); |
1240 | } |
1241 | |
1242 | template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3> |
1243 | inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, |
1244 | batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx, |
1245 | requires_arch<neon64>) noexcept |
1246 | { |
1247 | using batch_type = batch<uint8_t, A>; |
1248 | return vreinterpretq_u32_u8(swizzle<A>(batch_type(vreinterpretq_u8_u32(self)), detail::burst_index<uint8_t>(idx), A())); |
1249 | } |
1250 | |
1251 | template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3> |
1252 | inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, |
1253 | batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx, |
1254 | requires_arch<neon64>) noexcept |
1255 | { |
1256 | using batch_type = batch<int8_t, A>; |
1257 | return vreinterpretq_s32_s8(swizzle<A>(batch_type(vreinterpretq_s8_s32(self)), detail::burst_index<uint8_t>(idx), A())); |
1258 | } |
1259 | |
1260 | template <class A, uint64_t V0, uint64_t V1> |
1261 | inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, |
1262 | batch_constant<batch<uint64_t, A>, V0, V1> idx, |
1263 | requires_arch<neon64>) noexcept |
1264 | { |
1265 | using batch_type = batch<uint8_t, A>; |
1266 | return vreinterpretq_u64_u8(swizzle<A>(batch_type(vreinterpretq_u8_u64(self)), detail::burst_index<uint8_t>(idx), A())); |
1267 | } |
1268 | |
1269 | template <class A, uint64_t V0, uint64_t V1> |
1270 | inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, |
1271 | batch_constant<batch<uint64_t, A>, V0, V1> idx, |
1272 | requires_arch<neon64>) noexcept |
1273 | { |
1274 | using batch_type = batch<int8_t, A>; |
1275 | return vreinterpretq_s64_s8(swizzle<A>(batch_type(vreinterpretq_s8_s64(self)), detail::burst_index<uint8_t>(idx), A())); |
1276 | } |
1277 | |
1278 | template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3> |
1279 | inline batch<float, A> swizzle(batch<float, A> const& self, |
1280 | batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx, |
1281 | requires_arch<neon64>) noexcept |
1282 | { |
1283 | using batch_type = batch<uint8_t, A>; |
1284 | return vreinterpretq_f32_u8(swizzle<A>(batch_type(vreinterpretq_u8_f32(self)), detail::burst_index<uint8_t>(idx), A())); |
1285 | } |
1286 | |
1287 | template <class A, uint64_t V0, uint64_t V1> |
1288 | inline batch<double, A> swizzle(batch<double, A> const& self, |
1289 | batch_constant<batch<uint64_t, A>, V0, V1> idx, |
1290 | requires_arch<neon64>) noexcept |
1291 | { |
1292 | using batch_type = batch<uint8_t, A>; |
1293 | return vreinterpretq_f64_u8(swizzle<A>(batch_type(vreinterpretq_u8_f64(self)), detail::burst_index<uint8_t>(idx), A())); |
1294 | } |
1295 | |
1296 | template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3> |
1297 | inline batch<std::complex<float>, A> swizzle(batch<std::complex<float>, A> const& self, |
1298 | batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx, |
1299 | requires_arch<neon64>) noexcept |
1300 | { |
1301 | return batch<std::complex<float>>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A())); |
1302 | } |
1303 | |
1304 | template <class A, uint64_t V0, uint64_t V1> |
1305 | inline batch<std::complex<double>, A> swizzle(batch<std::complex<double>, A> const& self, |
1306 | batch_constant<batch<uint64_t, A>, V0, V1> idx, |
1307 | requires_arch<neon64>) noexcept |
1308 | { |
1309 | return batch<std::complex<double>>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A())); |
1310 | } |
1311 | } |
1312 | } |
1313 | |
1314 | #endif |
1315 | |