1/***************************************************************************
2 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
3 * Martin Renou *
4 * Copyright (c) QuantStack *
5 * Copyright (c) Serge Guelton *
6 * *
7 * Distributed under the terms of the BSD 3-Clause License. *
8 * *
9 * The full license is in the file LICENSE, distributed with this software. *
10 ****************************************************************************/
11
12#ifndef XSIMD_NEON64_HPP
13#define XSIMD_NEON64_HPP
14
15#include <complex>
16#include <cstddef>
17#include <tuple>
18
19#include "../types/xsimd_neon64_register.hpp"
20#include "../types/xsimd_utils.hpp"
21
22namespace xsimd
23{
24 template <class batch_type, bool... Values>
25 struct batch_bool_constant;
26
27 namespace kernel
28 {
29 using namespace types;
30
31 /*******
32 * all *
33 *******/
34
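        // Each lane of a batch_bool holds either all ones or all zeros, so the
        // element width is irrelevant for the reduction: the 8-, 16- and 64-bit
        // overloads below simply reinterpret the mask as uint32x4_t and reuse the
        // vminvq_u32 form (the across-lanes minimum is ~0 only if every lane is set).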
35 template <class A, class T, detail::enable_sized_t<T, 4> = 0>
36 inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
37 {
38 return vminvq_u32(arg) == ~0U;
39 }
40
41 template <class A, class T, detail::enable_sized_t<T, 1> = 0>
42 inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
43 {
44 return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u8(arg)), neon64 {});
45 }
46
47 template <class A, class T, detail::enable_sized_t<T, 2> = 0>
48 inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
49 {
50 return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u16(arg)), neon64 {});
51 }
52
53 template <class A, class T, detail::enable_sized_t<T, 8> = 0>
54 inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
55 {
56 return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u64(arg)), neon64 {});
57 }
58
59 /*******
60 * any *
61 *******/
62
63 template <class A, class T, detail::enable_sized_t<T, 4> = 0>
64 inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
65 {
66 return vmaxvq_u32(arg) != 0;
67 }
68
69 template <class A, class T, detail::enable_sized_t<T, 1> = 0>
70 inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
71 {
72 return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u8(arg)), neon64 {});
73 }
74
75 template <class A, class T, detail::enable_sized_t<T, 2> = 0>
76 inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
77 {
78 return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u16(arg)), neon64 {});
79 }
80
81 template <class A, class T, detail::enable_sized_t<T, 8> = 0>
82 inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
83 {
84 return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u64(arg)), neon64 {});
85 }
86
87 /*************
88 * broadcast *
89 *************/
90
91 // Required to avoid ambiguous call
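        // (the generic overload forwards to the 32-bit NEON implementation; only
        // double, which NEON64 adds, needs the dedicated overload below)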
92 template <class A, class T>
93 inline batch<T, A> broadcast(T val, requires_arch<neon64>) noexcept
94 {
95 return broadcast<neon64>(val, neon {});
96 }
97
98 template <class A>
99 inline batch<double, A> broadcast(double val, requires_arch<neon64>) noexcept
100 {
            return vdupq_n_f64(val);
102 }
103
104 /*******
105 * set *
106 *******/
107
108 template <class A>
109 inline batch<double, A> set(batch<double, A> const&, requires_arch<neon64>, double d0, double d1) noexcept
110 {
111 return float64x2_t { d0, d1 };
112 }
113
114 template <class A>
115 inline batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<neon64>, bool b0, bool b1) noexcept
116 {
117 using register_type = typename batch_bool<double, A>::register_type;
118 using unsigned_type = as_unsigned_integer_t<double>;
119 return register_type { static_cast<unsigned_type>(b0 ? -1LL : 0LL),
120 static_cast<unsigned_type>(b1 ? -1LL : 0LL) };
121 }
122
123 /*************
124 * from_bool *
125 *************/
126
127 template <class A>
128 inline batch<double, A> from_bool(batch_bool<double, A> const& arg, requires_arch<neon64>) noexcept
129 {
            return vreinterpretq_f64_u64(vandq_u64(arg, vreinterpretq_u64_f64(vdupq_n_f64(1.))));
131 }
132
133 /********
134 * load *
135 ********/
136
137 template <class A>
138 inline batch<double, A> load_aligned(double const* src, convert<double>, requires_arch<neon64>) noexcept
139 {
140 return vld1q_f64(src);
141 }
142
143 template <class A>
144 inline batch<double, A> load_unaligned(double const* src, convert<double>, requires_arch<neon64>) noexcept
145 {
146 return load_aligned<A>(src, convert<double>(), A {});
147 }
148
149 /*********
150 * store *
151 *********/
152
153 template <class A>
154 inline void store_aligned(double* dst, batch<double, A> const& src, requires_arch<neon64>) noexcept
155 {
156 vst1q_f64(dst, src);
157 }
158
159 template <class A>
160 inline void store_unaligned(double* dst, batch<double, A> const& src, requires_arch<neon64>) noexcept
161 {
162 return store_aligned<A>(dst, src, A {});
163 }
164
165 /****************
166 * load_complex *
167 ****************/
168
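        // vld2q_f64 / vst2q_f64 load and store two interleaved double lanes,
        // de-interleaving (or re-interleaving) real and imaginary parts in a
        // single instruction, so the complex load/store below needs no extra shuffle.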
169 template <class A>
170 inline batch<std::complex<double>, A> load_complex_aligned(std::complex<double> const* mem, convert<std::complex<double>>, requires_arch<neon64>) noexcept
171 {
172 using real_batch = batch<double, A>;
173 const double* buf = reinterpret_cast<const double*>(mem);
174 float64x2x2_t tmp = vld2q_f64(buf);
175 real_batch real = tmp.val[0],
176 imag = tmp.val[1];
177 return batch<std::complex<double>, A> { real, imag };
178 }
179
180 template <class A>
181 inline batch<std::complex<double>, A> load_complex_unaligned(std::complex<double> const* mem, convert<std::complex<double>> cvt, requires_arch<neon64>) noexcept
182 {
183 return load_complex_aligned<A>(mem, cvt, A {});
184 }
185
186 /*****************
187 * store_complex *
188 *****************/
189
190 template <class A>
191 inline void store_complex_aligned(std::complex<double>* dst, batch<std::complex<double>, A> const& src, requires_arch<neon64>) noexcept
192 {
193 float64x2x2_t tmp;
194 tmp.val[0] = src.real();
195 tmp.val[1] = src.imag();
196 double* buf = reinterpret_cast<double*>(dst);
197 vst2q_f64(buf, tmp);
198 }
199
200 template <class A>
201 inline void store_complex_unaligned(std::complex<double>* dst, batch<std::complex<double>, A> const& src, requires_arch<neon64>) noexcept
202 {
203 store_complex_aligned(dst, src, A {});
204 }
205
206 /*******
207 * neg *
208 *******/
209
210 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
211 inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
212 {
213 return vreinterpretq_u64_s64(vnegq_s64(vreinterpretq_s64_u64(rhs)));
214 }
215
216 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
217 inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
218 {
219 return vnegq_s64(rhs);
220 }
221
222 template <class A>
223 inline batch<double, A> neg(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
224 {
225 return vnegq_f64(rhs);
226 }
227
228 /*******
229 * add *
230 *******/
231
232 template <class A>
233 inline batch<double, A> add(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
234 {
235 return vaddq_f64(lhs, rhs);
236 }
237
238 /********
239 * sadd *
240 ********/
241
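        // Saturation only matters for integer batches; for double the saturated
        // variants simply forward to the plain add/sub.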
242 template <class A>
243 inline batch<double, A> sadd(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
244 {
245 return add(lhs, rhs, neon64 {});
246 }
247
248 /*******
249 * sub *
250 *******/
251
252 template <class A>
253 inline batch<double, A> sub(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
254 {
255 return vsubq_f64(lhs, rhs);
256 }
257
258 /********
259 * ssub *
260 ********/
261
262 template <class A>
263 inline batch<double, A> ssub(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
264 {
265 return sub(lhs, rhs, neon64 {});
266 }
267
268 /*******
269 * mul *
270 *******/
271
272 template <class A>
273 inline batch<double, A> mul(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
274 {
275 return vmulq_f64(lhs, rhs);
276 }
277
278 /*******
279 * div *
280 *******/
281
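        // When XSIMD_FAST_INTEGER_DIVISION is defined, 64-bit integer division is
        // emulated via a round trip through float64. This is fast but inexact for
        // magnitudes above 2^53, which is presumably why it is opt-in.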
282#if defined(XSIMD_FAST_INTEGER_DIVISION)
283 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
284 inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
285 {
286 return vcvtq_u64_f64(vcvtq_f64_u64(lhs) / vcvtq_f64_u64(rhs));
287 }
288
289 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
290 inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
291 {
292 return vcvtq_s64_f64(vcvtq_f64_s64(lhs) / vcvtq_f64_s64(rhs));
293 }
294#endif
295 template <class A>
296 inline batch<double, A> div(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
297 {
298 return vdivq_f64(lhs, rhs);
299 }
300
301 /******
302 * eq *
303 ******/
304
305 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
306 inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
307 {
308 return vceqq_u64(lhs, rhs);
309 }
310
311 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
312 inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
313 {
314 return vceqq_s64(lhs, rhs);
315 }
316
317 template <class A>
318 inline batch_bool<double, A> eq(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
319 {
320 return vceqq_f64(lhs, rhs);
321 }
322
323 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
324 inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon64>) noexcept
325 {
326 return vceqq_u64(lhs, rhs);
327 }
328
329 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
330 inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon64>) noexcept
331 {
332 return vceqq_u64(lhs, rhs);
333 }
334
335 template <class A>
336 inline batch_bool<double, A> eq(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
337 {
338 return vceqq_u64(lhs, rhs);
339 }
340
341 /*************
342 * fast_cast *
343 *************/
344 namespace detail
345 {
346 template <class A>
347 inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<neon64>) noexcept
348 {
349 return vcvtq_f64_s64(x);
350 }
351
352 template <class A>
353 inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<neon64>) noexcept
354 {
355 return vcvtq_f64_u64(x);
356 }
357
358 template <class A>
359 inline batch<int64_t, A> fast_cast(batch<double, A> const& x, batch<int64_t, A> const&, requires_arch<neon64>) noexcept
360 {
361 return vcvtq_s64_f64(x);
362 }
363
364 template <class A>
365 inline batch<uint64_t, A> fast_cast(batch<double, A> const& x, batch<uint64_t, A> const&, requires_arch<neon64>) noexcept
366 {
367 return vcvtq_u64_f64(x);
368 }
369
370 }
371
372 /******
373 * lt *
374 ******/
375
376 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
377 inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
378 {
379 return vcltq_u64(lhs, rhs);
380 }
381
382 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
383 inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
384 {
385 return vcltq_s64(lhs, rhs);
386 }
387
388 template <class A>
389 inline batch_bool<double, A> lt(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
390 {
391 return vcltq_f64(lhs, rhs);
392 }
393
394 /******
395 * le *
396 ******/
397
398 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
399 inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
400 {
401 return vcleq_u64(lhs, rhs);
402 }
403
404 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
405 inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
406 {
407 return vcleq_s64(lhs, rhs);
408 }
409
410 template <class A>
411 inline batch_bool<double, A> le(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
412 {
413 return vcleq_f64(lhs, rhs);
414 }
415
416 /******
417 * gt *
418 ******/
419
420 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
421 inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
422 {
423 return vcgtq_u64(lhs, rhs);
424 }
425
426 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
427 inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
428 {
429 return vcgtq_s64(lhs, rhs);
430 }
431
432 template <class A>
433 inline batch_bool<double, A> gt(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
434 {
435 return vcgtq_f64(lhs, rhs);
436 }
437
438 /******
439 * ge *
440 ******/
441
442 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
443 inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
444 {
445 return vcgeq_u64(lhs, rhs);
446 }
447
448 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
449 inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
450 {
451 return vcgeq_s64(lhs, rhs);
452 }
453
454 template <class A>
455 inline batch_bool<double, A> ge(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
456 {
457 return vcgeq_f64(lhs, rhs);
458 }
459
460 /*******************
461 * batch_bool_cast *
462 *******************/
463
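        // A batch_bool is a per-lane all-ones/all-zeros mask, so converting it to a
        // different element type only rewraps the underlying register.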
464 template <class A, class T_out, class T_in>
465 inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon64>) noexcept
466 {
467 using register_type = typename batch_bool<T_out, A>::register_type;
468 return register_type(self);
469 }
470
471 /***************
472 * bitwise_and *
473 ***************/
474
475 template <class A>
476 inline batch<double, A> bitwise_and(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
477 {
478 return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(lhs),
479 vreinterpretq_u64_f64(rhs)));
480 }
481
482 template <class A>
483 inline batch_bool<double, A> bitwise_and(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
484 {
485 return vandq_u64(lhs, rhs);
486 }
487
488 /**************
489 * bitwise_or *
490 **************/
491
492 template <class A>
493 inline batch<double, A> bitwise_or(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
494 {
495 return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(lhs),
496 vreinterpretq_u64_f64(rhs)));
497 }
498
499 template <class A>
500 inline batch_bool<double, A> bitwise_or(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
501 {
502 return vorrq_u64(lhs, rhs);
503 }
504
505 /***************
506 * bitwise_xor *
507 ***************/
508
509 template <class A>
510 inline batch<double, A> bitwise_xor(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
511 {
512 return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(lhs),
513 vreinterpretq_u64_f64(rhs)));
514 }
515
516 template <class A>
517 inline batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
518 {
519 return veorq_u64(lhs, rhs);
520 }
521
522 /*******
523 * neq *
524 *******/
525
526 template <class A>
527 inline batch_bool<double, A> neq(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
528 {
529 return bitwise_xor(lhs, rhs, A {});
530 }
531
532 /***************
533 * bitwise_not *
534 ***************/
535
536 template <class A>
537 inline batch<double, A> bitwise_not(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
538 {
539 return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_f64(rhs)));
540 }
541
542 template <class A>
543 inline batch_bool<double, A> bitwise_not(batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
544 {
            return detail::bitwise_not_u64(rhs);
546 }
547
548 /******************
549 * bitwise_andnot *
550 ******************/
551
552 template <class A>
553 inline batch<double, A> bitwise_andnot(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
554 {
555 return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(lhs),
556 vreinterpretq_u64_f64(rhs)));
557 }
558
559 template <class A>
560 inline batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
561 {
562 return vbicq_u64(lhs, rhs);
563 }
564
565 /*******
566 * min *
567 *******/
568
569 template <class A>
570 inline batch<double, A> min(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
571 {
572 return vminq_f64(lhs, rhs);
573 }
574
575 /*******
576 * max *
577 *******/
578
579 template <class A>
580 inline batch<double, A> max(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
581 {
582 return vmaxq_f64(lhs, rhs);
583 }
584
585 /*******
586 * abs *
587 *******/
588
589 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
590 inline batch<T, A> abs(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
591 {
592 return rhs;
593 }
594
595 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
596 inline batch<T, A> abs(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
597 {
598 return vabsq_s64(rhs);
599 }
600
601 template <class A>
602 inline batch<double, A> abs(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
603 {
604 return vabsq_f64(rhs);
605 }
606
607 template <class A>
608 inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
609 requires_arch<neon64>) noexcept
610 {
611 return vcvtnq_s32_f32(self);
612 }
613
614#if !defined(__GNUC__)
615 template <class A>
616 inline batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self,
617 requires_arch<neon64>) noexcept
618 {
619 return vcvtnq_s64_f64(self);
620 }
621#endif
622
623 /**************
624 * reciprocal *
625 **************/
626
627 template <class A>
628 inline batch<double, A>
629 reciprocal(const batch<double, A>& x,
630 kernel::requires_arch<neon64>) noexcept
631 {
632 return vrecpeq_f64(x);
633 }
634
635 /********
636 * rsqrt *
637 ********/
638
639 template <class A>
640 inline batch<double, A> rsqrt(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
641 {
642 return vrsqrteq_f64(rhs);
643 }
644
645 /********
646 * sqrt *
647 ********/
648
649 template <class A>
650 inline batch<double, A> sqrt(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
651 {
652 return vsqrtq_f64(rhs);
653 }
654
655 /********************
656 * Fused operations *
657 ********************/
658
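        // vfmaq_f64(a, b, c) computes a + b * c with a single rounding, so fma
        // passes z straight through while fms negates it to obtain x * y - z.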
659#ifdef __ARM_FEATURE_FMA
660 template <class A>
661 inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<neon64>) noexcept
662 {
663 return vfmaq_f64(z, x, y);
664 }
665
666 template <class A>
667 inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<neon64>) noexcept
668 {
669 return vfmaq_f64(-z, x, y);
670 }
671#endif
672
673 /*********
674 * haddp *
675 *********/
676
677 template <class A>
678 inline batch<double, A> haddp(const batch<double, A>* row, requires_arch<neon64>) noexcept
679 {
680 return vpaddq_f64(row[0], row[1]);
681 }
682
683 /**********
684 * insert *
685 **********/
686
687 template <class A, size_t I>
688 inline batch<double, A> insert(batch<double, A> const& self, double val, index<I>, requires_arch<neon64>) noexcept
689 {
690 return vsetq_lane_f64(val, self, I);
691 }
692
693 /******************
694 * reducer macros *
695 ******************/
696
697 // Wrap reducer intrinsics so we can pass them as function pointers
        // - OP: intrinsics name prefix, e.g., vaddvq
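        // For example, WRAP_REDUCER_INT(vaddvq) defines wrap::vaddvq_u8,
        // wrap::vaddvq_s8, ..., wrap::vaddvq_s64, each a plain function forwarding
        // to the corresponding ::vaddvq_* intrinsic, because the intrinsics
        // themselves cannot portably have their address taken.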
699
700#define WRAP_REDUCER_INT_EXCLUDING_64(OP) \
701 namespace wrap \
702 { \
703 inline uint8_t OP##_u8(uint8x16_t a) noexcept \
704 { \
705 return ::OP##_u8(a); \
706 } \
707 inline int8_t OP##_s8(int8x16_t a) noexcept \
708 { \
709 return ::OP##_s8(a); \
710 } \
711 inline uint16_t OP##_u16(uint16x8_t a) noexcept \
712 { \
713 return ::OP##_u16(a); \
714 } \
715 inline int16_t OP##_s16(int16x8_t a) noexcept \
716 { \
717 return ::OP##_s16(a); \
718 } \
719 inline uint32_t OP##_u32(uint32x4_t a) noexcept \
720 { \
721 return ::OP##_u32(a); \
722 } \
723 inline int32_t OP##_s32(int32x4_t a) noexcept \
724 { \
725 return ::OP##_s32(a); \
726 } \
727 }
728
729#define WRAP_REDUCER_INT(OP) \
730 WRAP_REDUCER_INT_EXCLUDING_64(OP) \
731 namespace wrap \
732 { \
733 inline uint64_t OP##_u64(uint64x2_t a) noexcept \
734 { \
735 return ::OP##_u64(a); \
736 } \
737 inline int64_t OP##_s64(int64x2_t a) noexcept \
738 { \
739 return ::OP##_s64(a); \
740 } \
741 }
742
743#define WRAP_REDUCER_FLOAT(OP) \
744 namespace wrap \
745 { \
746 inline float OP##_f32(float32x4_t a) noexcept \
747 { \
748 return ::OP##_f32(a); \
749 } \
750 inline double OP##_f64(float64x2_t a) noexcept \
751 { \
752 return ::OP##_f64(a); \
753 } \
754 }
755
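        // reducer_return_type maps each NEON register type to its scalar element
        // type; the dispatcher below stores one wrapped reducer per register type
        // in a tuple and picks the right one from the batch's register_type.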
756 namespace detail
757 {
758 template <class R>
759 struct reducer_return_type_impl;
760
761 template <>
762 struct reducer_return_type_impl<uint8x16_t>
763 {
764 using type = uint8_t;
765 };
766
767 template <>
768 struct reducer_return_type_impl<int8x16_t>
769 {
770 using type = int8_t;
771 };
772
773 template <>
774 struct reducer_return_type_impl<uint16x8_t>
775 {
776 using type = uint16_t;
777 };
778
779 template <>
780 struct reducer_return_type_impl<int16x8_t>
781 {
782 using type = int16_t;
783 };
784
785 template <>
786 struct reducer_return_type_impl<uint32x4_t>
787 {
788 using type = uint32_t;
789 };
790
791 template <>
792 struct reducer_return_type_impl<int32x4_t>
793 {
794 using type = int32_t;
795 };
796
797 template <>
798 struct reducer_return_type_impl<uint64x2_t>
799 {
800 using type = uint64_t;
801 };
802
803 template <>
804 struct reducer_return_type_impl<int64x2_t>
805 {
806 using type = int64_t;
807 };
808
809 template <>
810 struct reducer_return_type_impl<float32x4_t>
811 {
812 using type = float;
813 };
814
815 template <>
816 struct reducer_return_type_impl<float64x2_t>
817 {
818 using type = double;
819 };
820
821 template <class R>
822 using reducer_return_type = typename reducer_return_type_impl<R>::type;
823
824 template <class... T>
825 struct neon_reducer_dispatcher_impl : neon_dispatcher_base<reducer_return_type, T...>
826 {
827 };
828
829 using neon_reducer_dispatcher = neon_reducer_dispatcher_impl<uint8x16_t, int8x16_t,
830 uint16x8_t, int16x8_t,
831 uint32x4_t, int32x4_t,
832 uint64x2_t, int64x2_t,
833 float32x4_t, float64x2_t>;
834 template <class T>
835 using enable_neon64_type_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value,
836 int>::type;
837 }
838
839 /**************
840 * reduce_add *
841 **************/
842
843 WRAP_REDUCER_INT(vaddvq)
844 WRAP_REDUCER_FLOAT(vaddvq)
845
846 template <class A, class T, detail::enable_neon64_type_t<T> = 0>
847 inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon64>) noexcept
848 {
849 using register_type = typename batch<T, A>::register_type;
850 const detail::neon_reducer_dispatcher::unary dispatcher = {
                std::make_tuple(wrap::vaddvq_u8, wrap::vaddvq_s8, wrap::vaddvq_u16, wrap::vaddvq_s16,
                                wrap::vaddvq_u32, wrap::vaddvq_s32, wrap::vaddvq_u64, wrap::vaddvq_s64,
                                wrap::vaddvq_f32, wrap::vaddvq_f64)
854 };
855 return dispatcher.apply(register_type(arg));
856 }
857
858 /**************
859 * reduce_max *
860 **************/
861
862 WRAP_REDUCER_INT_EXCLUDING_64(vmaxvq)
863 WRAP_REDUCER_FLOAT(vmaxvq)
864
865 namespace wrap
866 {
867 inline uint64_t vmaxvq_u64(uint64x2_t a) noexcept
868 {
869 return std::max(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1));
870 }
871
872 inline int64_t vmaxvq_s64(int64x2_t a) noexcept
873 {
874 return std::max(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1));
875 }
876 }
877
878 template <class A, class T, detail::enable_neon64_type_t<T> = 0>
879 inline typename batch<T, A>::value_type reduce_max(batch<T, A> const& arg, requires_arch<neon64>) noexcept
880 {
881 using register_type = typename batch<T, A>::register_type;
882 const detail::neon_reducer_dispatcher::unary dispatcher = {
                std::make_tuple(wrap::vmaxvq_u8, wrap::vmaxvq_s8, wrap::vmaxvq_u16, wrap::vmaxvq_s16,
                                wrap::vmaxvq_u32, wrap::vmaxvq_s32, wrap::vmaxvq_u64, wrap::vmaxvq_s64,
                                wrap::vmaxvq_f32, wrap::vmaxvq_f64)
886 };
887 return dispatcher.apply(register_type(arg));
888 }
889
890 /**************
891 * reduce_min *
892 **************/
893
894 WRAP_REDUCER_INT_EXCLUDING_64(vminvq)
895 WRAP_REDUCER_FLOAT(vminvq)
896
897 namespace wrap
898 {
899 inline uint64_t vminvq_u64(uint64x2_t a) noexcept
900 {
901 return std::min(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1));
902 }
903
904 inline int64_t vminvq_s64(int64x2_t a) noexcept
905 {
906 return std::min(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1));
907 }
908 }
909
910 template <class A, class T, detail::enable_neon64_type_t<T> = 0>
911 inline typename batch<T, A>::value_type reduce_min(batch<T, A> const& arg, requires_arch<neon64>) noexcept
912 {
913 using register_type = typename batch<T, A>::register_type;
914 const detail::neon_reducer_dispatcher::unary dispatcher = {
                std::make_tuple(wrap::vminvq_u8, wrap::vminvq_s8, wrap::vminvq_u16, wrap::vminvq_s16,
                                wrap::vminvq_u32, wrap::vminvq_s32, wrap::vminvq_u64, wrap::vminvq_s64,
                                wrap::vminvq_f32, wrap::vminvq_f64)
918 };
919 return dispatcher.apply(register_type(arg));
920 }
921
922#undef WRAP_REDUCER_INT_EXCLUDING_64
923#undef WRAP_REDUCER_INT
924#undef WRAP_REDUCER_FLOAT
925
926 /**********
927 * select *
928 **********/
929
930 template <class A>
931 inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& a, batch<double, A> const& b, requires_arch<neon64>) noexcept
932 {
933 return vbslq_f64(cond, a, b);
934 }
935
936 template <class A, bool... b>
937 inline batch<double, A> select(batch_bool_constant<batch<double, A>, b...> const&,
938 batch<double, A> const& true_br,
939 batch<double, A> const& false_br,
940 requires_arch<neon64>) noexcept
941 {
942 return select(batch_bool<double, A> { b... }, true_br, false_br, neon64 {});
        }

        /**********
945 * zip_lo *
946 **********/
947
948 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
949 inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
950 {
951 return vzip1q_u64(lhs, rhs);
952 }
953
954 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
955 inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
956 {
957 return vzip1q_s64(lhs, rhs);
958 }
959
960 template <class A>
961 inline batch<double, A> zip_lo(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
962 {
963 return vzip1q_f64(lhs, rhs);
964 }
965
966 /**********
967 * zip_hi *
968 **********/
969
970 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
971 inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
972 {
973 return vzip2q_u64(lhs, rhs);
974 }
975
976 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
977 inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
978 {
979 return vzip2q_s64(lhs, rhs);
980 }
981
982 template <class A>
983 inline batch<double, A> zip_hi(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
984 {
985 return vzip2q_f64(lhs, rhs);
986 }
987
988 /****************
989 * extract_pair *
990 ****************/
991
992 namespace detail
993 {
994 template <class A, size_t I, size_t... Is>
995 inline batch<double, A> extract_pair(batch<double, A> const& lhs, batch<double, A> const& rhs, std::size_t n,
996 ::xsimd::detail::index_sequence<I, Is...>) noexcept
997 {
998 if (n == I)
999 {
1000 return vextq_f64(rhs, lhs, I);
1001 }
1002 else
1003 {
1004 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
1005 }
1006 }
1007 }
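        // vextq_f64 needs its lane offset as a compile-time immediate, so the
        // helper above walks an index_sequence and selects the case matching the
        // runtime value n.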
1008
1009 template <class A>
1010 inline batch<double, A> extract_pair(batch<double, A> const& lhs, batch<double, A> const& rhs, std::size_t n, requires_arch<neon64>) noexcept
1011 {
1012 constexpr std::size_t size = batch<double, A>::size;
1013 assert(n < size && "index in bounds");
1014 return detail::extract_pair(lhs, rhs, n, ::xsimd::detail::make_index_sequence<size>());
1015 }
1016
1017 /******************
1018 * bitwise_rshift *
1019 ******************/
1020
1021 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
1022 inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon64>) noexcept
1023 {
1024 return bitwise_rshift<A>(lhs, n, neon {});
1025 }
1026
1027 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
1028 inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon64>) noexcept
1029 {
1030 return vshlq_u64(lhs, vnegq_s64(rhs));
1031 }
1032
1033 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
1034 inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon64>) noexcept
1035 {
1036 return bitwise_rshift<A>(lhs, n, neon {});
1037 }
1038
1039 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
1040 inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1041 {
1042 return vshlq_s64(lhs, vnegq_s64(rhs));
1043 }
1044
1045 /****************
1046 * bitwise_cast *
1047 ****************/
1048
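        // WRAP_CAST generates wrap:: functions for vreinterpretq in both
        // directions (T <-> float64x2_t) so the casters below can store them in a
        // tuple of function pointers.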
1049#define WRAP_CAST(SUFFIX, TYPE) \
1050 namespace wrap \
1051 { \
1052 inline float64x2_t vreinterpretq_f64_##SUFFIX(TYPE a) noexcept \
1053 { \
1054 return ::vreinterpretq_f64_##SUFFIX(a); \
1055 } \
1056 inline TYPE vreinterpretq_##SUFFIX##_f64(float64x2_t a) noexcept \
1057 { \
1058 return ::vreinterpretq_##SUFFIX##_f64(a); \
1059 } \
1060 }
1061
1062 WRAP_CAST(u8, uint8x16_t)
1063 WRAP_CAST(s8, int8x16_t)
1064 WRAP_CAST(u16, uint16x8_t)
1065 WRAP_CAST(s16, int16x8_t)
1066 WRAP_CAST(u32, uint32x4_t)
1067 WRAP_CAST(s32, int32x4_t)
1068 WRAP_CAST(u64, uint64x2_t)
1069 WRAP_CAST(s64, int64x2_t)
1070 WRAP_CAST(f32, float32x4_t)
1071
1072#undef WRAP_CAST
1073
1074 template <class A, class T>
1075 inline batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept
1076 {
1077 using caster_type = detail::bitwise_caster_impl<float64x2_t,
1078 uint8x16_t, int8x16_t,
1079 uint16x8_t, int16x8_t,
1080 uint32x4_t, int32x4_t,
1081 uint64x2_t, int64x2_t,
1082 float32x4_t>;
1083 const caster_type caster = {
                std::make_tuple(wrap::vreinterpretq_f64_u8, wrap::vreinterpretq_f64_s8, wrap::vreinterpretq_f64_u16, wrap::vreinterpretq_f64_s16,
                                wrap::vreinterpretq_f64_u32, wrap::vreinterpretq_f64_s32, wrap::vreinterpretq_f64_u64, wrap::vreinterpretq_f64_s64,
                                wrap::vreinterpretq_f64_f32)
1087 };
1088 using register_type = typename batch<T, A>::register_type;
1089 return caster.apply(register_type(arg));
1090 }
1091
1092 namespace detail
1093 {
1094 template <class S, class... R>
1095 struct bitwise_caster_neon64
1096 {
1097 using container_type = std::tuple<R (*)(S)...>;
1098 container_type m_func;
1099
1100 template <class V>
1101 V apply(float64x2_t rhs) const
1102 {
1103 using func_type = V (*)(float64x2_t);
1104 auto func = xsimd::detail::get<func_type>(m_func);
1105 return func(rhs);
1106 }
1107 };
1108 }
1109
1110 template <class A, class R>
1111 inline batch<R, A> bitwise_cast(batch<double, A> const& arg, batch<R, A> const&, requires_arch<neon64>) noexcept
1112 {
1113 using caster_type = detail::bitwise_caster_neon64<float64x2_t,
1114 uint8x16_t, int8x16_t,
1115 uint16x8_t, int16x8_t,
1116 uint32x4_t, int32x4_t,
1117 uint64x2_t, int64x2_t,
1118 float32x4_t>;
1119 const caster_type caster = {
                std::make_tuple(wrap::vreinterpretq_u8_f64, wrap::vreinterpretq_s8_f64, wrap::vreinterpretq_u16_f64, wrap::vreinterpretq_s16_f64,
                                wrap::vreinterpretq_u32_f64, wrap::vreinterpretq_s32_f64, wrap::vreinterpretq_u64_f64, wrap::vreinterpretq_s64_f64,
                                wrap::vreinterpretq_f32_f64)
1123 };
1124 using src_register_type = typename batch<double, A>::register_type;
1125 using dst_register_type = typename batch<R, A>::register_type;
1126 return caster.apply<dst_register_type>(src_register_type(arg));
1127 }
1128
1129 template <class A>
1130 inline batch<double, A> bitwise_cast(batch<double, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept
1131 {
1132 return arg;
1133 }
1134
1135 /*********
1136 * isnan *
1137 *********/
1138
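        // NaN is the only value that compares unequal to itself, so lanes where
        // arg == arg is false are exactly the NaN lanes.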
1139 template <class A>
1140 inline batch_bool<double, A> isnan(batch<double, A> const& arg, requires_arch<neon64>) noexcept
1141 {
1142 return !(arg == arg);
1143 }
1144 }
1145
1146 template <class batch_type, typename batch_type::value_type... Values>
1147 struct batch_constant;
1148
1149 namespace kernel
1150 {
1151 /***********
1152 * swizzle *
1153 ***********/
1154
1155 namespace detail
1156 {
1157 using ::xsimd::batch_constant;
1158 using ::xsimd::detail::integer_sequence;
1159 using ::xsimd::detail::make_integer_sequence;
1160
1161 template <class CB1, class CB2, class IS>
1162 struct index_burst_impl;
1163
1164 template <class B1, class B2, typename B2::value_type... V,
1165 typename B2::value_type... incr>
1166 struct index_burst_impl<batch_constant<B1>, batch_constant<B2, V...>,
1167 integer_sequence<typename B2::value_type, incr...>>
1168 {
1169 using type = batch_constant<B2, V...>;
1170 };
1171
1172 template <class B1, typename B1::value_type V0, typename B1::value_type... V1,
1173 class B2, typename B2::value_type... V2,
1174 typename B2::value_type... incr>
1175 struct index_burst_impl<batch_constant<B1, V0, V1...>, batch_constant<B2, V2...>,
1176 integer_sequence<typename B2::value_type, incr...>>
1177 {
1178 using value_type = typename B2::value_type;
1179 using next_input = batch_constant<B1, V1...>;
1180 using next_output = batch_constant<B2, V2..., (V0 + incr)...>;
1181 using type = typename index_burst_impl<next_input, next_output, integer_sequence<value_type, incr...>>::type;
1182 };
1183
1184 template <class B, class T>
1185 struct index_burst;
1186
1187 template <class B, typename B::value_type... V, class T>
1188 struct index_burst<batch_constant<B, V...>, T>
1189 {
1190 static constexpr size_t mul = sizeof(typename B::value_type) / sizeof(T);
1191 using input = batch_constant<B, (mul * V)...>;
1192 using output = batch_constant<batch<T, typename B::arch_type>>;
1193 using type = typename index_burst_impl<input, output, make_integer_sequence<T, mul>>::type;
1194 };
1195
1196 template <class B, class T>
1197 using index_burst_t = typename index_burst<B, T>::type;
1198
1199 template <class T, class B>
1200 inline index_burst_t<B, T> burst_index(B)
1201 {
1202 return index_burst_t<B, T>();
1203 }
1204 }
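        // index_burst expands a batch_constant of wide lane indices into the
        // equivalent byte-level indices so that every swizzle below can be routed
        // through the single vqtbl1q byte table lookup. For instance, a uint32
        // index constant {1, 0, 3, 2} bursts into the byte indices
        // {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}.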
1205
1206 template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
1207 uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
1208 inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self,
1209 batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
1210 requires_arch<neon64>) noexcept
1211 {
1212 return vqtbl1q_u8(self, batch<uint8_t, A>(idx));
1213 }
1214
1215 template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
1216 uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
1217 inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self,
1218 batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
1219 requires_arch<neon64>) noexcept
1220 {
1221 return vqtbl1q_s8(self, batch<uint8_t, A>(idx));
1222 }
1223
1224 template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
1225 inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self,
1226 batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> idx,
1227 requires_arch<neon64>) noexcept
1228 {
1229 using batch_type = batch<uint8_t, A>;
1230 return vreinterpretq_u16_u8(swizzle<A>(batch_type(vreinterpretq_u8_u16(self)), detail::burst_index<uint8_t>(idx), A()));
1231 }
1232
1233 template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
1234 inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self,
1235 batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> idx,
1236 requires_arch<neon64>) noexcept
1237 {
1238 using batch_type = batch<int8_t, A>;
1239 return vreinterpretq_s16_s8(swizzle<A>(batch_type(vreinterpretq_s8_s16(self)), detail::burst_index<uint8_t>(idx), A()));
1240 }
1241
1242 template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
1243 inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self,
1244 batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
1245 requires_arch<neon64>) noexcept
1246 {
1247 using batch_type = batch<uint8_t, A>;
1248 return vreinterpretq_u32_u8(swizzle<A>(batch_type(vreinterpretq_u8_u32(self)), detail::burst_index<uint8_t>(idx), A()));
1249 }
1250
1251 template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
1252 inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self,
1253 batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
1254 requires_arch<neon64>) noexcept
1255 {
1256 using batch_type = batch<int8_t, A>;
1257 return vreinterpretq_s32_s8(swizzle<A>(batch_type(vreinterpretq_s8_s32(self)), detail::burst_index<uint8_t>(idx), A()));
1258 }
1259
1260 template <class A, uint64_t V0, uint64_t V1>
1261 inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self,
1262 batch_constant<batch<uint64_t, A>, V0, V1> idx,
1263 requires_arch<neon64>) noexcept
1264 {
1265 using batch_type = batch<uint8_t, A>;
1266 return vreinterpretq_u64_u8(swizzle<A>(batch_type(vreinterpretq_u8_u64(self)), detail::burst_index<uint8_t>(idx), A()));
1267 }
1268
1269 template <class A, uint64_t V0, uint64_t V1>
1270 inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self,
1271 batch_constant<batch<uint64_t, A>, V0, V1> idx,
1272 requires_arch<neon64>) noexcept
1273 {
1274 using batch_type = batch<int8_t, A>;
1275 return vreinterpretq_s64_s8(swizzle<A>(batch_type(vreinterpretq_s8_s64(self)), detail::burst_index<uint8_t>(idx), A()));
1276 }
1277
1278 template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
1279 inline batch<float, A> swizzle(batch<float, A> const& self,
1280 batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
1281 requires_arch<neon64>) noexcept
1282 {
1283 using batch_type = batch<uint8_t, A>;
1284 return vreinterpretq_f32_u8(swizzle<A>(batch_type(vreinterpretq_u8_f32(self)), detail::burst_index<uint8_t>(idx), A()));
1285 }
1286
1287 template <class A, uint64_t V0, uint64_t V1>
1288 inline batch<double, A> swizzle(batch<double, A> const& self,
1289 batch_constant<batch<uint64_t, A>, V0, V1> idx,
1290 requires_arch<neon64>) noexcept
1291 {
1292 using batch_type = batch<uint8_t, A>;
1293 return vreinterpretq_f64_u8(swizzle<A>(batch_type(vreinterpretq_u8_f64(self)), detail::burst_index<uint8_t>(idx), A()));
1294 }
1295
1296 template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
1297 inline batch<std::complex<float>, A> swizzle(batch<std::complex<float>, A> const& self,
1298 batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
1299 requires_arch<neon64>) noexcept
1300 {
1301 return batch<std::complex<float>>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A()));
1302 }
1303
1304 template <class A, uint64_t V0, uint64_t V1>
1305 inline batch<std::complex<double>, A> swizzle(batch<std::complex<double>, A> const& self,
1306 batch_constant<batch<uint64_t, A>, V0, V1> idx,
1307 requires_arch<neon64>) noexcept
1308 {
1309 return batch<std::complex<double>>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A()));
1310 }
1311 }
1312}
1313
1314#endif
1315