/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_NEON_HPP
#define XSIMD_NEON_HPP

#include <algorithm>
#include <complex>
#include <tuple>
#include <type_traits>

#include "../types/xsimd_neon_register.hpp"
#include "../types/xsimd_utils.hpp"

// Wrap intrinsics so we can pass them as function pointers
// - OP: intrinsics name prefix, e.g., vorrq
// - RT: type traits to deduce intrinsics return types
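//
// For illustration only, invoking
// WRAP_BINARY_INT_EXCLUDING_64(vaddq, detail::identity_return_type)
// roughly generates, for each lane type:
//
//     namespace wrap
//     {
//         inline uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b) noexcept
//         {
//             return ::vaddq_u8(a, b);
//         }
//         // ... and likewise for s8, u16, s16, u32, s32
//     }
//
// These thin wrappers have ordinary function addresses, so they can be stored
// in the std::tuple of function pointers used by the dispatchers below even
// when the underlying NEON intrinsics are implemented as macros or builtins.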
#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \
    namespace wrap \
    { \
        inline RT<uint8x16_t> OP##_u8(uint8x16_t a, uint8x16_t b) noexcept \
        { \
            return ::OP##_u8(a, b); \
        } \
        inline RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept \
        { \
            return ::OP##_s8(a, b); \
        } \
        inline RT<uint16x8_t> OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \
        { \
            return ::OP##_u16(a, b); \
        } \
        inline RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \
        { \
            return ::OP##_s16(a, b); \
        } \
        inline RT<uint32x4_t> OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \
        { \
            return ::OP##_u32(a, b); \
        } \
        inline RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \
        { \
            return ::OP##_s32(a, b); \
        } \
    }

#define WRAP_BINARY_INT(OP, RT) \
    WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \
    namespace wrap \
    { \
        inline RT<uint64x2_t> OP##_u64(uint64x2_t a, uint64x2_t b) noexcept \
        { \
            return ::OP##_u64(a, b); \
        } \
        inline RT<int64x2_t> OP##_s64(int64x2_t a, int64x2_t b) noexcept \
        { \
            return ::OP##_s64(a, b); \
        } \
    }

#define WRAP_BINARY_FLOAT(OP, RT) \
    namespace wrap \
    { \
        inline RT<float32x4_t> OP##_f32(float32x4_t a, float32x4_t b) noexcept \
        { \
            return ::OP##_f32(a, b); \
        } \
    }

#define WRAP_UNARY_INT_EXCLUDING_64(OP) \
    namespace wrap \
    { \
        inline uint8x16_t OP##_u8(uint8x16_t a) noexcept \
        { \
            return ::OP##_u8(a); \
        } \
        inline int8x16_t OP##_s8(int8x16_t a) noexcept \
        { \
            return ::OP##_s8(a); \
        } \
        inline uint16x8_t OP##_u16(uint16x8_t a) noexcept \
        { \
            return ::OP##_u16(a); \
        } \
        inline int16x8_t OP##_s16(int16x8_t a) noexcept \
        { \
            return ::OP##_s16(a); \
        } \
        inline uint32x4_t OP##_u32(uint32x4_t a) noexcept \
        { \
            return ::OP##_u32(a); \
        } \
        inline int32x4_t OP##_s32(int32x4_t a) noexcept \
        { \
            return ::OP##_s32(a); \
        } \
    }

#define WRAP_UNARY_INT(OP) \
    WRAP_UNARY_INT_EXCLUDING_64(OP) \
    namespace wrap \
    { \
        inline uint64x2_t OP##_u64(uint64x2_t a) noexcept \
        { \
            return ::OP##_u64(a); \
        } \
        inline int64x2_t OP##_s64(int64x2_t a) noexcept \
        { \
            return ::OP##_s64(a); \
        } \
    }

#define WRAP_UNARY_FLOAT(OP) \
    namespace wrap \
    { \
        inline float32x4_t OP##_f32(float32x4_t a) noexcept \
        { \
            return ::OP##_f32(a); \
        } \
    }

// Dummy identity caster to ease coding
inline uint8x16_t vreinterpretq_u8_u8(uint8x16_t arg) noexcept { return arg; }
inline int8x16_t vreinterpretq_s8_s8(int8x16_t arg) noexcept { return arg; }
inline uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) noexcept { return arg; }
inline int16x8_t vreinterpretq_s16_s16(int16x8_t arg) noexcept { return arg; }
inline uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) noexcept { return arg; }
inline int32x4_t vreinterpretq_s32_s32(int32x4_t arg) noexcept { return arg; }
inline uint64x2_t vreinterpretq_u64_u64(uint64x2_t arg) noexcept { return arg; }
inline int64x2_t vreinterpretq_s64_s64(int64x2_t arg) noexcept { return arg; }
inline float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg; }
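// Note (added for clarity): these no-op overloads above let generic and
// macro-generated helpers call a vreinterpretq_X_Y conversion for every
// (source, target) register pair, including the identity pair X == Y,
// without special-casing that situation at each call site.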

namespace xsimd
{
    template <class batch_type, bool... Values>
    struct batch_bool_constant;

    namespace kernel
    {
        using namespace types;

        namespace detail
        {
            template <template <class> class return_type, class... T>
            struct neon_dispatcher_base
            {
                struct unary
                {
                    using container_type = std::tuple<return_type<T> (*)(T)...>;
                    const container_type m_func;

                    template <class U>
                    return_type<U> apply(U rhs) const noexcept
                    {
                        using func_type = return_type<U> (*)(U);
                        auto func = xsimd::detail::get<func_type>(m_func);
                        return func(rhs);
                    }
                };

                struct binary
                {
                    using container_type = std::tuple<return_type<T> (*)(T, T)...>;
                    const container_type m_func;

                    template <class U>
                    return_type<U> apply(U lhs, U rhs) const noexcept
                    {
                        using func_type = return_type<U> (*)(U, U);
                        auto func = xsimd::detail::get<func_type>(m_func);
                        return func(lhs, rhs);
                    }
                };
            };

            /***************************
             * arithmetic dispatchers *
             ***************************/

            template <class T>
            using identity_return_type = T;

            template <class... T>
            struct neon_dispatcher_impl : neon_dispatcher_base<identity_return_type, T...>
            {
            };

            using neon_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
                                                         uint16x8_t, int16x8_t,
                                                         uint32x4_t, int32x4_t,
                                                         uint64x2_t, int64x2_t,
                                                         float32x4_t>;

            using excluding_int64_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
                                                                    uint16x8_t, int16x8_t,
                                                                    uint32x4_t, int32x4_t,
                                                                    float32x4_t>;
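
            // Usage sketch (illustrative only; the real call sites are the
            // arithmetic kernels further down): a dispatcher aggregates one
            // wrapper per register type, and apply() selects the overload whose
            // signature matches the argument type, e.g.
            //
            //     const neon_dispatcher::binary dispatcher = {
            //         std::make_tuple(wrap::vaddq_u8, wrap::vaddq_s8, /* ... */ wrap::vaddq_f32)
            //     };
            //     uint8x16_t sum = dispatcher.apply(a_u8, b_u8); // resolves to wrap::vaddq_u8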

            /**************************
             * comparison dispatchers *
             **************************/

            template <class T>
            struct comp_return_type_impl;

            template <>
            struct comp_return_type_impl<uint8x16_t>
            {
                using type = uint8x16_t;
            };

            template <>
            struct comp_return_type_impl<int8x16_t>
            {
                using type = uint8x16_t;
            };

            template <>
            struct comp_return_type_impl<uint16x8_t>
            {
                using type = uint16x8_t;
            };

            template <>
            struct comp_return_type_impl<int16x8_t>
            {
                using type = uint16x8_t;
            };

            template <>
            struct comp_return_type_impl<uint32x4_t>
            {
                using type = uint32x4_t;
            };

            template <>
            struct comp_return_type_impl<int32x4_t>
            {
                using type = uint32x4_t;
            };

            template <>
            struct comp_return_type_impl<uint64x2_t>
            {
                using type = uint64x2_t;
            };

            template <>
            struct comp_return_type_impl<int64x2_t>
            {
                using type = uint64x2_t;
            };

            template <>
            struct comp_return_type_impl<float32x4_t>
            {
                using type = uint32x4_t;
            };

            template <class T>
            using comp_return_type = typename comp_return_type_impl<T>::type;

            template <class... T>
            struct neon_comp_dispatcher_impl : neon_dispatcher_base<comp_return_type, T...>
            {
            };

            using excluding_int64_comp_dispatcher = neon_comp_dispatcher_impl<uint8x16_t, int8x16_t,
                                                                              uint16x8_t, int16x8_t,
                                                                              uint32x4_t, int32x4_t,
                                                                              float32x4_t>;

            /**************************************
             * enabling / disabling metafunctions *
             **************************************/

            template <class T>
            using enable_neon_type_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value,
                                                               int>::type;

            template <class T>
            using exclude_int64_neon_t
                = typename std::enable_if<(std::is_integral<T>::value && sizeof(T) != 8) || std::is_same<T, float>::value, int>::type;
        }

        /*************
         * broadcast *
         *************/

        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
        inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
        {
            return vdupq_n_u8(uint8_t(val));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
        inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
        {
            return vdupq_n_s8(int8_t(val));
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
        inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
        {
            return vdupq_n_u16(uint16_t(val));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
        inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
        {
            return vdupq_n_s16(int16_t(val));
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
        inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
        {
            return vdupq_n_u32(uint32_t(val));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
        inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
        {
            return vdupq_n_s32(int32_t(val));
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
        {
            return vdupq_n_u64(uint64_t(val));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
        {
            return vdupq_n_s64(int64_t(val));
        }

        template <class A>
        inline batch<float, A> broadcast(float val, requires_arch<neon>) noexcept
        {
            return vdupq_n_f32(val);
        }

        /*******
         * set *
         *******/

        template <class A, class T, class... Args, detail::enable_integral_t<T> = 0>
        inline batch<T, A> set(batch<T, A> const&, requires_arch<neon>, Args... args) noexcept
        {
            return xsimd::types::detail::neon_vector_type<T> { args... };
        }

        template <class A, class T, class... Args, detail::enable_integral_t<T> = 0>
        inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<neon>, Args... args) noexcept
        {
            using register_type = typename batch_bool<T, A>::register_type;
            using unsigned_type = as_unsigned_integer_t<T>;
            return register_type { static_cast<unsigned_type>(args ? -1LL : 0LL)... };
        }

        template <class A>
        inline batch<float, A> set(batch<float, A> const&, requires_arch<neon>, float f0, float f1, float f2, float f3) noexcept
        {
            return float32x4_t { f0, f1, f2, f3 };
        }

        template <class A>
        inline batch<std::complex<float>, A> set(batch<std::complex<float>, A> const&, requires_arch<neon>,
                                                 std::complex<float> c0, std::complex<float> c1,
                                                 std::complex<float> c2, std::complex<float> c3) noexcept
        {
            return batch<std::complex<float>>(float32x4_t { c0.real(), c1.real(), c2.real(), c3.real() },
                                              float32x4_t { c0.imag(), c1.imag(), c2.imag(), c3.imag() });
        }

        template <class A, class... Args>
        inline batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<neon>, Args... args) noexcept
        {
            using register_type = typename batch_bool<float, A>::register_type;
            using unsigned_type = as_unsigned_integer_t<float>;
            return register_type { static_cast<unsigned_type>(args ? -1LL : 0LL)... };
        }

        /*************
         * from_bool *
         *************/

        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
        inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
        {
            return vandq_u8(arg, vdupq_n_u8(1));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
        inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
        {
            return vandq_s8(reinterpret_cast<int8x16_t>(arg.data), vdupq_n_s8(1));
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
        inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
        {
            return vandq_u16(arg, vdupq_n_u16(1));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
        inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
        {
            return vandq_s16(reinterpret_cast<int16x8_t>(arg.data), vdupq_n_s16(1));
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
        inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
        {
            return vandq_u32(arg, vdupq_n_u32(1));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
        inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
        {
            return vandq_s32(reinterpret_cast<int32x4_t>(arg.data), vdupq_n_s32(1));
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
        {
            return vandq_u64(arg, vdupq_n_u64(1));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
        {
            return vandq_s64(reinterpret_cast<int64x2_t>(arg.data), vdupq_n_s64(1));
        }

        template <class A>
        inline batch<float, A> from_bool(batch_bool<float, A> const& arg, requires_arch<neon>) noexcept
        {
            return vreinterpretq_f32_u32(vandq_u32(arg, vreinterpretq_u32_f32(vdupq_n_f32(1.f))));
        }

        /********
         * load *
         ********/

        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
        inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return vld1q_u8((uint8_t*)src);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
        inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return vld1q_s8((int8_t*)src);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
        inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return vld1q_u16((uint16_t*)src);
        }
        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
        inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return vld1q_s16((int16_t*)src);
        }
        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
        inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return vld1q_u32((uint32_t*)src);
        }
        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
        inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return vld1q_s32((int32_t*)src);
        }
        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return vld1q_u64((uint64_t*)src);
        }
        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return vld1q_s64((int64_t*)src);
        }

        template <class A>
        inline batch<float, A> load_aligned(float const* src, convert<float>, requires_arch<neon>) noexcept
        {
            return vld1q_f32(src);
        }

        template <class A, class T>
        inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return load_aligned<A>(src, convert<T>(), A {});
        }

        /*********
         * store *
         *********/

        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
        inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
        {
            vst1q_u8((uint8_t*)dst, src);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
        inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
        {
            vst1q_s8((int8_t*)dst, src);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
        inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
        {
            vst1q_u16((uint16_t*)dst, src);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
        inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
        {
            vst1q_s16((int16_t*)dst, src);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
        inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
        {
            vst1q_u32((uint32_t*)dst, src);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
        inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
        {
            vst1q_s32((int32_t*)dst, src);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
        {
            vst1q_u64((uint64_t*)dst, src);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
        {
            vst1q_s64((int64_t*)dst, src);
        }

        template <class A>
        inline void store_aligned(float* dst, batch<float, A> const& src, requires_arch<neon>) noexcept
        {
            vst1q_f32(dst, src);
        }

        template <class A, class T>
        inline void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
        {
            store_aligned<A>(dst, src, A {});
        }

        /****************
         * load_complex *
         ****************/

        template <class A>
        inline batch<std::complex<float>, A> load_complex_aligned(std::complex<float> const* mem, convert<std::complex<float>>, requires_arch<neon>) noexcept
        {
            using real_batch = batch<float, A>;
            const float* buf = reinterpret_cast<const float*>(mem);
            float32x4x2_t tmp = vld2q_f32(buf);
            real_batch real = tmp.val[0],
                       imag = tmp.val[1];
            return batch<std::complex<float>, A> { real, imag };
        }

        template <class A>
        inline batch<std::complex<float>, A> load_complex_unaligned(std::complex<float> const* mem, convert<std::complex<float>> cvt, requires_arch<neon>) noexcept
        {
            return load_complex_aligned<A>(mem, cvt, A {});
        }

        /*****************
         * store_complex *
         *****************/

        template <class A>
        inline void store_complex_aligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept
        {
            float32x4x2_t tmp;
            tmp.val[0] = src.real();
            tmp.val[1] = src.imag();
            float* buf = reinterpret_cast<float*>(dst);
            vst2q_f32(buf, tmp);
        }

        template <class A>
        inline void store_complex_unaligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept
        {
            store_complex_aligned(dst, src, A {});
        }

        /*******
         * neg *
         *******/

        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
        inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(rhs)));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
        inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return vnegq_s8(rhs);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
        inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return vreinterpretq_u16_s16(vnegq_s16(vreinterpretq_s16_u16(rhs)));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
        inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return vnegq_s16(rhs);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
        inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return vreinterpretq_u32_s32(vnegq_s32(vreinterpretq_s32_u32(rhs)));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
        inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return vnegq_s32(rhs);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return batch<T, A> { -rhs.get(0), -rhs.get(1) };
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return batch<T, A> { -rhs.get(0), -rhs.get(1) };
        }

        template <class A>
        inline batch<float, A> neg(batch<float, A> const& rhs, requires_arch<neon>) noexcept
        {
            return vnegq_f32(rhs);
        }

        /*******
         * add *
         *******/

        WRAP_BINARY_INT(vaddq, detail::identity_return_type)
        WRAP_BINARY_FLOAT(vaddq, detail::identity_return_type)

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        inline batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::neon_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vaddq_u8, wrap::vaddq_s8, wrap::vaddq_u16, wrap::vaddq_s16,
                                wrap::vaddq_u32, wrap::vaddq_s32, wrap::vaddq_u64, wrap::vaddq_s64,
                                wrap::vaddq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        /********
         * sadd *
         ********/

        WRAP_BINARY_INT(vqaddq, detail::identity_return_type)

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        inline batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::neon_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vqaddq_u8, wrap::vqaddq_s8, wrap::vqaddq_u16, wrap::vqaddq_s16,
                                wrap::vqaddq_u32, wrap::vqaddq_s32, wrap::vqaddq_u64, wrap::vqaddq_s64,
                                wrap::vaddq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        /*******
         * sub *
         *******/

        WRAP_BINARY_INT(vsubq, detail::identity_return_type)
        WRAP_BINARY_FLOAT(vsubq, detail::identity_return_type)

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        inline batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::neon_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vsubq_u8, wrap::vsubq_s8, wrap::vsubq_u16, wrap::vsubq_s16,
                                wrap::vsubq_u32, wrap::vsubq_s32, wrap::vsubq_u64, wrap::vsubq_s64,
                                wrap::vsubq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        /********
         * ssub *
         ********/

        WRAP_BINARY_INT(vqsubq, detail::identity_return_type)

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        inline batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::neon_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vqsubq_u8, wrap::vqsubq_s8, wrap::vqsubq_u16, wrap::vqsubq_s16,
                                wrap::vqsubq_u32, wrap::vqsubq_s32, wrap::vqsubq_u64, wrap::vqsubq_s64,
                                wrap::vsubq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        /*******
         * mul *
         *******/

        WRAP_BINARY_INT_EXCLUDING_64(vmulq, detail::identity_return_type)
        WRAP_BINARY_FLOAT(vmulq, detail::identity_return_type)

        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
        inline batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::excluding_int64_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vmulq_u8, wrap::vmulq_s8, wrap::vmulq_u16, wrap::vmulq_s16,
                                wrap::vmulq_u32, wrap::vmulq_s32, wrap::vmulq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        /*******
         * div *
         *******/

#if defined(XSIMD_FAST_INTEGER_DIVISION)
        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
        inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return vcvtq_s32_f32(vcvtq_f32_s32(lhs) / vcvtq_f32_s32(rhs));
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
        inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return vcvtq_u32_f32(vcvtq_f32_u32(lhs) / vcvtq_f32_u32(rhs));
        }
#endif

        template <class A>
        inline batch<float, A> div(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
        {
            // from stackoverflow & https://projectne10.github.io/Ne10/doc/NE10__divc_8neon_8c_source.html
            // get an initial estimate of 1/b.
            float32x4_t rcp = reciprocal(rhs);

            // use a couple Newton-Raphson steps to refine the estimate. Depending on your
            // application's accuracy requirements, you may be able to get away with only
            // one refinement (instead of the two used here). Be sure to test!
            rcp = vmulq_f32(vrecpsq_f32(rhs, rcp), rcp);
            rcp = vmulq_f32(vrecpsq_f32(rhs, rcp), rcp);

            // and finally, compute a / b = a * (1 / b)
            return vmulq_f32(lhs, rcp);
        }
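
        // Scalar sketch of the refinement above (illustrative only): the initial
        // estimate ultimately comes from vrecpeq_f32, and each vrecpsq_f32(b, x)
        // call evaluates (2 - b * x), so per lane the computation is roughly
        //
        //     float x = rough_estimate_of(1.0f / b); // hypothetical initial estimate
        //     x = x * (2.0f - b * x);                // first Newton-Raphson step
        //     x = x * (2.0f - b * x);                // second step
        //     float quotient = a * x;                // a / b
        //
        // Each step roughly doubles the number of correct bits in the estimate.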

        /******
         * eq *
         ******/

        WRAP_BINARY_INT_EXCLUDING_64(vceqq, detail::comp_return_type)
        WRAP_BINARY_FLOAT(vceqq, detail::comp_return_type)

        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
        inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vceqq_u8, wrap::vceqq_s8, wrap::vceqq_u16, wrap::vceqq_s16,
                                wrap::vceqq_u32, wrap::vceqq_s32, wrap::vceqq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
        inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch_bool<T, A>::register_type;
            using dispatcher_type = detail::neon_comp_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary;
            const dispatcher_type dispatcher = {
                std::make_tuple(wrap::vceqq_u8, wrap::vceqq_u16, wrap::vceqq_u32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
        inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) });
        }

        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
        inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) });
        }

        /*************
         * fast_cast *
         *************/

        namespace detail
        {
            template <class A>
            inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept
            {
                return vcvtq_f32_s32(self);
            }

            template <class A>
            inline batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept
            {
                return vcvtq_f32_u32(self);
            }

            template <class A>
            inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<neon>) noexcept
            {
                return vcvtq_s32_f32(self);
            }

            template <class A>
            inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<neon>) noexcept
            {
                return vcvtq_u32_f32(self);
            }

        }

        /******
         * lt *
         ******/

        WRAP_BINARY_INT_EXCLUDING_64(vcltq, detail::comp_return_type)
        WRAP_BINARY_FLOAT(vcltq, detail::comp_return_type)

        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
        inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vcltq_u8, wrap::vcltq_s8, wrap::vcltq_u16, wrap::vcltq_s16,
                                wrap::vcltq_u32, wrap::vcltq_s32, wrap::vcltq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
        inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return batch_bool<T, A>({ lhs.get(0) < rhs.get(0), lhs.get(1) < rhs.get(1) });
        }

        /******
         * le *
         ******/

        WRAP_BINARY_INT_EXCLUDING_64(vcleq, detail::comp_return_type)
        WRAP_BINARY_FLOAT(vcleq, detail::comp_return_type)

        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
        inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vcleq_u8, wrap::vcleq_s8, wrap::vcleq_u16, wrap::vcleq_s16,
                                wrap::vcleq_u32, wrap::vcleq_s32, wrap::vcleq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
        inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return batch_bool<T, A>({ lhs.get(0) <= rhs.get(0), lhs.get(1) <= rhs.get(1) });
        }

        /******
         * gt *
         ******/

        WRAP_BINARY_INT_EXCLUDING_64(vcgtq, detail::comp_return_type)
        WRAP_BINARY_FLOAT(vcgtq, detail::comp_return_type)

        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
        inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vcgtq_u8, wrap::vcgtq_s8, wrap::vcgtq_u16, wrap::vcgtq_s16,
                                wrap::vcgtq_u32, wrap::vcgtq_s32, wrap::vcgtq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
        inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return batch_bool<T, A>({ lhs.get(0) > rhs.get(0), lhs.get(1) > rhs.get(1) });
        }

        /******
         * ge *
         ******/

        WRAP_BINARY_INT_EXCLUDING_64(vcgeq, detail::comp_return_type)
        WRAP_BINARY_FLOAT(vcgeq, detail::comp_return_type)

        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
        inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vcgeq_u8, wrap::vcgeq_s8, wrap::vcgeq_u16, wrap::vcgeq_s16,
                                wrap::vcgeq_u32, wrap::vcgeq_s32, wrap::vcgeq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
        inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return batch_bool<T, A>({ lhs.get(0) >= rhs.get(0), lhs.get(1) >= rhs.get(1) });
        }

        /*******************
         * batch_bool_cast *
         *******************/

        template <class A, class T_out, class T_in>
        inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon>) noexcept
        {
            using register_type = typename batch_bool<T_out, A>::register_type;
            return register_type(self);
        }

        /***************
         * bitwise_and *
         ***************/

        WRAP_BINARY_INT(vandq, detail::identity_return_type)

        namespace detail
        {
            inline float32x4_t bitwise_and_f32(float32x4_t lhs, float32x4_t rhs) noexcept
            {
                return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(lhs),
                                                       vreinterpretq_u32_f32(rhs)));
            }

            template <class V>
            V bitwise_and_neon(V const& lhs, V const& rhs)
            {
                const neon_dispatcher::binary dispatcher = {
                    std::make_tuple(wrap::vandq_u8, wrap::vandq_s8, wrap::vandq_u16, wrap::vandq_s16,
                                    wrap::vandq_u32, wrap::vandq_s32, wrap::vandq_u64, wrap::vandq_s64,
                                    bitwise_and_f32)
                };
                return dispatcher.apply(lhs, rhs);
            }
        }

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        inline batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            return detail::bitwise_and_neon(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch_bool<T, A>::register_type;
            return detail::bitwise_and_neon(register_type(lhs), register_type(rhs));
        }

        /**************
         * bitwise_or *
         **************/

        WRAP_BINARY_INT(vorrq, detail::identity_return_type)

        namespace detail
        {
            inline float32x4_t bitwise_or_f32(float32x4_t lhs, float32x4_t rhs) noexcept
            {
                return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(lhs),
                                                       vreinterpretq_u32_f32(rhs)));
            }

            template <class V>
            inline V bitwise_or_neon(V const& lhs, V const& rhs) noexcept
            {
                const neon_dispatcher::binary dispatcher = {
                    std::make_tuple(wrap::vorrq_u8, wrap::vorrq_s8, wrap::vorrq_u16, wrap::vorrq_s16,
                                    wrap::vorrq_u32, wrap::vorrq_s32, wrap::vorrq_u64, wrap::vorrq_s64,
                                    bitwise_or_f32)
                };
                return dispatcher.apply(lhs, rhs);
            }
        }

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        inline batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            return detail::bitwise_or_neon(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch_bool<T, A>::register_type;
            return detail::bitwise_or_neon(register_type(lhs), register_type(rhs));
        }

        /***************
         * bitwise_xor *
         ***************/

        WRAP_BINARY_INT(veorq, detail::identity_return_type)

        namespace detail
        {
            inline float32x4_t bitwise_xor_f32(float32x4_t lhs, float32x4_t rhs) noexcept
            {
                return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(lhs),
                                                       vreinterpretq_u32_f32(rhs)));
            }

            template <class V>
            inline V bitwise_xor_neon(V const& lhs, V const& rhs) noexcept
            {
                const neon_dispatcher::binary dispatcher = {
                    std::make_tuple(wrap::veorq_u8, wrap::veorq_s8, wrap::veorq_u16, wrap::veorq_s16,
                                    wrap::veorq_u32, wrap::veorq_s32, wrap::veorq_u64, wrap::veorq_s64,
                                    bitwise_xor_f32)
                };
                return dispatcher.apply(lhs, rhs);
            }
        }

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        inline batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch_bool<T, A>::register_type;
            return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs));
        }

        /*******
         * neq *
         *******/

        template <class A, class T>
        inline batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return bitwise_xor(lhs, rhs, A {});
        }

        /***************
         * bitwise_not *
         ***************/

        WRAP_UNARY_INT_EXCLUDING_64(vmvnq)

        namespace detail
        {
            inline int64x2_t bitwise_not_s64(int64x2_t arg) noexcept
            {
                return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(arg)));
            }

            inline uint64x2_t bitwise_not_u64(uint64x2_t arg) noexcept
            {
                return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(arg)));
            }

            inline float32x4_t bitwise_not_f32(float32x4_t arg) noexcept
            {
                return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg)));
            }

            template <class V>
            inline V bitwise_not_neon(V const& arg) noexcept
            {
                const neon_dispatcher::unary dispatcher = {
                    std::make_tuple(wrap::vmvnq_u8, wrap::vmvnq_s8, wrap::vmvnq_u16, wrap::vmvnq_s16,
                                    wrap::vmvnq_u32, wrap::vmvnq_s32,
                                    bitwise_not_u64, bitwise_not_s64,
                                    bitwise_not_f32)
                };
                return dispatcher.apply(arg);
            }
        }

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        inline batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            return detail::bitwise_not_neon(register_type(arg));
        }

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
        {
            using register_type = typename batch_bool<T, A>::register_type;
            return detail::bitwise_not_neon(register_type(arg));
        }

        /******************
         * bitwise_andnot *
         ******************/

        WRAP_BINARY_INT(vbicq, detail::identity_return_type)

        namespace detail
        {
            inline float32x4_t bitwise_andnot_f32(float32x4_t lhs, float32x4_t rhs) noexcept
            {
                return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs)));
            }

            template <class V>
            inline V bitwise_andnot_neon(V const& lhs, V const& rhs) noexcept
            {
                const detail::neon_dispatcher::binary dispatcher = {
                    std::make_tuple(wrap::vbicq_u8, wrap::vbicq_s8, wrap::vbicq_u16, wrap::vbicq_s16,
                                    wrap::vbicq_u32, wrap::vbicq_s32, wrap::vbicq_u64, wrap::vbicq_s64,
                                    bitwise_andnot_f32)
                };
                return dispatcher.apply(lhs, rhs);
            }
        }

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        inline batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch_bool<T, A>::register_type;
            return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs));
        }

        /*******
         * min *
         *******/

        WRAP_BINARY_INT_EXCLUDING_64(vminq, detail::identity_return_type)
        WRAP_BINARY_FLOAT(vminq, detail::identity_return_type)

        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
        inline batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::excluding_int64_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vminq_u8, wrap::vminq_s8, wrap::vminq_u16, wrap::vminq_s16,
                                wrap::vminq_u32, wrap::vminq_s32, wrap::vminq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
        inline batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return { std::min(lhs.get(0), rhs.get(0)), std::min(lhs.get(1), rhs.get(1)) };
        }

        /*******
         * max *
         *******/

        WRAP_BINARY_INT_EXCLUDING_64(vmaxq, detail::identity_return_type)
        WRAP_BINARY_FLOAT(vmaxq, detail::identity_return_type)

        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
        inline batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::excluding_int64_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vmaxq_u8, wrap::vmaxq_s8, wrap::vmaxq_u16, wrap::vmaxq_s16,
                                wrap::vmaxq_u32, wrap::vmaxq_s32, wrap::vmaxq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
        inline batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return { std::max(lhs.get(0), rhs.get(0)), std::max(lhs.get(1), rhs.get(1)) };
        }

        /*******
         * abs *
         *******/

        namespace wrap
        {
            inline int8x16_t vabsq_s8(int8x16_t a) noexcept { return ::vabsq_s8(a); }
            inline int16x8_t vabsq_s16(int16x8_t a) noexcept { return ::vabsq_s16(a); }
            inline int32x4_t vabsq_s32(int32x4_t a) noexcept { return ::vabsq_s32(a); }
        }
        WRAP_UNARY_FLOAT(vabsq)

        namespace detail
        {
            inline uint8x16_t abs_u8(uint8x16_t arg) noexcept
            {
                return arg;
            }

            inline uint16x8_t abs_u16(uint16x8_t arg) noexcept
            {
                return arg;
            }

            inline uint32x4_t abs_u32(uint32x4_t arg) noexcept
            {
                return arg;
            }
        }

        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
        inline batch<T, A> abs(batch<T, A> const& arg, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::excluding_int64_dispatcher::unary dispatcher = {
                std::make_tuple(detail::abs_u8, wrap::vabsq_s8, detail::abs_u16, wrap::vabsq_s16,
                                detail::abs_u32, wrap::vabsq_s32, wrap::vabsq_f32)
            };
            return dispatcher.apply(register_type(arg));
        }

        /*********
         * rsqrt *
         *********/

        template <class A>
        inline batch<float, A> rsqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept
        {
            return vrsqrteq_f32(arg);
        }

        /********
         * sqrt *
         ********/

        template <class A>
        inline batch<float, A> sqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept
        {
            batch<float, A> sqrt_reciprocal = vrsqrteq_f32(arg);
            // one iter
            sqrt_reciprocal = sqrt_reciprocal * batch<float, A>(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal));
            batch<float, A> sqrt_approx = arg * sqrt_reciprocal * batch<float, A>(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal));
            batch<float, A> zero(0.f);
            return select(arg == zero, zero, sqrt_approx);
        }
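
        // Scalar sketch of the reciprocal-square-root refinement above
        // (illustrative only): vrsqrteq_f32 yields a rough estimate y of
        // 1/sqrt(x), vrsqrtsq_f32(a, b) evaluates (3 - a * b) / 2, and the
        // final product x * y approximates sqrt(x):
        //
        //     float y = rough_estimate_of(1.0f / std::sqrt(x)); // hypothetical initial estimate
        //     y = y * (3.0f - (x * y) * y) / 2.0f;              // Newton-Raphson step
        //     float root = x * y;                               // sqrt(x)
        //
        // The explicit select() keeps sqrt(0) == 0, since the reciprocal
        // estimate is infinite there and 0 * inf would produce a NaN.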

        /********************
         * Fused operations *
         ********************/

#ifdef __ARM_FEATURE_FMA
        template <class A>
        inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept
        {
            return vfmaq_f32(z, x, y);
        }

        template <class A>
        inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept
        {
            return vfmaq_f32(-z, x, y);
        }
#endif

        /*********
         * haddp *
         *********/

        template <class A>
        inline batch<float, A> haddp(const batch<float, A>* row, requires_arch<neon>) noexcept
        {
            // row = (a,b,c,d)
            float32x2_t tmp1, tmp2, tmp3;
            // tmp1 = (a0 + a1, a2 + a3)
            tmp1 = vpadd_f32(vget_low_f32(row[0]), vget_high_f32(row[0]));
            // tmp2 = (b0 + b1, b2 + b3)
            tmp2 = vpadd_f32(vget_low_f32(row[1]), vget_high_f32(row[1]));
            // tmp1 = (a0..3, b0..3)
            tmp1 = vpadd_f32(tmp1, tmp2);
            // tmp2 = (c0 + c1, c2 + c3)
            tmp2 = vpadd_f32(vget_low_f32(row[2]), vget_high_f32(row[2]));
            // tmp3 = (d0 + d1, d2 + d3)
            tmp3 = vpadd_f32(vget_low_f32(row[3]), vget_high_f32(row[3]));
            // tmp2 = (c0..3, d0..3)
            tmp2 = vpadd_f32(tmp2, tmp3);
            // return = (a0..3, b0..3, c0..3, d0..3)
            return vcombine_f32(tmp1, tmp2);
        }
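
        // In other words (illustrative summary): for rows (a, b, c, d), lane i of
        // the result holds the horizontal sum of row[i], i.e.
        //
        //     result = { a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3 };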

        /**************
         * reciprocal *
         **************/

        template <class A>
        inline batch<float, A>
        reciprocal(const batch<float, A>& x,
                   kernel::requires_arch<neon>) noexcept
        {
            return vrecpeq_f32(x);
        }

        /**********
         * insert *
         **********/

        template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 1> = 0>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
        {
            return vsetq_lane_u8(val, self, I);
        }

        template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 1> = 0>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
        {
            return vsetq_lane_s8(val, self, I);
        }

        template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 2> = 0>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
        {
            return vsetq_lane_u16(val, self, I);
        }

        template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 2> = 0>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
        {
            return vsetq_lane_s16(val, self, I);
        }

        template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 4> = 0>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
        {
            return vsetq_lane_u32(val, self, I);
        }

        template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 4> = 0>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
        {
            return vsetq_lane_s32(val, self, I);
        }

        template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 8> = 0>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
        {
            return vsetq_lane_u64(val, self, I);
        }

        template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 8> = 0>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
        {
            return vsetq_lane_s64(val, self, I);
        }

        template <class A, size_t I>
        inline batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<neon>) noexcept
        {
            return vsetq_lane_f32(val, self, I);
        }

        /********************
         * nearbyint_as_int *
         ********************/

        template <class A>
        inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
                                                  requires_arch<neon>) noexcept
        {
            /* origin: https://github.com/DLTcollab/sse2neon/blob/cad518a93b326f0f644b7972d488d04eaa2b0475/sse2neon.h#L4028-L4047 */
            // Contributors to this work are:
            // John W. Ratcliff <jratcliffscarab@gmail.com>
            // Brandon Rowlett <browlett@nvidia.com>
            // Ken Fast <kfast@gdeb.com>
            // Eric van Beurden <evanbeurden@nvidia.com>
            // Alexander Potylitsin <apotylitsin@nvidia.com>
            // Hasindu Gamaarachchi <hasindu2008@gmail.com>
            // Jim Huang <jserv@biilabs.io>
            // Mark Cheng <marktwtn@biilabs.io>
            // Malcolm James MacLeod <malcolm@gulden.com>
            // Devin Hussey (easyaspi314) <husseydevin@gmail.com>
            // Sebastian Pop <spop@amazon.com>
            // Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
            // Danila Kutenin <danilak@google.com>
            // François Turban (JishinMaster) <francois.turban@gmail.com>
            // Pei-Hsuan Hung <afcidk@gmail.com>
            // Yang-Hao Yuan <yanghau@biilabs.io>
            // Syoyo Fujita <syoyo@lighttransport.com>
            // Brecht Van Lommel <brecht@blender.org>

            /*
             * sse2neon is freely redistributable under the MIT License.
             *
             * Permission is hereby granted, free of charge, to any person obtaining a copy
             * of this software and associated documentation files (the "Software"), to deal
             * in the Software without restriction, including without limitation the rights
             * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
             * copies of the Software, and to permit persons to whom the Software is
             * furnished to do so, subject to the following conditions:
             *
             * The above copyright notice and this permission notice shall be included in
             * all copies or substantial portions of the Software.
             *
             * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
             * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
             * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
             * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
             * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
             * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
             * SOFTWARE.
             */

            const auto signmask = vdupq_n_u32(0x80000000);
            const auto half = vbslq_f32(signmask, self,
                                        vdupq_n_f32(0.5f)); /* +/- 0.5 */
            const auto r_normal = vcvtq_s32_f32(vaddq_f32(
                self, half)); /* round to integer: [a + 0.5] */
            const auto r_trunc = vcvtq_s32_f32(self); /* truncate to integer: [a] */
            const auto plusone = vreinterpretq_s32_u32(vshrq_n_u32(
                vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
            const auto r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
                                          vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
            const auto delta = vsubq_f32(
                self,
                vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
            const auto is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
            return vbslq_s32(is_delta_half, r_even, r_normal);
        }
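
        // Per-lane sketch of the rounding rule implemented above (illustrative
        // only): ties are rounded to the nearest even integer, matching
        // std::nearbyint in the default rounding mode, e.g.
        //
        //     0.5f  -> 0
        //     1.5f  -> 2
        //     -0.5f -> 0
        //     2.3f  -> 2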
1493
1494 /**************
1495 * reduce_add *
1496 **************/
1497
1498 namespace detail
1499 {
1500 template <class T, class A, class V>
1501 inline T sum_batch(V const& arg) noexcept
1502 {
1503 T res = T(0);
1504 for (std::size_t i = 0; i < batch<T, A>::size; ++i)
1505 {
1506 res += arg[i];
1507 }
1508 return res;
1509 }
1510 }
1511
1512 template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
1513 inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1514 {
1515 uint8x8_t tmp = vpadd_u8(vget_low_u8(arg), vget_high_u8(arg));
1516 tmp = vpadd_u8(p0: tmp, p1: tmp);
1517 tmp = vpadd_u8(p0: tmp, p1: tmp);
1518 tmp = vpadd_u8(p0: tmp, p1: tmp);
1519 return vget_lane_u8(tmp, 0);
1520 }
1521
1522 template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
1523 inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1524 {
1525 int8x8_t tmp = vpadd_s8(vget_low_s8(arg), vget_high_s8(arg));
1526 tmp = vpadd_s8(p0: tmp, p1: tmp);
1527 tmp = vpadd_s8(p0: tmp, p1: tmp);
1528 tmp = vpadd_s8(p0: tmp, p1: tmp);
1529 return vget_lane_s8(tmp, 0);
1530 }
1531
1532 template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
1533 inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1534 {
1535 uint16x4_t tmp = vpadd_u16(vget_low_u16(arg), vget_high_u16(arg));
1536 tmp = vpadd_u16(p0: tmp, p1: tmp);
1537 tmp = vpadd_u16(p0: tmp, p1: tmp);
1538 return vget_lane_u16(tmp, 0);
1539 }
1540
1541 template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
1542 inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1543 {
1544 int16x4_t tmp = vpadd_s16(vget_low_s16(arg), vget_high_s16(arg));
1545 tmp = vpadd_s16(p0: tmp, p1: tmp);
1546 tmp = vpadd_s16(p0: tmp, p1: tmp);
1547 return vget_lane_s16(tmp, 0);
1548 }
1549
1550 template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
1551 inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1552 {
1553 uint32x2_t tmp = vpadd_u32(vget_low_u32(arg), vget_high_u32(arg));
1554 tmp = vpadd_u32(p0: tmp, p1: tmp);
1555 return vget_lane_u32(tmp, 0);
1556 }
1557
1558 template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
1559 inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1560 {
1561 int32x2_t tmp = vpadd_s32(vget_low_s32(arg), vget_high_s32(arg));
1562 tmp = vpadd_s32(p0: tmp, p1: tmp);
1563 return vget_lane_s32(tmp, 0);
1564 }
1565
1566 template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
1567 inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1568 {
1569 return arg.get(0) + arg.get(1);
1570 }
1571
1572 template <class A>
1573 inline float reduce_add(batch<float, A> const& arg, requires_arch<neon>) noexcept
1574 {
1575 float32x2_t tmp = vpadd_f32(vget_low_f32(arg), vget_high_f32(arg));
1576 tmp = vpadd_f32(p0: tmp, p1: tmp);
1577 return vget_lane_f32(tmp, 0);
1578 }
1579
1580 /**************
1581 * reduce_max *
1582 **************/
1583
        // Using the generic implementation because ARM does not provide intrinsics
        // for this operation
1586
1587 /**************
1588 * reduce_min *
1589 **************/
1590
        // Using the generic implementation because ARM does not provide intrinsics
        // for this operation
1593
1594 /**********
1595 * select *
1596 **********/
1597
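        // vbslq_* performs a bitwise select: result bits come from the second
        // operand where the mask bit is set, and from the third otherwise.
        // Wrapping the intrinsics lets them be stored as function pointers below.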
1598 namespace wrap
1599 {
            inline uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) noexcept { return ::vbslq_u8(a, b, c); }
            inline int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) noexcept { return ::vbslq_s8(a, b, c); }
            inline uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) noexcept { return ::vbslq_u16(a, b, c); }
            inline int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) noexcept { return ::vbslq_s16(a, b, c); }
            inline uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) noexcept { return ::vbslq_u32(a, b, c); }
            inline int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) noexcept { return ::vbslq_s32(a, b, c); }
            inline uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) noexcept { return ::vbslq_u64(a, b, c); }
            inline int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) noexcept { return ::vbslq_s64(a, b, c); }
            inline float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) noexcept { return ::vbslq_f32(a, b, c); }
1609 }
1610
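        // Dispatcher holding one vbslq_* function pointer per register type;
        // apply() picks the entry matching the requested vector type.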
1611 namespace detail
1612 {
1613 template <class... T>
1614 struct neon_select_dispatcher_impl
1615 {
1616 using container_type = std::tuple<T (*)(comp_return_type<T>, T, T)...>;
1617 const container_type m_func;
1618
1619 template <class U>
1620 U apply(comp_return_type<U> cond, U lhs, U rhs) const noexcept
1621 {
1622 using func_type = U (*)(comp_return_type<U>, U, U);
1623 auto func = xsimd::detail::get<func_type>(m_func);
1624 return func(cond, lhs, rhs);
1625 }
1626 };
1627
1628 using neon_select_dispatcher = neon_select_dispatcher_impl<uint8x16_t, int8x16_t,
1629 uint16x8_t, int16x8_t,
1630 uint32x4_t, int32x4_t,
1631 uint64x2_t, int64x2_t,
1632 float32x4_t>;
1633 }
1634
1635 template <class A, class T, detail::enable_neon_type_t<T> = 0>
1636 inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<neon>) noexcept
1637 {
1638 using bool_register_type = typename batch_bool<T, A>::register_type;
1639 using register_type = typename batch<T, A>::register_type;
1640 const detail::neon_select_dispatcher dispatcher = {
            std::make_tuple(wrap::vbslq_u8, wrap::vbslq_s8, wrap::vbslq_u16, wrap::vbslq_s16,
                            wrap::vbslq_u32, wrap::vbslq_s32, wrap::vbslq_u64, wrap::vbslq_s64,
                            wrap::vbslq_f32)
1644 };
1645 return dispatcher.apply(bool_register_type(cond), register_type(a), register_type(b));
1646 }
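        // Illustrative usage: select(a < b, a, b, neon {}) yields the per-lane
        // minimum of a and b.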
1647
1648 template <class A, class T, bool... b, detail::enable_neon_type_t<T> = 0>
1649 inline batch<T, A> select(batch_bool_constant<batch<T, A>, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<neon>) noexcept
1650 {
1651 return select(batch_bool<T, A> { b... }, true_br, false_br, neon {});
1652 }
1653
1654 /**********
1655 * zip_lo *
1656 **********/
1657
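        // vzip_* interleaves the low halves of both operands lane by lane and
        // vcombine_* rebuilds a full 128-bit register; the 64-bit overloads
        // simply concatenate the two low lanes.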
1658 template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
1659 inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1660 {
1661 uint8x8x2_t tmp = vzip_u8(vget_low_u8(lhs), vget_low_u8(rhs));
            return vcombine_u8(tmp.val[0], tmp.val[1]);
1663 }
1664
1665 template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
1666 inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1667 {
1668 int8x8x2_t tmp = vzip_s8(vget_low_s8(lhs), vget_low_s8(rhs));
            return vcombine_s8(tmp.val[0], tmp.val[1]);
1670 }
1671
1672 template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
1673 inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1674 {
1675 uint16x4x2_t tmp = vzip_u16(vget_low_u16(lhs), vget_low_u16(rhs));
            return vcombine_u16(tmp.val[0], tmp.val[1]);
1677 }
1678
1679 template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
1680 inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1681 {
1682 int16x4x2_t tmp = vzip_s16(vget_low_s16(lhs), vget_low_s16(rhs));
            return vcombine_s16(tmp.val[0], tmp.val[1]);
1684 }
1685
1686 template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
1687 inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1688 {
1689 uint32x2x2_t tmp = vzip_u32(vget_low_u32(lhs), vget_low_u32(rhs));
            return vcombine_u32(tmp.val[0], tmp.val[1]);
1691 }
1692
1693 template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
1694 inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1695 {
1696 int32x2x2_t tmp = vzip_s32(vget_low_s32(lhs), vget_low_s32(rhs));
            return vcombine_s32(tmp.val[0], tmp.val[1]);
1698 }
1699
1700 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
1701 inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1702 {
1703 return vcombine_u64(vget_low_u64(lhs), vget_low_u64(rhs));
1704 }
1705
1706 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
1707 inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1708 {
1709 return vcombine_s64(vget_low_s64(lhs), vget_low_s64(rhs));
1710 }
1711
1712 template <class A>
1713 inline batch<float, A> zip_lo(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
1714 {
1715 float32x2x2_t tmp = vzip_f32(vget_low_f32(lhs), vget_low_f32(rhs));
            return vcombine_f32(tmp.val[0], tmp.val[1]);
1717 }
1718
1719 /**********
1720 * zip_hi *
1721 **********/
1722
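        // Same interleaving scheme as zip_lo, applied to the high halves.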
1723 template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
1724 inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1725 {
1726 uint8x8x2_t tmp = vzip_u8(vget_high_u8(lhs), vget_high_u8(rhs));
            return vcombine_u8(tmp.val[0], tmp.val[1]);
1728 }
1729
1730 template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
1731 inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1732 {
1733 int8x8x2_t tmp = vzip_s8(vget_high_s8(lhs), vget_high_s8(rhs));
            return vcombine_s8(tmp.val[0], tmp.val[1]);
1735 }
1736
1737 template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
1738 inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1739 {
1740 uint16x4x2_t tmp = vzip_u16(vget_high_u16(lhs), vget_high_u16(rhs));
            return vcombine_u16(tmp.val[0], tmp.val[1]);
1742 }
1743
1744 template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
1745 inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1746 {
1747 int16x4x2_t tmp = vzip_s16(vget_high_s16(lhs), vget_high_s16(rhs));
            return vcombine_s16(tmp.val[0], tmp.val[1]);
1749 }
1750
1751 template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
1752 inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1753 {
1754 uint32x2x2_t tmp = vzip_u32(vget_high_u32(lhs), vget_high_u32(rhs));
            return vcombine_u32(tmp.val[0], tmp.val[1]);
1756 }
1757
1758 template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
1759 inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1760 {
1761 int32x2x2_t tmp = vzip_s32(vget_high_s32(lhs), vget_high_s32(rhs));
            return vcombine_s32(tmp.val[0], tmp.val[1]);
1763 }
1764
1765 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
1766 inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1767 {
1768 return vcombine_u64(vget_high_u64(lhs), vget_high_u64(rhs));
1769 }
1770
1771 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
1772 inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1773 {
1774 return vcombine_s64(vget_high_s64(lhs), vget_high_s64(rhs));
1775 }
1776
1777 template <class A>
1778 inline batch<float, A> zip_hi(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
1779 {
1780 float32x2x2_t tmp = vzip_f32(vget_high_f32(lhs), vget_high_f32(rhs));
            return vcombine_f32(tmp.val[0], tmp.val[1]);
1782 }
1783
1784 /****************
1785 * extract_pair *
1786 ****************/
1787
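        // vextq_* requires a compile-time immediate, so the helpers below recurse
        // over an index_sequence to match the runtime offset n against a constant I;
        // for n == I the result is { rhs[I..size-1], lhs[0..I-1] }. The empty-sequence
        // overload is only reached on an out-of-bounds offset.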
1788 namespace detail
1789 {
1790 template <class A, class T>
1791 inline batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& /*rhs*/, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
1792 {
1793 assert(false && "extract_pair out of bounds");
1794 return batch<T, A> {};
1795 }
1796
1797 template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 1> = 0>
1798 inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1799 {
1800 if (n == I)
1801 {
1802 return vextq_u8(rhs, lhs, I);
1803 }
1804 else
1805 {
1806 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
1807 }
1808 }
1809
1810 template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 1> = 0>
1811 inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1812 {
1813 if (n == I)
1814 {
1815 return vextq_s8(rhs, lhs, I);
1816 }
1817 else
1818 {
1819 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
1820 }
1821 }
1822
1823 template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 2> = 0>
1824 inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1825 {
1826 if (n == I)
1827 {
1828 return vextq_u16(rhs, lhs, I);
1829 }
1830 else
1831 {
1832 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
1833 }
1834 }
1835
1836 template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 2> = 0>
1837 inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1838 {
1839 if (n == I)
1840 {
1841 return vextq_s16(rhs, lhs, I);
1842 }
1843 else
1844 {
1845 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
1846 }
1847 }
1848
1849 template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 4> = 0>
1850 inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1851 {
1852 if (n == I)
1853 {
1854 return vextq_u32(rhs, lhs, I);
1855 }
1856 else
1857 {
1858 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
1859 }
1860 }
1861
1862 template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 4> = 0>
1863 inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1864 {
1865 if (n == I)
1866 {
1867 return vextq_s32(rhs, lhs, I);
1868 }
1869 else
1870 {
1871 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
1872 }
1873 }
1874
1875 template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 8> = 0>
1876 inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1877 {
1878 if (n == I)
1879 {
1880 return vextq_u64(rhs, lhs, I);
1881 }
1882 else
1883 {
1884 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
1885 }
1886 }
1887
1888 template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 8> = 0>
1889 inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1890 {
1891 if (n == I)
1892 {
1893 return vextq_s64(rhs, lhs, I);
1894 }
1895 else
1896 {
1897 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
1898 }
1899 }
1900
1901 template <class A, size_t I, size_t... Is>
1902 inline batch<float, A> extract_pair(batch<float, A> const& lhs, batch<float, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1903 {
1904 if (n == I)
1905 {
1906 return vextq_f32(rhs, lhs, I);
1907 }
1908 else
1909 {
1910 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
1911 }
1912 }
1913
1914 template <class A, class T, size_t... Is>
1915 inline batch<T, A> extract_pair_impl(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<0, Is...>) noexcept
1916 {
1917 if (n == 0)
1918 {
1919 return rhs;
1920 }
1921 else
1922 {
1923 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
1924 }
1925 }
1926 }
1927
1928 template <class A, class T>
1929 inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, requires_arch<neon>) noexcept
1930 {
1931 constexpr std::size_t size = batch<T, A>::size;
1932 assert(n < size && "index in bounds");
1933 return detail::extract_pair_impl(lhs, rhs, n, ::xsimd::detail::make_index_sequence<size>());
1934 }
1935
1936 /******************
1937 * bitwise_lshift *
1938 ******************/
1939
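        // Same runtime-to-immediate dispatch as extract_pair: vshlq_n_* only accepts
        // a compile-time shift amount, and a shift by 0 returns the input unchanged.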
1940 namespace detail
1941 {
1942 template <class A, class T>
1943 inline batch<T, A> bitwise_lshift(batch<T, A> const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept
1944 {
1945 assert(false && "bitwise_lshift out of bounds");
1946 return batch<T, A> {};
1947 }
1948
1949 template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 1> = 0>
1950 inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
1951 {
1952 if (n == I)
1953 {
1954 return vshlq_n_u8(lhs, I);
1955 }
1956 else
1957 {
1958 return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
1959 }
1960 }
1961
1962 template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 1> = 0>
1963 inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
1964 {
1965 if (n == I)
1966 {
1967 return vshlq_n_s8(lhs, I);
1968 }
1969 else
1970 {
1971 return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
1972 }
1973 }
1974
1975 template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 2> = 0>
1976 inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
1977 {
1978 if (n == I)
1979 {
1980 return vshlq_n_u16(lhs, I);
1981 }
1982 else
1983 {
1984 return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
1985 }
1986 }
1987
1988 template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 2> = 0>
1989 inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
1990 {
1991 if (n == I)
1992 {
1993 return vshlq_n_s16(lhs, I);
1994 }
1995 else
1996 {
1997 return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
1998 }
1999 }
2000
2001 template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 4> = 0>
2002 inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2003 {
2004 if (n == I)
2005 {
2006 return vshlq_n_u32(lhs, I);
2007 }
2008 else
2009 {
2010 return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2011 }
2012 }
2013
2014 template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 4> = 0>
2015 inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2016 {
2017 if (n == I)
2018 {
2019 return vshlq_n_s32(lhs, I);
2020 }
2021 else
2022 {
2023 return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2024 }
2025 }
2026
2027 template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 8> = 0>
2028 inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2029 {
2030 if (n == I)
2031 {
2032 return vshlq_n_u64(lhs, I);
2033 }
2034 else
2035 {
2036 return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2037 }
2038 }
2039
2040 template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 8> = 0>
2041 inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2042 {
2043 if (n == I)
2044 {
2045 return vshlq_n_s64(lhs, I);
2046 }
2047 else
2048 {
2049 return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2050 }
2051 }
2052
2053 template <class A, class T, int... Is>
2054 inline batch<T, A> bitwise_lshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept
2055 {
2056 if (n == 0)
2057 {
2058 return lhs;
2059 }
2060 else
2061 {
2062 return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2063 }
2064 }
2065 }
2066
2067 template <class A, class T>
2068 inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept
2069 {
2070 constexpr int size = sizeof(typename batch<T, A>::value_type) * 8;
2071 assert(0 <= n && n < size && "index in bounds");
2072 return detail::bitwise_lshift_impl(lhs, n, ::xsimd::detail::make_int_sequence<size>());
2073 }
2074
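        // Per-lane variable shifts: vshlq_* shifts each lane left by the
        // corresponding (signed) count taken from the second operand.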
2075 template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
2076 inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2077 {
2078 return vshlq_u8(lhs, rhs);
2079 }
2080
2081 template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
2082 inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2083 {
2084 return vshlq_s8(lhs, rhs);
2085 }
2086
2087 template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
2088 inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2089 {
2090 return vshlq_u16(lhs, rhs);
2091 }
2092
2093 template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
2094 inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2095 {
2096 return vshlq_s16(lhs, rhs);
2097 }
2098
2099 template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
2100 inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2101 {
2102 return vshlq_u32(lhs, rhs);
2103 }
2104
2105 template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
2106 inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2107 {
2108 return vshlq_s32(lhs, rhs);
2109 }
2110
2111 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
2112 inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2113 {
2114 return vshlq_u64(lhs, rhs);
2115 }
2116
2117 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
2118 inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2119 {
2120 return vshlq_s64(lhs, rhs);
2121 }
2122
2123 /******************
2124 * bitwise_rshift *
2125 ******************/
2126
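        // Mirror of bitwise_lshift: vshrq_n_* also requires a compile-time
        // immediate, hence the same int_sequence dispatch.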
2127 namespace detail
2128 {
2129 template <class A, class T>
2130 inline batch<T, A> bitwise_rshift(batch<T, A> const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept
2131 {
2132 assert(false && "bitwise_rshift out of bounds");
2133 return batch<T, A> {};
2134 }
2135
2136 template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 1> = 0>
2137 inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2138 {
2139 if (n == I)
2140 {
2141 return vshrq_n_u8(lhs, I);
2142 }
2143 else
2144 {
2145 return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2146 }
2147 }
2148
2149 template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 1> = 0>
2150 inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2151 {
2152 if (n == I)
2153 {
2154 return vshrq_n_s8(lhs, I);
2155 }
2156 else
2157 {
2158 return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2159 }
2160 }
2161
2162 template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 2> = 0>
2163 inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2164 {
2165 if (n == I)
2166 {
2167 return vshrq_n_u16(lhs, I);
2168 }
2169 else
2170 {
2171 return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2172 }
2173 }
2174
2175 template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 2> = 0>
2176 inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2177 {
2178 if (n == I)
2179 {
2180 return vshrq_n_s16(lhs, I);
2181 }
2182 else
2183 {
2184 return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2185 }
2186 }
2187
2188 template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 4> = 0>
2189 inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2190 {
2191 if (n == I)
2192 {
2193 return vshrq_n_u32(lhs, I);
2194 }
2195 else
2196 {
2197 return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2198 }
2199 }
2200
2201 template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 4> = 0>
2202 inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2203 {
2204 if (n == I)
2205 {
2206 return vshrq_n_s32(lhs, I);
2207 }
2208 else
2209 {
2210 return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2211 }
2212 }
2213
2214 template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 8> = 0>
2215 inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2216 {
2217 if (n == I)
2218 {
2219 return vshrq_n_u64(lhs, I);
2220 }
2221 else
2222 {
2223 return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2224 }
2225 }
2226
2227 template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 8> = 0>
2228 inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2229 {
2230 if (n == I)
2231 {
2232 return vshrq_n_s64(lhs, I);
2233 }
2234 else
2235 {
2236 return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2237 }
2238 }
2239
2240 template <class A, class T, int... Is>
2241 inline batch<T, A> bitwise_rshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept
2242 {
2243 if (n == 0)
2244 {
2245 return lhs;
2246 }
2247 else
2248 {
2249 return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2250 }
2251 }
2252 }
2253
2254 template <class A, class T>
2255 inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept
2256 {
2257 constexpr int size = sizeof(typename batch<T, A>::value_type) * 8;
2258 assert(0 <= n && n < size && "index in bounds");
2259 return detail::bitwise_rshift_impl(lhs, n, ::xsimd::detail::make_int_sequence<size>());
2260 }
2261
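        // NEON has no per-lane right shift by a runtime vector of counts;
        // vshlq_* shifts left by a signed amount, so the counts are negated.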
2262 template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
2263 inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2264 {
2265 return vshlq_u8(lhs, vnegq_s8(rhs));
2266 }
2267
2268 template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
2269 inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2270 {
2271 return vshlq_s8(lhs, vnegq_s8(rhs));
2272 }
2273
2274 template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
2275 inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2276 {
2277 return vshlq_u16(lhs, vnegq_s16(rhs));
2278 }
2279
2280 template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
2281 inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2282 {
2283 return vshlq_s16(lhs, vnegq_s16(rhs));
2284 }
2285
2286 template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
2287 inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2288 {
2289 return vshlq_u32(lhs, vnegq_s32(rhs));
2290 }
2291
2292 template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
2293 inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2294 {
2295 return vshlq_s32(lhs, vnegq_s32(rhs));
2296 }
2297
2298 // Overloads of bitwise shifts accepting two batches of uint64/int64 are not available with ARMv7
2299
2300 /*******
2301 * all *
2302 *******/
2303
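        // The 64-bit overload ANDs the two halves and checks for all ones; the
        // narrower overloads reinterpret the mask as 64-bit lanes, which is valid
        // because batch_bool lanes are either all zeros or all ones.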
2304 template <class A, class T, detail::enable_sized_t<T, 8> = 0>
2305 inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2306 {
2307 uint64x1_t tmp = vand_u64(vget_low_u64(arg), vget_high_u64(arg));
2308 return vget_lane_u64(tmp, 0) == ~0ULL;
2309 }
2310
2311 template <class A, class T, detail::enable_sized_t<T, 1> = 0>
2312 inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2313 {
2314 return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u8(arg)), neon {});
2315 }
2316
2317 template <class A, class T, detail::enable_sized_t<T, 2> = 0>
2318 inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2319 {
2320 return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u16(arg)), neon {});
2321 }
2322
2323 template <class A, class T, detail::enable_sized_t<T, 4> = 0>
2324 inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2325 {
2326 return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {});
2327 }
2328
2329 /*******
2330 * any *
2331 *******/
2332
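        // vqmovn_u64 saturate-narrows the two 64-bit lanes to 32 bits each; the
        // pair is then read back as a single 64-bit value, which is non-zero if
        // and only if at least one lane is set.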
2333 template <class A, class T, detail::enable_sized_t<T, 8> = 0>
2334 inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2335 {
2336 uint32x2_t tmp = vqmovn_u64(arg);
2337 return vget_lane_u64(vreinterpret_u64_u32(tmp), 0) != 0;
2338 }
2339
2340 template <class A, class T, detail::enable_sized_t<T, 1> = 0>
2341 inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2342 {
2343 return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u8(arg)), neon {});
2344 }
2345
2346 template <class A, class T, detail::enable_sized_t<T, 2> = 0>
2347 inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2348 {
2349 return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u16(arg)), neon {});
2350 }
2351
2352 template <class A, class T, detail::enable_sized_t<T, 4> = 0>
2353 inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2354 {
2355 return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {});
2356 }
2357
2358 /****************
2359 * bitwise_cast *
2360 ****************/
2361
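// WRAP_CAST generates wrappers for every vreinterpretq_<dst>_<src> combination so
// they can populate the cast table below; bitwise_cast then selects the row by
// destination register type and the entry by source register type.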
2362#define WRAP_CAST(SUFFIX, TYPE) \
2363 namespace wrap \
2364 { \
2365 inline TYPE vreinterpretq_##SUFFIX##_u8(uint8x16_t a) noexcept \
2366 { \
2367 return ::vreinterpretq_##SUFFIX##_u8(a); \
2368 } \
2369 inline TYPE vreinterpretq_##SUFFIX##_s8(int8x16_t a) noexcept \
2370 { \
2371 return ::vreinterpretq_##SUFFIX##_s8(a); \
2372 } \
2373 inline TYPE vreinterpretq_##SUFFIX##_u16(uint16x8_t a) noexcept \
2374 { \
2375 return ::vreinterpretq_##SUFFIX##_u16(a); \
2376 } \
2377 inline TYPE vreinterpretq_##SUFFIX##_s16(int16x8_t a) noexcept \
2378 { \
2379 return ::vreinterpretq_##SUFFIX##_s16(a); \
2380 } \
2381 inline TYPE vreinterpretq_##SUFFIX##_u32(uint32x4_t a) noexcept \
2382 { \
2383 return ::vreinterpretq_##SUFFIX##_u32(a); \
2384 } \
2385 inline TYPE vreinterpretq_##SUFFIX##_s32(int32x4_t a) noexcept \
2386 { \
2387 return ::vreinterpretq_##SUFFIX##_s32(a); \
2388 } \
2389 inline TYPE vreinterpretq_##SUFFIX##_u64(uint64x2_t a) noexcept \
2390 { \
2391 return ::vreinterpretq_##SUFFIX##_u64(a); \
2392 } \
2393 inline TYPE vreinterpretq_##SUFFIX##_s64(int64x2_t a) noexcept \
2394 { \
2395 return ::vreinterpretq_##SUFFIX##_s64(a); \
2396 } \
2397 inline TYPE vreinterpretq_##SUFFIX##_f32(float32x4_t a) noexcept \
2398 { \
2399 return ::vreinterpretq_##SUFFIX##_f32(a); \
2400 } \
2401 }
2402
2403 WRAP_CAST(u8, uint8x16_t)
2404 WRAP_CAST(s8, int8x16_t)
2405 WRAP_CAST(u16, uint16x8_t)
2406 WRAP_CAST(s16, int16x8_t)
2407 WRAP_CAST(u32, uint32x4_t)
2408 WRAP_CAST(s32, int32x4_t)
2409 WRAP_CAST(u64, uint64x2_t)
2410 WRAP_CAST(s64, int64x2_t)
2411 WRAP_CAST(f32, float32x4_t)
2412
2413#undef WRAP_CAST
2414
2415 namespace detail
2416 {
2417 template <class R, class... T>
2418 struct bitwise_caster_impl
2419 {
2420 using container_type = std::tuple<R (*)(T)...>;
2421 container_type m_func;
2422
2423 template <class U>
2424 R apply(U rhs) const noexcept
2425 {
2426 using func_type = R (*)(U);
2427 auto func = xsimd::detail::get<func_type>(m_func);
2428 return func(rhs);
2429 }
2430 };
2431
2432 template <class R, class... T>
2433 inline const bitwise_caster_impl<R, T...> make_bitwise_caster_impl(R (*... arg)(T)) noexcept
2434 {
2435 return { std::make_tuple(arg...) };
2436 }
2437
2438 template <class... T>
2439 struct type_list
2440 {
2441 };
2442
2443 template <class RTL, class TTL>
2444 struct bitwise_caster;
2445
2446 template <class... R, class... T>
2447 struct bitwise_caster<type_list<R...>, type_list<T...>>
2448 {
2449 using container_type = std::tuple<bitwise_caster_impl<R, T...>...>;
2450 container_type m_caster;
2451
2452 template <class V, class U>
2453 V apply(U rhs) const noexcept
2454 {
2455 using caster_type = bitwise_caster_impl<V, T...>;
2456 auto caster = xsimd::detail::get<caster_type>(m_caster);
2457 return caster.apply(rhs);
2458 }
2459 };
2460
2461 template <class... T>
2462 using bitwise_caster_t = bitwise_caster<type_list<T...>, type_list<T...>>;
2463
2464 using neon_bitwise_caster = bitwise_caster_t<uint8x16_t, int8x16_t,
2465 uint16x8_t, int16x8_t,
2466 uint32x4_t, int32x4_t,
2467 uint64x2_t, int64x2_t,
2468 float32x4_t>;
2469 }
2470
2471 template <class A, class T, class R>
2472 inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<neon>) noexcept
2473 {
2474 const detail::neon_bitwise_caster caster = {
            std::make_tuple(
                detail::make_bitwise_caster_impl(wrap::vreinterpretq_u8_u8, wrap::vreinterpretq_u8_s8, wrap::vreinterpretq_u8_u16, wrap::vreinterpretq_u8_s16,
                                                 wrap::vreinterpretq_u8_u32, wrap::vreinterpretq_u8_s32, wrap::vreinterpretq_u8_u64, wrap::vreinterpretq_u8_s64,
                                                 wrap::vreinterpretq_u8_f32),
                detail::make_bitwise_caster_impl(wrap::vreinterpretq_s8_u8, wrap::vreinterpretq_s8_s8, wrap::vreinterpretq_s8_u16, wrap::vreinterpretq_s8_s16,
                                                 wrap::vreinterpretq_s8_u32, wrap::vreinterpretq_s8_s32, wrap::vreinterpretq_s8_u64, wrap::vreinterpretq_s8_s64,
                                                 wrap::vreinterpretq_s8_f32),
                detail::make_bitwise_caster_impl(wrap::vreinterpretq_u16_u8, wrap::vreinterpretq_u16_s8, wrap::vreinterpretq_u16_u16, wrap::vreinterpretq_u16_s16,
                                                 wrap::vreinterpretq_u16_u32, wrap::vreinterpretq_u16_s32, wrap::vreinterpretq_u16_u64, wrap::vreinterpretq_u16_s64,
                                                 wrap::vreinterpretq_u16_f32),
                detail::make_bitwise_caster_impl(wrap::vreinterpretq_s16_u8, wrap::vreinterpretq_s16_s8, wrap::vreinterpretq_s16_u16, wrap::vreinterpretq_s16_s16,
                                                 wrap::vreinterpretq_s16_u32, wrap::vreinterpretq_s16_s32, wrap::vreinterpretq_s16_u64, wrap::vreinterpretq_s16_s64,
                                                 wrap::vreinterpretq_s16_f32),
                detail::make_bitwise_caster_impl(wrap::vreinterpretq_u32_u8, wrap::vreinterpretq_u32_s8, wrap::vreinterpretq_u32_u16, wrap::vreinterpretq_u32_s16,
                                                 wrap::vreinterpretq_u32_u32, wrap::vreinterpretq_u32_s32, wrap::vreinterpretq_u32_u64, wrap::vreinterpretq_u32_s64,
                                                 wrap::vreinterpretq_u32_f32),
                detail::make_bitwise_caster_impl(wrap::vreinterpretq_s32_u8, wrap::vreinterpretq_s32_s8, wrap::vreinterpretq_s32_u16, wrap::vreinterpretq_s32_s16,
                                                 wrap::vreinterpretq_s32_u32, wrap::vreinterpretq_s32_s32, wrap::vreinterpretq_s32_u64, wrap::vreinterpretq_s32_s64,
                                                 wrap::vreinterpretq_s32_f32),
                detail::make_bitwise_caster_impl(wrap::vreinterpretq_u64_u8, wrap::vreinterpretq_u64_s8, wrap::vreinterpretq_u64_u16, wrap::vreinterpretq_u64_s16,
                                                 wrap::vreinterpretq_u64_u32, wrap::vreinterpretq_u64_s32, wrap::vreinterpretq_u64_u64, wrap::vreinterpretq_u64_s64,
                                                 wrap::vreinterpretq_u64_f32),
                detail::make_bitwise_caster_impl(wrap::vreinterpretq_s64_u8, wrap::vreinterpretq_s64_s8, wrap::vreinterpretq_s64_u16, wrap::vreinterpretq_s64_s16,
                                                 wrap::vreinterpretq_s64_u32, wrap::vreinterpretq_s64_s32, wrap::vreinterpretq_s64_u64, wrap::vreinterpretq_s64_s64,
                                                 wrap::vreinterpretq_s64_f32),
                detail::make_bitwise_caster_impl(wrap::vreinterpretq_f32_u8, wrap::vreinterpretq_f32_s8, wrap::vreinterpretq_f32_u16, wrap::vreinterpretq_f32_s16,
                                                 wrap::vreinterpretq_f32_u32, wrap::vreinterpretq_f32_s32, wrap::vreinterpretq_f32_u64, wrap::vreinterpretq_f32_s64,
                                                 wrap::vreinterpretq_f32_f32))
2503 };
2504 using src_register_type = typename batch<T, A>::register_type;
2505 using dst_register_type = typename batch<R, A>::register_type;
2506 return caster.apply<dst_register_type>(src_register_type(arg));
2507 }
2508
2509 /*********
2510 * isnan *
2511 *********/
2512
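        // NaN is the only value that does not compare equal to itself.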
2513 template <class A>
2514 inline batch_bool<float, A> isnan(batch<float, A> const& arg, requires_arch<neon>) noexcept
2515 {
2516 return !(arg == arg);
2517 }
2518
2519 // slide_left
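        // The batch is viewed as bytes: vextq_u8 with a zero vector on the left
        // moves the content by N bytes towards the higher lane indices, filling
        // the low bytes with zeros (slider_left<0> is the identity).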
2520 namespace detail
2521 {
2522 template <size_t N>
2523 struct slider_left
2524 {
2525 template <class A, class T>
2526 inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
2527 {
                const auto left = vdupq_n_u8(0);
2529 const auto right = bitwise_cast<batch<uint8_t, A>>(x).data;
2530 const batch<uint8_t, A> res(vextq_u8(left, right, 16 - N));
2531 return bitwise_cast<batch<T, A>>(res);
2532 }
2533 };
2534
2535 template <>
2536 struct slider_left<0>
2537 {
2538 template <class A, class T>
2539 inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
2540 {
2541 return x;
2542 }
2543 };
2544 } // namespace detail
2545
2546 template <size_t N, class A, class T>
2547 inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<neon>) noexcept
2548 {
2549 return detail::slider_left<N> {}(x, A {});
2550 }
2551
2552 // slide_right
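        // Symmetric to slide_left: vextq_u8(x, zero, N) drops the low N bytes and
        // zero-fills the top; slider_right<16> returns an all-zero batch.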
2553 namespace detail
2554 {
2555 template <size_t N>
2556 struct slider_right
2557 {
2558 template <class A, class T>
2559 inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
2560 {
2561 const auto left = bitwise_cast<batch<uint8_t, A>>(x).data;
                const auto right = vdupq_n_u8(0);
2563 const batch<uint8_t, A> res(vextq_u8(left, right, N));
2564 return bitwise_cast<batch<T, A>>(res);
2565 }
2566 };
2567
2568 template <>
2569 struct slider_right<16>
2570 {
2571 template <class A, class T>
2572 inline batch<T, A> operator()(batch<T, A> const&, requires_arch<neon>) noexcept
2573 {
2574 return batch<T, A> {};
2575 }
2576 };
2577 } // namespace detail
2578
2579 template <size_t N, class A, class T>
2580 inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<neon>) noexcept
2581 {
2582 return detail::slider_right<N> {}(x, A {});
2583 }
2584 }
2585
2586 template <class batch_type, typename batch_type::value_type... Values>
2587 struct batch_constant;
2588
2589 namespace kernel
2590 {
2591 /***********
2592 * swizzle *
2593 ***********/
2594
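        // Generic fallback: spill the batch to a temporary array and rebuild it
        // with the compile-time indices.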
2595 template <class A, class T, class I, I... idx>
2596 inline batch<T, A> swizzle(batch<T, A> const& self,
2597 batch_constant<batch<I, A>, idx...>,
2598 requires_arch<neon>) noexcept
2599 {
2600 static_assert(batch<T, A>::size == sizeof...(idx), "valid swizzle indices");
2601 std::array<T, batch<T, A>::size> data;
2602 self.store_aligned(data.data());
2603 return set(batch<T, A>(), A(), data[idx]...);
2604 }
2605 }
2606}
2607
2608#undef WRAP_BINARY_INT_EXCLUDING_64
2609#undef WRAP_BINARY_INT
2610#undef WRAP_BINARY_FLOAT
2611#undef WRAP_UNARY_INT_EXCLUDING_64
2612#undef WRAP_UNARY_INT
2613#undef WRAP_UNARY_FLOAT
2614
2615#endif
2616