1 | /*************************************************************************** |
2 | * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * |
3 | * Martin Renou * |
4 | * Copyright (c) QuantStack * |
5 | * Copyright (c) Serge Guelton * |
6 | * * |
7 | * Distributed under the terms of the BSD 3-Clause License. * |
8 | * * |
9 | * The full license is in the file LICENSE, distributed with this software. * |
10 | ****************************************************************************/ |
11 | |
12 | #ifndef XSIMD_NEON_HPP |
13 | #define XSIMD_NEON_HPP |
14 | |
15 | #include <algorithm> |
16 | #include <complex> |
17 | #include <tuple> |
18 | #include <type_traits> |
19 | |
20 | #include "../types/xsimd_neon_register.hpp" |
21 | #include "../types/xsimd_utils.hpp" |
22 | |
23 | // Wrap intrinsics so we can pass them as function pointers |
24 | // - OP: intrinsics name prefix, e.g., vorrq |
25 | // - RT: type traits to deduce intrinsics return types |
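// For instance, WRAP_BINARY_INT(vaddq, detail::identity_return_type) (used for add()
// below) generates wrap::vaddq_u8(a, b) { return ::vaddq_u8(a, b); } and the analogous
// _s8/_u16/_s16/_u32/_s32/_u64/_s64 overloads. On some compilers the raw NEON intrinsics
// cannot have their address taken, while these thin wrappers can be stored in the
// std::tuple of function pointers used by the dispatchers further down.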
26 | #define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ |
27 | namespace wrap \ |
28 | { \ |
29 | inline RT<uint8x16_t> OP##_u8(uint8x16_t a, uint8x16_t b) noexcept \ |
30 | { \ |
31 | return ::OP##_u8(a, b); \ |
32 | } \ |
33 | inline RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept \ |
34 | { \ |
35 | return ::OP##_s8(a, b); \ |
36 | } \ |
37 | inline RT<uint16x8_t> OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \ |
38 | { \ |
39 | return ::OP##_u16(a, b); \ |
40 | } \ |
41 | inline RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \ |
42 | { \ |
43 | return ::OP##_s16(a, b); \ |
44 | } \ |
45 | inline RT<uint32x4_t> OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \ |
46 | { \ |
47 | return ::OP##_u32(a, b); \ |
48 | } \ |
49 | inline RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \ |
50 | { \ |
51 | return ::OP##_s32(a, b); \ |
52 | } \ |
53 | } |
54 | |
55 | #define WRAP_BINARY_INT(OP, RT) \ |
56 | WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ |
57 | namespace wrap \ |
58 | { \ |
59 | inline RT<uint64x2_t> OP##_u64(uint64x2_t a, uint64x2_t b) noexcept \ |
60 | { \ |
61 | return ::OP##_u64(a, b); \ |
62 | } \ |
63 | inline RT<int64x2_t> OP##_s64(int64x2_t a, int64x2_t b) noexcept \ |
64 | { \ |
65 | return ::OP##_s64(a, b); \ |
66 | } \ |
67 | } |
68 | |
69 | #define WRAP_BINARY_FLOAT(OP, RT) \ |
70 | namespace wrap \ |
71 | { \ |
72 | inline RT<float32x4_t> OP##_f32(float32x4_t a, float32x4_t b) noexcept \ |
73 | { \ |
74 | return ::OP##_f32(a, b); \ |
75 | } \ |
76 | } |
77 | |
78 | #define WRAP_UNARY_INT_EXCLUDING_64(OP) \ |
79 | namespace wrap \ |
80 | { \ |
81 | inline uint8x16_t OP##_u8(uint8x16_t a) noexcept \ |
82 | { \ |
83 | return ::OP##_u8(a); \ |
84 | } \ |
85 | inline int8x16_t OP##_s8(int8x16_t a) noexcept \ |
86 | { \ |
87 | return ::OP##_s8(a); \ |
88 | } \ |
89 | inline uint16x8_t OP##_u16(uint16x8_t a) noexcept \ |
90 | { \ |
91 | return ::OP##_u16(a); \ |
92 | } \ |
93 | inline int16x8_t OP##_s16(int16x8_t a) noexcept \ |
94 | { \ |
95 | return ::OP##_s16(a); \ |
96 | } \ |
97 | inline uint32x4_t OP##_u32(uint32x4_t a) noexcept \ |
98 | { \ |
99 | return ::OP##_u32(a); \ |
100 | } \ |
101 | inline int32x4_t OP##_s32(int32x4_t a) noexcept \ |
102 | { \ |
103 | return ::OP##_s32(a); \ |
104 | } \ |
105 | } |
106 | |
107 | #define WRAP_UNARY_INT(OP) \ |
108 | WRAP_UNARY_INT_EXCLUDING_64(OP) \ |
109 | namespace wrap \ |
110 | { \ |
111 | inline uint64x2_t OP##_u64(uint64x2_t a) noexcept \ |
112 | { \ |
113 | return ::OP##_u64(a); \ |
114 | } \ |
115 | inline int64x2_t OP##_s64(int64x2_t a) noexcept \ |
116 | { \ |
117 | return ::OP##_s64(a); \ |
118 | } \ |
119 | } |
120 | |
121 | #define WRAP_UNARY_FLOAT(OP) \ |
122 | namespace wrap \ |
123 | { \ |
124 | inline float32x4_t OP##_f32(float32x4_t a) noexcept \ |
125 | { \ |
126 | return ::OP##_f32(a); \ |
127 | } \ |
128 | } |
129 | |
130 | // Dummy identity caster to ease coding |
131 | inline uint8x16_t vreinterpretq_u8_u8(uint8x16_t arg) noexcept { return arg; } |
132 | inline int8x16_t vreinterpretq_s8_s8(int8x16_t arg) noexcept { return arg; } |
133 | inline uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) noexcept { return arg; } |
134 | inline int16x8_t vreinterpretq_s16_s16(int16x8_t arg) noexcept { return arg; } |
135 | inline uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) noexcept { return arg; } |
136 | inline int32x4_t vreinterpretq_s32_s32(int32x4_t arg) noexcept { return arg; } |
137 | inline uint64x2_t vreinterpretq_u64_u64(uint64x2_t arg) noexcept { return arg; } |
138 | inline int64x2_t vreinterpretq_s64_s64(int64x2_t arg) noexcept { return arg; } |
139 | inline float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg; } |
140 | |
141 | namespace xsimd |
142 | { |
143 | template <class batch_type, bool... Values> |
144 | struct batch_bool_constant; |
145 | |
146 | namespace kernel |
147 | { |
148 | using namespace types; |
149 | |
150 | namespace detail |
151 | { |
152 | template <template <class> class return_type, class... T> |
153 | struct neon_dispatcher_base |
154 | { |
155 | struct unary |
156 | { |
157 | using container_type = std::tuple<return_type<T> (*)(T)...>; |
158 | const container_type m_func; |
159 | |
160 | template <class U> |
161 | return_type<U> apply(U rhs) const noexcept |
162 | { |
163 | using func_type = return_type<U> (*)(U); |
164 | auto func = xsimd::detail::get<func_type>(m_func); |
165 | return func(rhs); |
166 | } |
167 | }; |
168 | |
169 | struct binary |
170 | { |
171 | using container_type = std::tuple<return_type<T> (*)(T, T)...>; |
172 | const container_type m_func; |
173 | |
174 | template <class U> |
175 | return_type<U> apply(U lhs, U rhs) const noexcept |
176 | { |
177 | using func_type = return_type<U> (*)(U, U); |
178 | auto func = xsimd::detail::get<func_type>(m_func); |
179 | return func(lhs, rhs); |
180 | } |
181 | }; |
182 | }; |
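// Each dispatcher is initialized with one wrapper per register type (see e.g. add()
// below, where m_func receives wrap::vaddq_u8 ... wrap::vaddq_f32); apply() then uses
// xsimd::detail::get to pull out the tuple element whose function-pointer type matches
// the argument register type U, so the proper intrinsic is selected purely from the
// operand type.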
183 | |
184 | /*************************** |
185 | * arithmetic dispatchers * |
186 | ***************************/ |
187 | |
188 | template <class T> |
189 | using identity_return_type = T; |
190 | |
191 | template <class... T> |
192 | struct neon_dispatcher_impl : neon_dispatcher_base<identity_return_type, T...> |
193 | { |
194 | }; |
195 | |
196 | using neon_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t, |
197 | uint16x8_t, int16x8_t, |
198 | uint32x4_t, int32x4_t, |
199 | uint64x2_t, int64x2_t, |
200 | float32x4_t>; |
201 | |
202 | using excluding_int64_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t, |
203 | uint16x8_t, int16x8_t, |
204 | uint32x4_t, int32x4_t, |
205 | float32x4_t>; |
206 | |
207 | /************************** |
208 | * comparison dispatchers * |
209 | **************************/ |
210 | |
211 | template <class T> |
212 | struct comp_return_type_impl; |
213 | |
214 | template <> |
215 | struct comp_return_type_impl<uint8x16_t> |
216 | { |
217 | using type = uint8x16_t; |
218 | }; |
219 | |
220 | template <> |
221 | struct comp_return_type_impl<int8x16_t> |
222 | { |
223 | using type = uint8x16_t; |
224 | }; |
225 | |
226 | template <> |
227 | struct comp_return_type_impl<uint16x8_t> |
228 | { |
229 | using type = uint16x8_t; |
230 | }; |
231 | |
232 | template <> |
233 | struct comp_return_type_impl<int16x8_t> |
234 | { |
235 | using type = uint16x8_t; |
236 | }; |
237 | |
238 | template <> |
239 | struct comp_return_type_impl<uint32x4_t> |
240 | { |
241 | using type = uint32x4_t; |
242 | }; |
243 | |
244 | template <> |
245 | struct comp_return_type_impl<int32x4_t> |
246 | { |
247 | using type = uint32x4_t; |
248 | }; |
249 | |
250 | template <> |
251 | struct comp_return_type_impl<uint64x2_t> |
252 | { |
253 | using type = uint64x2_t; |
254 | }; |
255 | |
256 | template <> |
257 | struct comp_return_type_impl<int64x2_t> |
258 | { |
259 | using type = uint64x2_t; |
260 | }; |
261 | |
262 | template <> |
263 | struct comp_return_type_impl<float32x4_t> |
264 | { |
265 | using type = uint32x4_t; |
266 | }; |
267 | |
268 | template <class T> |
269 | using comp_return_type = typename comp_return_type_impl<T>::type; |
270 | |
271 | template <class... T> |
272 | struct neon_comp_dispatcher_impl : neon_dispatcher_base<comp_return_type, T...> |
273 | { |
274 | }; |
275 | |
276 | using excluding_int64_comp_dispatcher = neon_comp_dispatcher_impl<uint8x16_t, int8x16_t, |
277 | uint16x8_t, int16x8_t, |
278 | uint32x4_t, int32x4_t, |
279 | float32x4_t>; |
280 | |
281 | /************************************** |
282 | * enabling / disabling metafunctions * |
283 | **************************************/ |
284 | |
285 | template <class T> |
286 | using enable_neon_type_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value, |
287 | int>::type; |
288 | |
289 | template <class T> |
290 | using exclude_int64_neon_t |
291 | = typename std::enable_if<(std::is_integral<T>::value && sizeof(T) != 8) || std::is_same<T, float>::value, int>::type; |
292 | } |
293 | |
294 | /************* |
295 | * broadcast * |
296 | *************/ |
297 | |
298 | template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> |
299 | inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept |
300 | { |
return vdupq_n_u8(uint8_t(val));
302 | } |
303 | |
304 | template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> |
305 | inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept |
306 | { |
return vdupq_n_s8(int8_t(val));
308 | } |
309 | |
310 | template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> |
311 | inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept |
312 | { |
return vdupq_n_u16(uint16_t(val));
314 | } |
315 | |
316 | template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> |
317 | inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept |
318 | { |
return vdupq_n_s16(int16_t(val));
320 | } |
321 | |
322 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
323 | inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept |
324 | { |
return vdupq_n_u32(uint32_t(val));
326 | } |
327 | |
328 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
329 | inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept |
330 | { |
return vdupq_n_s32(int32_t(val));
332 | } |
333 | |
334 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
335 | inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept |
336 | { |
return vdupq_n_u64(uint64_t(val));
338 | } |
339 | |
340 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
341 | inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept |
342 | { |
return vdupq_n_s64(int64_t(val));
344 | } |
345 | |
346 | template <class A> |
347 | inline batch<float, A> broadcast(float val, requires_arch<neon>) noexcept |
348 | { |
return vdupq_n_f32(val);
350 | } |
351 | |
352 | /******* |
353 | * set * |
354 | *******/ |
355 | |
356 | template <class A, class T, class... Args, detail::enable_integral_t<T> = 0> |
357 | inline batch<T, A> set(batch<T, A> const&, requires_arch<neon>, Args... args) noexcept |
358 | { |
359 | return xsimd::types::detail::neon_vector_type<T> { args... }; |
360 | } |
361 | |
362 | template <class A, class T, class... Args, detail::enable_integral_t<T> = 0> |
363 | inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<neon>, Args... args) noexcept |
364 | { |
365 | using register_type = typename batch_bool<T, A>::register_type; |
366 | using unsigned_type = as_unsigned_integer_t<T>; |
367 | return register_type { static_cast<unsigned_type>(args ? -1LL : 0LL)... }; |
368 | } |
369 | |
370 | template <class A> |
371 | inline batch<float, A> set(batch<float, A> const&, requires_arch<neon>, float f0, float f1, float f2, float f3) noexcept |
372 | { |
373 | return float32x4_t { f0, f1, f2, f3 }; |
374 | } |
375 | |
376 | template <class A> |
377 | inline batch<std::complex<float>, A> set(batch<std::complex<float>, A> const&, requires_arch<neon>, |
378 | std::complex<float> c0, std::complex<float> c1, |
379 | std::complex<float> c2, std::complex<float> c3) noexcept |
380 | { |
381 | return batch<std::complex<float>>(float32x4_t { c0.real(), c1.real(), c2.real(), c3.real() }, |
382 | float32x4_t { c0.imag(), c1.imag(), c2.imag(), c3.imag() }); |
383 | } |
384 | |
385 | template <class A, class... Args> |
386 | inline batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<neon>, Args... args) noexcept |
387 | { |
388 | using register_type = typename batch_bool<float, A>::register_type; |
389 | using unsigned_type = as_unsigned_integer_t<float>; |
390 | return register_type { static_cast<unsigned_type>(args ? -1LL : 0LL)... }; |
391 | } |
392 | |
393 | /************* |
394 | * from_bool * |
395 | *************/ |
396 | |
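// A batch_bool lane is either all ones or all zeros; from_bool masks it with 1
// (or with the bit pattern of 1.0f for float) so true lanes become exactly 1 and
// false lanes become 0.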
397 | template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> |
398 | inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
399 | { |
return vandq_u8(arg, vdupq_n_u8(1));
401 | } |
402 | |
403 | template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> |
404 | inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
405 | { |
return vandq_s8(reinterpret_cast<int8x16_t>(arg.data), vdupq_n_s8(1));
407 | } |
408 | |
409 | template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> |
410 | inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
411 | { |
return vandq_u16(arg, vdupq_n_u16(1));
413 | } |
414 | |
415 | template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> |
416 | inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
417 | { |
return vandq_s16(reinterpret_cast<int16x8_t>(arg.data), vdupq_n_s16(1));
419 | } |
420 | |
421 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
422 | inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
423 | { |
return vandq_u32(arg, vdupq_n_u32(1));
425 | } |
426 | |
427 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
428 | inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
429 | { |
return vandq_s32(reinterpret_cast<int32x4_t>(arg.data), vdupq_n_s32(1));
431 | } |
432 | |
433 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
434 | inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
435 | { |
return vandq_u64(arg, vdupq_n_u64(1));
437 | } |
438 | |
439 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
440 | inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
441 | { |
return vandq_s64(reinterpret_cast<int64x2_t>(arg.data), vdupq_n_s64(1));
443 | } |
444 | |
445 | template <class A> |
446 | inline batch<float, A> from_bool(batch_bool<float, A> const& arg, requires_arch<neon>) noexcept |
447 | { |
return vreinterpretq_f32_u32(vandq_u32(arg, vreinterpretq_u32_f32(vdupq_n_f32(1.f))));
449 | } |
450 | |
451 | /******** |
452 | * load * |
453 | ********/ |
454 | |
455 | template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> |
456 | inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept |
457 | { |
458 | return vld1q_u8((uint8_t*)src); |
459 | } |
460 | |
461 | template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> |
462 | inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept |
463 | { |
464 | return vld1q_s8((int8_t*)src); |
465 | } |
466 | |
467 | template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> |
468 | inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept |
469 | { |
470 | return vld1q_u16((uint16_t*)src); |
471 | } |
472 | template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> |
473 | inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept |
474 | { |
475 | return vld1q_s16((int16_t*)src); |
476 | } |
477 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
478 | inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept |
479 | { |
480 | return vld1q_u32((uint32_t*)src); |
481 | } |
482 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
483 | inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept |
484 | { |
485 | return vld1q_s32((int32_t*)src); |
486 | } |
487 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
488 | inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept |
489 | { |
490 | return vld1q_u64((uint64_t*)src); |
491 | } |
492 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
493 | inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept |
494 | { |
495 | return vld1q_s64((int64_t*)src); |
496 | } |
497 | |
498 | template <class A> |
499 | inline batch<float, A> load_aligned(float const* src, convert<float>, requires_arch<neon>) noexcept |
500 | { |
501 | return vld1q_f32(src); |
502 | } |
503 | |
504 | template <class A, class T> |
505 | inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept |
506 | { |
507 | return load_aligned<A>(src, convert<T>(), A {}); |
508 | } |
509 | |
510 | /********* |
511 | * store * |
512 | *********/ |
513 | |
514 | template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> |
515 | inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept |
516 | { |
517 | vst1q_u8((uint8_t*)dst, src); |
518 | } |
519 | |
520 | template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> |
521 | inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept |
522 | { |
523 | vst1q_s8((int8_t*)dst, src); |
524 | } |
525 | |
526 | template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> |
527 | inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept |
528 | { |
529 | vst1q_u16((uint16_t*)dst, src); |
530 | } |
531 | |
532 | template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> |
533 | inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept |
534 | { |
535 | vst1q_s16((int16_t*)dst, src); |
536 | } |
537 | |
538 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
539 | inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept |
540 | { |
541 | vst1q_u32((uint32_t*)dst, src); |
542 | } |
543 | |
544 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
545 | inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept |
546 | { |
547 | vst1q_s32((int32_t*)dst, src); |
548 | } |
549 | |
550 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
551 | inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept |
552 | { |
553 | vst1q_u64((uint64_t*)dst, src); |
554 | } |
555 | |
556 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
557 | inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept |
558 | { |
559 | vst1q_s64((int64_t*)dst, src); |
560 | } |
561 | |
562 | template <class A> |
563 | inline void store_aligned(float* dst, batch<float, A> const& src, requires_arch<neon>) noexcept |
564 | { |
565 | vst1q_f32(dst, src); |
566 | } |
567 | |
568 | template <class A, class T> |
569 | inline void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept |
570 | { |
571 | store_aligned<A>(dst, src, A {}); |
572 | } |
573 | |
574 | /**************** |
575 | * load_complex * |
576 | ****************/ |
577 | |
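// vld2q_f32 performs a de-interleaving load: from memory laid out as
// (re0, im0, re1, im1, ...) it returns val[0] = real parts and val[1] = imaginary
// parts, which matches the split real/imaginary storage of batch<std::complex<float>>.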
578 | template <class A> |
579 | inline batch<std::complex<float>, A> load_complex_aligned(std::complex<float> const* mem, convert<std::complex<float>>, requires_arch<neon>) noexcept |
580 | { |
581 | using real_batch = batch<float, A>; |
582 | const float* buf = reinterpret_cast<const float*>(mem); |
583 | float32x4x2_t tmp = vld2q_f32(buf); |
584 | real_batch real = tmp.val[0], |
585 | imag = tmp.val[1]; |
586 | return batch<std::complex<float>, A> { real, imag }; |
587 | } |
588 | |
589 | template <class A> |
590 | inline batch<std::complex<float>, A> load_complex_unaligned(std::complex<float> const* mem, convert<std::complex<float>> cvt, requires_arch<neon>) noexcept |
591 | { |
592 | return load_complex_aligned<A>(mem, cvt, A {}); |
593 | } |
594 | |
595 | /***************** |
596 | * store_complex * |
597 | *****************/ |
598 | |
599 | template <class A> |
600 | inline void store_complex_aligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept |
601 | { |
602 | float32x4x2_t tmp; |
603 | tmp.val[0] = src.real(); |
604 | tmp.val[1] = src.imag(); |
605 | float* buf = reinterpret_cast<float*>(dst); |
606 | vst2q_f32(buf, tmp); |
607 | } |
608 | |
609 | template <class A> |
610 | inline void store_complex_unaligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept |
611 | { |
612 | store_complex_aligned(dst, src, A {}); |
613 | } |
614 | |
615 | /******* |
616 | * neg * |
617 | *******/ |
618 | |
619 | template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> |
620 | inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept |
621 | { |
622 | return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(rhs))); |
623 | } |
624 | |
625 | template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> |
626 | inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept |
627 | { |
628 | return vnegq_s8(rhs); |
629 | } |
630 | |
631 | template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> |
632 | inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept |
633 | { |
634 | return vreinterpretq_u16_s16(vnegq_s16(vreinterpretq_s16_u16(rhs))); |
635 | } |
636 | |
637 | template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> |
638 | inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept |
639 | { |
640 | return vnegq_s16(rhs); |
641 | } |
642 | |
643 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
644 | inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept |
645 | { |
646 | return vreinterpretq_u32_s32(vnegq_s32(vreinterpretq_s32_u32(rhs))); |
647 | } |
648 | |
649 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
650 | inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept |
651 | { |
652 | return vnegq_s32(rhs); |
653 | } |
654 | |
655 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
656 | inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept |
657 | { |
658 | return batch<T, A> { -rhs.get(0), -rhs.get(1) }; |
659 | } |
660 | |
661 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
662 | inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept |
663 | { |
664 | return batch<T, A> { -rhs.get(0), -rhs.get(1) }; |
665 | } |
666 | |
667 | template <class A> |
668 | inline batch<float, A> neg(batch<float, A> const& rhs, requires_arch<neon>) noexcept |
669 | { |
670 | return vnegq_f32(rhs); |
671 | } |
672 | |
673 | /******* |
674 | * add * |
675 | *******/ |
676 | |
677 | WRAP_BINARY_INT(vaddq, detail::identity_return_type) |
678 | WRAP_BINARY_FLOAT(vaddq, detail::identity_return_type) |
679 | |
680 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
681 | inline batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
682 | { |
683 | using register_type = typename batch<T, A>::register_type; |
684 | const detail::neon_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vaddq_u8, wrap::vaddq_s8, wrap::vaddq_u16, wrap::vaddq_s16,
                wrap::vaddq_u32, wrap::vaddq_s32, wrap::vaddq_u64, wrap::vaddq_s64,
                wrap::vaddq_f32)
688 | }; |
689 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
690 | } |
691 | |
692 | /******** |
693 | * sadd * |
694 | ********/ |
695 | |
696 | WRAP_BINARY_INT(vqaddq, detail::identity_return_type) |
697 | |
698 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
699 | inline batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
700 | { |
701 | using register_type = typename batch<T, A>::register_type; |
702 | const detail::neon_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vqaddq_u8, wrap::vqaddq_s8, wrap::vqaddq_u16, wrap::vqaddq_s16,
                wrap::vqaddq_u32, wrap::vqaddq_s32, wrap::vqaddq_u64, wrap::vqaddq_s64,
                wrap::vaddq_f32)
706 | }; |
707 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
708 | } |
709 | |
710 | /******* |
711 | * sub * |
712 | *******/ |
713 | |
714 | WRAP_BINARY_INT(vsubq, detail::identity_return_type) |
715 | WRAP_BINARY_FLOAT(vsubq, detail::identity_return_type) |
716 | |
717 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
718 | inline batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
719 | { |
720 | using register_type = typename batch<T, A>::register_type; |
721 | const detail::neon_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vsubq_u8, wrap::vsubq_s8, wrap::vsubq_u16, wrap::vsubq_s16,
                wrap::vsubq_u32, wrap::vsubq_s32, wrap::vsubq_u64, wrap::vsubq_s64,
                wrap::vsubq_f32)
725 | }; |
726 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
727 | } |
728 | |
729 | /******** |
730 | * ssub * |
731 | ********/ |
732 | |
733 | WRAP_BINARY_INT(vqsubq, detail::identity_return_type) |
734 | |
735 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
736 | inline batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
737 | { |
738 | using register_type = typename batch<T, A>::register_type; |
739 | const detail::neon_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vqsubq_u8, wrap::vqsubq_s8, wrap::vqsubq_u16, wrap::vqsubq_s16,
                wrap::vqsubq_u32, wrap::vqsubq_s32, wrap::vqsubq_u64, wrap::vqsubq_s64,
                wrap::vsubq_f32)
743 | }; |
744 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
745 | } |
746 | |
747 | /******* |
748 | * mul * |
749 | *******/ |
750 | |
751 | WRAP_BINARY_INT_EXCLUDING_64(vmulq, detail::identity_return_type) |
752 | WRAP_BINARY_FLOAT(vmulq, detail::identity_return_type) |
753 | |
754 | template <class A, class T, detail::exclude_int64_neon_t<T> = 0> |
755 | inline batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
756 | { |
757 | using register_type = typename batch<T, A>::register_type; |
758 | const detail::excluding_int64_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vmulq_u8, wrap::vmulq_s8, wrap::vmulq_u16, wrap::vmulq_s16,
                wrap::vmulq_u32, wrap::vmulq_s32, wrap::vmulq_f32)
761 | }; |
762 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
763 | } |
764 | |
765 | /******* |
766 | * div * |
767 | *******/ |
768 | |
769 | #if defined(XSIMD_FAST_INTEGER_DIVISION) |
770 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
771 | inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
772 | { |
773 | return vcvtq_s32_f32(vcvtq_f32_s32(lhs) / vcvtq_f32_s32(rhs)); |
774 | } |
775 | |
776 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
777 | inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
778 | { |
779 | return vcvtq_u32_f32(vcvtq_f32_u32(lhs) / vcvtq_f32_u32(rhs)); |
780 | } |
781 | #endif |
782 | |
783 | template <class A> |
784 | inline batch<float, A> div(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept |
785 | { |
786 | // from stackoverflow & https://projectne10.github.io/Ne10/doc/NE10__divc_8neon_8c_source.html |
787 | // get an initial estimate of 1/b. |
788 | float32x4_t rcp = reciprocal(rhs); |
789 | |
790 | // use a couple Newton-Raphson steps to refine the estimate. Depending on your |
791 | // application's accuracy requirements, you may be able to get away with only |
792 | // one refinement (instead of the two used here). Be sure to test! |
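// vrecpsq_f32(b, x) evaluates (2 - b * x), so each line below is one
// Newton-Raphson iteration x <- x * (2 - b * x) on the reciprocal estimate.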
793 | rcp = vmulq_f32(vrecpsq_f32(rhs, rcp), rcp); |
794 | rcp = vmulq_f32(vrecpsq_f32(rhs, rcp), rcp); |
795 | |
796 | // and finally, compute a / b = a * (1 / b) |
797 | return vmulq_f32(lhs, rcp); |
798 | } |
799 | |
800 | /****** |
801 | * eq * |
802 | ******/ |
803 | |
804 | WRAP_BINARY_INT_EXCLUDING_64(vceqq, detail::comp_return_type) |
805 | WRAP_BINARY_FLOAT(vceqq, detail::comp_return_type) |
806 | |
807 | template <class A, class T, detail::exclude_int64_neon_t<T> = 0> |
808 | inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
809 | { |
810 | using register_type = typename batch<T, A>::register_type; |
811 | const detail::excluding_int64_comp_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vceqq_u8, wrap::vceqq_s8, wrap::vceqq_u16, wrap::vceqq_s16,
                wrap::vceqq_u32, wrap::vceqq_s32, wrap::vceqq_f32)
814 | }; |
815 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
816 | } |
817 | |
818 | template <class A, class T, detail::exclude_int64_neon_t<T> = 0> |
819 | inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept |
820 | { |
821 | using register_type = typename batch_bool<T, A>::register_type; |
822 | using dispatcher_type = detail::neon_comp_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary; |
823 | const dispatcher_type dispatcher = { |
std::make_tuple(wrap::vceqq_u8, wrap::vceqq_u16, wrap::vceqq_u32)
825 | }; |
826 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
827 | } |
828 | |
829 | template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> |
830 | inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
831 | { |
832 | return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) }); |
833 | } |
834 | |
835 | template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> |
836 | inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept |
837 | { |
838 | return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) }); |
839 | } |
840 | |
841 | /************* |
842 | * fast_cast * |
843 | *************/ |
844 | |
845 | namespace detail |
846 | { |
847 | template <class A> |
848 | inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept |
849 | { |
850 | return vcvtq_f32_s32(self); |
851 | } |
852 | |
853 | template <class A> |
854 | inline batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept |
855 | { |
856 | return vcvtq_f32_u32(self); |
857 | } |
858 | |
859 | template <class A> |
860 | inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<neon>) noexcept |
861 | { |
862 | return vcvtq_s32_f32(self); |
863 | } |
864 | |
865 | template <class A> |
866 | inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<neon>) noexcept |
867 | { |
868 | return vcvtq_u32_f32(self); |
869 | } |
870 | |
871 | } |
872 | |
873 | /****** |
874 | * lt * |
875 | ******/ |
876 | |
877 | WRAP_BINARY_INT_EXCLUDING_64(vcltq, detail::comp_return_type) |
878 | WRAP_BINARY_FLOAT(vcltq, detail::comp_return_type) |
879 | |
880 | template <class A, class T, detail::exclude_int64_neon_t<T> = 0> |
881 | inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
882 | { |
883 | using register_type = typename batch<T, A>::register_type; |
884 | const detail::excluding_int64_comp_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vcltq_u8, wrap::vcltq_s8, wrap::vcltq_u16, wrap::vcltq_s16,
                wrap::vcltq_u32, wrap::vcltq_s32, wrap::vcltq_f32)
887 | }; |
888 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
889 | } |
890 | |
891 | template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> |
892 | inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
893 | { |
894 | return batch_bool<T, A>({ lhs.get(0) < rhs.get(0), lhs.get(1) < rhs.get(1) }); |
895 | } |
896 | |
897 | /****** |
898 | * le * |
899 | ******/ |
900 | |
901 | WRAP_BINARY_INT_EXCLUDING_64(vcleq, detail::comp_return_type) |
902 | WRAP_BINARY_FLOAT(vcleq, detail::comp_return_type) |
903 | |
904 | template <class A, class T, detail::exclude_int64_neon_t<T> = 0> |
905 | inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
906 | { |
907 | using register_type = typename batch<T, A>::register_type; |
908 | const detail::excluding_int64_comp_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vcleq_u8, wrap::vcleq_s8, wrap::vcleq_u16, wrap::vcleq_s16,
                wrap::vcleq_u32, wrap::vcleq_s32, wrap::vcleq_f32)
911 | }; |
912 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
913 | } |
914 | |
915 | template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> |
916 | inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
917 | { |
918 | return batch_bool<T, A>({ lhs.get(0) <= rhs.get(0), lhs.get(1) <= rhs.get(1) }); |
919 | } |
920 | |
921 | /****** |
922 | * gt * |
923 | ******/ |
924 | |
925 | WRAP_BINARY_INT_EXCLUDING_64(vcgtq, detail::comp_return_type) |
926 | WRAP_BINARY_FLOAT(vcgtq, detail::comp_return_type) |
927 | |
928 | template <class A, class T, detail::exclude_int64_neon_t<T> = 0> |
929 | inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
930 | { |
931 | using register_type = typename batch<T, A>::register_type; |
932 | const detail::excluding_int64_comp_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vcgtq_u8, wrap::vcgtq_s8, wrap::vcgtq_u16, wrap::vcgtq_s16,
                wrap::vcgtq_u32, wrap::vcgtq_s32, wrap::vcgtq_f32)
935 | }; |
936 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
937 | } |
938 | |
939 | template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> |
940 | inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
941 | { |
942 | return batch_bool<T, A>({ lhs.get(0) > rhs.get(0), lhs.get(1) > rhs.get(1) }); |
943 | } |
944 | |
945 | /****** |
946 | * ge * |
947 | ******/ |
948 | |
949 | WRAP_BINARY_INT_EXCLUDING_64(vcgeq, detail::comp_return_type) |
950 | WRAP_BINARY_FLOAT(vcgeq, detail::comp_return_type) |
951 | |
952 | template <class A, class T, detail::exclude_int64_neon_t<T> = 0> |
953 | inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
954 | { |
955 | using register_type = typename batch<T, A>::register_type; |
956 | const detail::excluding_int64_comp_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vcgeq_u8, wrap::vcgeq_s8, wrap::vcgeq_u16, wrap::vcgeq_s16,
                wrap::vcgeq_u32, wrap::vcgeq_s32, wrap::vcgeq_f32)
959 | }; |
960 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
961 | } |
962 | |
963 | template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> |
964 | inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
965 | { |
966 | return batch_bool<T, A>({ lhs.get(0) >= rhs.get(0), lhs.get(1) >= rhs.get(1) }); |
967 | } |
968 | |
969 | /******************* |
970 | * batch_bool_cast * |
971 | *******************/ |
972 | |
973 | template <class A, class T_out, class T_in> |
974 | inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon>) noexcept |
975 | { |
976 | using register_type = typename batch_bool<T_out, A>::register_type; |
977 | return register_type(self); |
978 | } |
979 | |
980 | /*************** |
981 | * bitwise_and * |
982 | ***************/ |
983 | |
984 | WRAP_BINARY_INT(vandq, detail::identity_return_type) |
985 | |
986 | namespace detail |
987 | { |
988 | inline float32x4_t bitwise_and_f32(float32x4_t lhs, float32x4_t rhs) noexcept |
989 | { |
return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(lhs),
                                       vreinterpretq_u32_f32(rhs)));
992 | } |
993 | |
994 | template <class V> |
995 | V bitwise_and_neon(V const& lhs, V const& rhs) |
996 | { |
997 | const neon_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vandq_u8, wrap::vandq_s8, wrap::vandq_u16, wrap::vandq_s16,
                wrap::vandq_u32, wrap::vandq_s32, wrap::vandq_u64, wrap::vandq_s64,
                bitwise_and_f32)
1001 | }; |
1002 | return dispatcher.apply(lhs, rhs); |
1003 | } |
1004 | } |
1005 | |
1006 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1007 | inline batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1008 | { |
1009 | using register_type = typename batch<T, A>::register_type; |
1010 | return detail::bitwise_and_neon(register_type(lhs), register_type(rhs)); |
1011 | } |
1012 | |
1013 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1014 | inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept |
1015 | { |
1016 | using register_type = typename batch_bool<T, A>::register_type; |
1017 | return detail::bitwise_and_neon(register_type(lhs), register_type(rhs)); |
1018 | } |
1019 | |
1020 | /************** |
1021 | * bitwise_or * |
1022 | **************/ |
1023 | |
1024 | WRAP_BINARY_INT(vorrq, detail::identity_return_type) |
1025 | |
1026 | namespace detail |
1027 | { |
1028 | inline float32x4_t bitwise_or_f32(float32x4_t lhs, float32x4_t rhs) noexcept |
1029 | { |
return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(lhs),
                                       vreinterpretq_u32_f32(rhs)));
1032 | } |
1033 | |
1034 | template <class V> |
1035 | inline V bitwise_or_neon(V const& lhs, V const& rhs) noexcept |
1036 | { |
1037 | const neon_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vorrq_u8, wrap::vorrq_s8, wrap::vorrq_u16, wrap::vorrq_s16,
                wrap::vorrq_u32, wrap::vorrq_s32, wrap::vorrq_u64, wrap::vorrq_s64,
                bitwise_or_f32)
1041 | }; |
1042 | return dispatcher.apply(lhs, rhs); |
1043 | } |
1044 | } |
1045 | |
1046 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1047 | inline batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1048 | { |
1049 | using register_type = typename batch<T, A>::register_type; |
1050 | return detail::bitwise_or_neon(register_type(lhs), register_type(rhs)); |
1051 | } |
1052 | |
1053 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1054 | inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept |
1055 | { |
1056 | using register_type = typename batch_bool<T, A>::register_type; |
1057 | return detail::bitwise_or_neon(register_type(lhs), register_type(rhs)); |
1058 | } |
1059 | |
1060 | /*************** |
1061 | * bitwise_xor * |
1062 | ***************/ |
1063 | |
1064 | WRAP_BINARY_INT(veorq, detail::identity_return_type) |
1065 | |
1066 | namespace detail |
1067 | { |
1068 | inline float32x4_t bitwise_xor_f32(float32x4_t lhs, float32x4_t rhs) noexcept |
1069 | { |
return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(lhs),
                                       vreinterpretq_u32_f32(rhs)));
1072 | } |
1073 | |
1074 | template <class V> |
1075 | inline V bitwise_xor_neon(V const& lhs, V const& rhs) noexcept |
1076 | { |
1077 | const neon_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::veorq_u8, wrap::veorq_s8, wrap::veorq_u16, wrap::veorq_s16,
                wrap::veorq_u32, wrap::veorq_s32, wrap::veorq_u64, wrap::veorq_s64,
                bitwise_xor_f32)
1081 | }; |
1082 | return dispatcher.apply(lhs, rhs); |
1083 | } |
1084 | } |
1085 | |
1086 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1087 | inline batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1088 | { |
1089 | using register_type = typename batch<T, A>::register_type; |
1090 | return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs)); |
1091 | } |
1092 | |
1093 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1094 | inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept |
1095 | { |
1096 | using register_type = typename batch_bool<T, A>::register_type; |
1097 | return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs)); |
1098 | } |
1099 | |
1100 | /******* |
1101 | * neq * |
1102 | *******/ |
1103 | |
1104 | template <class A, class T> |
1105 | inline batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept |
1106 | { |
1107 | return bitwise_xor(lhs, rhs, A {}); |
1108 | } |
1109 | |
1110 | /*************** |
1111 | * bitwise_not * |
1112 | ***************/ |
1113 | |
1114 | WRAP_UNARY_INT_EXCLUDING_64(vmvnq) |
1115 | |
1116 | namespace detail |
1117 | { |
1118 | inline int64x2_t bitwise_not_s64(int64x2_t arg) noexcept |
1119 | { |
return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(arg)));
1121 | } |
1122 | |
1123 | inline uint64x2_t bitwise_not_u64(uint64x2_t arg) noexcept |
1124 | { |
return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(arg)));
1126 | } |
1127 | |
1128 | inline float32x4_t bitwise_not_f32(float32x4_t arg) noexcept |
1129 | { |
return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg)));
1131 | } |
1132 | |
1133 | template <class V> |
1134 | inline V bitwise_not_neon(V const& arg) noexcept |
1135 | { |
1136 | const neon_dispatcher::unary dispatcher = { |
std::make_tuple(wrap::vmvnq_u8, wrap::vmvnq_s8, wrap::vmvnq_u16, wrap::vmvnq_s16,
                wrap::vmvnq_u32, wrap::vmvnq_s32,
                bitwise_not_u64, bitwise_not_s64,
                bitwise_not_f32)
1141 | }; |
1142 | return dispatcher.apply(arg); |
1143 | } |
1144 | } |
1145 | |
1146 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1147 | inline batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<neon>) noexcept |
1148 | { |
1149 | using register_type = typename batch<T, A>::register_type; |
1150 | return detail::bitwise_not_neon(register_type(arg)); |
1151 | } |
1152 | |
1153 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1154 | inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
1155 | { |
1156 | using register_type = typename batch_bool<T, A>::register_type; |
1157 | return detail::bitwise_not_neon(register_type(arg)); |
1158 | } |
1159 | |
1160 | /****************** |
1161 | * bitwise_andnot * |
1162 | ******************/ |
1163 | |
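// vbicq (bit clear) computes lhs & ~rhs, which is exactly the and-not operation
// implemented here.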
1164 | WRAP_BINARY_INT(vbicq, detail::identity_return_type) |
1165 | |
1166 | namespace detail |
1167 | { |
1168 | inline float32x4_t bitwise_andnot_f32(float32x4_t lhs, float32x4_t rhs) noexcept |
1169 | { |
return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs)));
1171 | } |
1172 | |
1173 | template <class V> |
1174 | inline V bitwise_andnot_neon(V const& lhs, V const& rhs) noexcept |
1175 | { |
1176 | const detail::neon_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vbicq_u8, wrap::vbicq_s8, wrap::vbicq_u16, wrap::vbicq_s16,
                wrap::vbicq_u32, wrap::vbicq_s32, wrap::vbicq_u64, wrap::vbicq_s64,
                bitwise_andnot_f32)
1180 | }; |
1181 | return dispatcher.apply(lhs, rhs); |
1182 | } |
1183 | } |
1184 | |
1185 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1186 | inline batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1187 | { |
1188 | using register_type = typename batch<T, A>::register_type; |
1189 | return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs)); |
1190 | } |
1191 | |
1192 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1193 | inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept |
1194 | { |
1195 | using register_type = typename batch_bool<T, A>::register_type; |
1196 | return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs)); |
1197 | } |
1198 | |
1199 | /******* |
1200 | * min * |
1201 | *******/ |
1202 | |
1203 | WRAP_BINARY_INT_EXCLUDING_64(vminq, detail::identity_return_type) |
1204 | WRAP_BINARY_FLOAT(vminq, detail::identity_return_type) |
1205 | |
1206 | template <class A, class T, detail::exclude_int64_neon_t<T> = 0> |
1207 | inline batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1208 | { |
1209 | using register_type = typename batch<T, A>::register_type; |
1210 | const detail::excluding_int64_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vminq_u8, wrap::vminq_s8, wrap::vminq_u16, wrap::vminq_s16,
                wrap::vminq_u32, wrap::vminq_s32, wrap::vminq_f32)
1213 | }; |
1214 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
1215 | } |
1216 | |
1217 | template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> |
1218 | inline batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1219 | { |
1220 | return { std::min(lhs.get(0), rhs.get(0)), std::min(lhs.get(1), rhs.get(1)) }; |
1221 | } |
1222 | |
1223 | /******* |
1224 | * max * |
1225 | *******/ |
1226 | |
1227 | WRAP_BINARY_INT_EXCLUDING_64(vmaxq, detail::identity_return_type) |
1228 | WRAP_BINARY_FLOAT(vmaxq, detail::identity_return_type) |
1229 | |
1230 | template <class A, class T, detail::exclude_int64_neon_t<T> = 0> |
1231 | inline batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1232 | { |
1233 | using register_type = typename batch<T, A>::register_type; |
1234 | const detail::excluding_int64_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vmaxq_u8, wrap::vmaxq_s8, wrap::vmaxq_u16, wrap::vmaxq_s16,
                wrap::vmaxq_u32, wrap::vmaxq_s32, wrap::vmaxq_f32)
1237 | }; |
1238 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
1239 | } |
1240 | |
1241 | template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> |
1242 | inline batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1243 | { |
1244 | return { std::max(lhs.get(0), rhs.get(0)), std::max(lhs.get(1), rhs.get(1)) }; |
1245 | } |
1246 | |
1247 | /******* |
1248 | * abs * |
1249 | *******/ |
1250 | |
1251 | namespace wrap |
1252 | { |
inline int8x16_t vabsq_s8(int8x16_t a) noexcept { return ::vabsq_s8(a); }
inline int16x8_t vabsq_s16(int16x8_t a) noexcept { return ::vabsq_s16(a); }
inline int32x4_t vabsq_s32(int32x4_t a) noexcept { return ::vabsq_s32(a); }
1256 | } |
1257 | WRAP_UNARY_FLOAT(vabsq) |
1258 | |
1259 | namespace detail |
1260 | { |
1261 | inline uint8x16_t abs_u8(uint8x16_t arg) noexcept |
1262 | { |
1263 | return arg; |
1264 | } |
1265 | |
1266 | inline uint16x8_t abs_u16(uint16x8_t arg) noexcept |
1267 | { |
1268 | return arg; |
1269 | } |
1270 | |
1271 | inline uint32x4_t abs_u32(uint32x4_t arg) noexcept |
1272 | { |
1273 | return arg; |
1274 | } |
1275 | } |
1276 | |
1277 | template <class A, class T, detail::exclude_int64_neon_t<T> = 0> |
1278 | inline batch<T, A> abs(batch<T, A> const& arg, requires_arch<neon>) noexcept |
1279 | { |
1280 | using register_type = typename batch<T, A>::register_type; |
1281 | const detail::excluding_int64_dispatcher::unary dispatcher = { |
std::make_tuple(detail::abs_u8, wrap::vabsq_s8, detail::abs_u16, wrap::vabsq_s16,
                detail::abs_u32, wrap::vabsq_s32, wrap::vabsq_f32)
1284 | }; |
1285 | return dispatcher.apply(register_type(arg)); |
1286 | } |
1287 | |
1288 | /******** |
1289 | * rsqrt * |
1290 | ********/ |
1291 | |
1292 | template <class A> |
1293 | inline batch<float, A> rsqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept |
1294 | { |
1295 | return vrsqrteq_f32(arg); |
1296 | } |
1297 | |
1298 | /******** |
1299 | * sqrt * |
1300 | ********/ |
1301 | |
1302 | template <class A> |
1303 | inline batch<float, A> sqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept |
1304 | { |
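// vrsqrteq_f32 gives a rough estimate of 1/sqrt(arg); vrsqrtsq_f32(a, b) returns
// (3 - a * b) / 2, so each multiplication by it performs one Newton-Raphson
// refinement of that estimate. sqrt(arg) is then recovered as arg * (1/sqrt(arg)),
// and the final select returns exactly 0 for a zero input, where the reciprocal
// estimate would otherwise be infinite.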
1305 | batch<float, A> sqrt_reciprocal = vrsqrteq_f32(arg); |
1306 | // one iter |
1307 | sqrt_reciprocal = sqrt_reciprocal * batch<float, A>(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal)); |
1308 | batch<float, A> sqrt_approx = arg * sqrt_reciprocal * batch<float, A>(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal)); |
1309 | batch<float, A> zero(0.f); |
1310 | return select(arg == zero, zero, sqrt_approx); |
1311 | } |
1312 | |
1313 | /******************** |
1314 | * Fused operations * |
1315 | ********************/ |
1316 | |
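// vfmaq_f32(a, b, c) computes a + b * c in a single fused operation, so passing z
// or -z as the accumulator yields fma(x, y, z) = x * y + z and
// fms(x, y, z) = x * y - z.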
1317 | #ifdef __ARM_FEATURE_FMA |
1318 | template <class A> |
1319 | inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept |
1320 | { |
1321 | return vfmaq_f32(z, x, y); |
1322 | } |
1323 | |
1324 | template <class A> |
1325 | inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept |
1326 | { |
1327 | return vfmaq_f32(-z, x, y); |
1328 | } |
1329 | #endif |
1330 | |
1331 | /********* |
1332 | * haddp * |
1333 | *********/ |
1334 | |
1335 | template <class A> |
1336 | inline batch<float, A> haddp(const batch<float, A>* row, requires_arch<neon>) noexcept |
1337 | { |
1338 | // row = (a,b,c,d) |
1339 | float32x2_t tmp1, tmp2, tmp3; |
1340 | // tmp1 = (a0 + a2, a1 + a3) |
1341 | tmp1 = vpadd_f32(vget_low_f32(row[0]), vget_high_f32(row[0])); |
1342 | // tmp2 = (b0 + b2, b1 + b3) |
1343 | tmp2 = vpadd_f32(vget_low_f32(row[1]), vget_high_f32(row[1])); |
1344 | // tmp1 = (a0..3, b0..3) |
tmp1 = vpadd_f32(tmp1, tmp2);
1346 | // tmp2 = (c0 + c2, c1 + c3) |
1347 | tmp2 = vpadd_f32(vget_low_f32(row[2]), vget_high_f32(row[2])); |
1348 | // tmp3 = (d0 + d2, d1 + d3) |
1349 | tmp3 = vpadd_f32(vget_low_f32(row[3]), vget_high_f32(row[3])); |
// tmp2 = (c0..3, d0..3)
tmp2 = vpadd_f32(tmp2, tmp3);
1352 | // return = (a0..3, b0..3, c0..3, d0..3) |
return vcombine_f32(tmp1, tmp2);
1354 | } |
1355 | |
1356 | /************** |
1357 | * reciprocal * |
1358 | **************/ |
1359 | |
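// vrecpeq_f32 only returns a coarse estimate of 1/x; callers that need more
// accuracy (such as div() above) refine it with vrecpsq_f32 Newton-Raphson steps.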
1360 | template <class A> |
1361 | inline batch<float, A> |
1362 | reciprocal(const batch<float, A>& x, |
1363 | kernel::requires_arch<neon>) noexcept |
1364 | { |
1365 | return vrecpeq_f32(x); |
1366 | } |
1367 | |
1368 | /********** |
1369 | * insert * |
1370 | **********/ |
1371 | |
1372 | template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 1> = 0> |
1373 | inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept |
1374 | { |
1375 | return vsetq_lane_u8(val, self, I); |
1376 | } |
1377 | |
1378 | template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 1> = 0> |
1379 | inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept |
1380 | { |
1381 | return vsetq_lane_s8(val, self, I); |
1382 | } |
1383 | |
1384 | template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 2> = 0> |
1385 | inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept |
1386 | { |
1387 | return vsetq_lane_u16(val, self, I); |
1388 | } |
1389 | |
1390 | template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 2> = 0> |
inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1392 | { |
1393 | return vsetq_lane_s16(val, self, I); |
1394 | } |
1395 | |
1396 | template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 4> = 0> |
1397 | inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept |
1398 | { |
1399 | return vsetq_lane_u32(val, self, I); |
1400 | } |
1401 | |
1402 | template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 4> = 0> |
1403 | inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept |
1404 | { |
1405 | return vsetq_lane_s32(val, self, I); |
1406 | } |
1407 | |
1408 | template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 8> = 0> |
1409 | inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept |
1410 | { |
1411 | return vsetq_lane_u64(val, self, I); |
1412 | } |
1413 | |
1414 | template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 8> = 0> |
1415 | inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept |
1416 | { |
1417 | return vsetq_lane_s64(val, self, I); |
1418 | } |
1419 | |
1420 | template <class A, size_t I> |
1421 | inline batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<neon>) noexcept |
1422 | { |
1423 | return vsetq_lane_f32(val, self, I); |
1424 | } |
1425 | |
1426 | /******************** |
1427 | * nearbyint_as_int * |
1428 | *******************/ |
1429 | |
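// Rounds to the nearest integer with ties to even, e.g. 0.5 -> 0, 1.5 -> 2,
// 2.5 -> 2 and -0.5 -> 0, i.e. the usual default rounding expected of nearbyint.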
1430 | template <class A> |
1431 | inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self, |
1432 | requires_arch<neon>) noexcept |
1433 | { |
1434 | /* origin: https://github.com/DLTcollab/sse2neon/blob/cad518a93b326f0f644b7972d488d04eaa2b0475/sse2neon.h#L4028-L4047 */ |
1435 | // Contributors to this work are: |
1436 | // John W. Ratcliff <jratcliffscarab@gmail.com> |
1437 | // Brandon Rowlett <browlett@nvidia.com> |
1438 | // Ken Fast <kfast@gdeb.com> |
1439 | // Eric van Beurden <evanbeurden@nvidia.com> |
1440 | // Alexander Potylitsin <apotylitsin@nvidia.com> |
1441 | // Hasindu Gamaarachchi <hasindu2008@gmail.com> |
1442 | // Jim Huang <jserv@biilabs.io> |
1443 | // Mark Cheng <marktwtn@biilabs.io> |
1444 | // Malcolm James MacLeod <malcolm@gulden.com> |
1445 | // Devin Hussey (easyaspi314) <husseydevin@gmail.com> |
1446 | // Sebastian Pop <spop@amazon.com> |
1447 | // Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com> |
1448 | // Danila Kutenin <danilak@google.com> |
1449 | // François Turban (JishinMaster) <francois.turban@gmail.com> |
1450 | // Pei-Hsuan Hung <afcidk@gmail.com> |
1451 | // Yang-Hao Yuan <yanghau@biilabs.io> |
1452 | // Syoyo Fujita <syoyo@lighttransport.com> |
1453 | // Brecht Van Lommel <brecht@blender.org> |
1454 | |
1455 | /* |
1456 | * sse2neon is freely redistributable under the MIT License. |
1457 | * |
1458 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
1459 | * of this software and associated documentation files (the "Software"), to deal |
1460 | * in the Software without restriction, including without limitation the rights |
1461 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
1462 | * copies of the Software, and to permit persons to whom the Software is |
1463 | * furnished to do so, subject to the following conditions: |
1464 | * |
1465 | * The above copyright notice and this permission notice shall be included in |
1466 | * all copies or substantial portions of the Software. |
1467 | * |
1468 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
1469 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
1470 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
1471 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
1472 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
1473 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
1474 | * SOFTWARE. |
1475 | */ |
1476 | |
const auto signmask = vdupq_n_u32(0x80000000);
1478 | const auto half = vbslq_f32(signmask, self, |
vdupq_n_f32(0.5f)); /* +/- 0.5 */
1480 | const auto r_normal = vcvtq_s32_f32(vaddq_f32( |
1481 | self, half)); /* round to integer: [a + 0.5]*/ |
1482 | const auto r_trunc = vcvtq_s32_f32(self); /* truncate to integer: [a] */ |
1483 | const auto plusone = vreinterpretq_s32_u32(vshrq_n_u32( |
1484 | vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ |
1485 | const auto r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), |
vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
1487 | const auto delta = vsubq_f32( |
1488 | self, |
1489 | vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ |
1490 | const auto is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ |
1491 | return vbslq_s32(is_delta_half, r_even, r_normal); |
1492 | } |
1493 | |
1494 | /************** |
1495 | * reduce_add * |
1496 | **************/ |
1497 | |
1498 | namespace detail |
1499 | { |
1500 | template <class T, class A, class V> |
1501 | inline T sum_batch(V const& arg) noexcept |
1502 | { |
1503 | T res = T(0); |
1504 | for (std::size_t i = 0; i < batch<T, A>::size; ++i) |
1505 | { |
1506 | res += arg[i]; |
1507 | } |
1508 | return res; |
1509 | } |
1510 | } |
1511 | |
1512 | template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> |
1513 | inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept |
1514 | { |
1515 | uint8x8_t tmp = vpadd_u8(vget_low_u8(arg), vget_high_u8(arg)); |
tmp = vpadd_u8(tmp, tmp);
tmp = vpadd_u8(tmp, tmp);
tmp = vpadd_u8(tmp, tmp);
1519 | return vget_lane_u8(tmp, 0); |
1520 | } |
1521 | |
1522 | template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> |
1523 | inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept |
1524 | { |
1525 | int8x8_t tmp = vpadd_s8(vget_low_s8(arg), vget_high_s8(arg)); |
tmp = vpadd_s8(tmp, tmp);
tmp = vpadd_s8(tmp, tmp);
tmp = vpadd_s8(tmp, tmp);
1529 | return vget_lane_s8(tmp, 0); |
1530 | } |
1531 | |
1532 | template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> |
1533 | inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept |
1534 | { |
1535 | uint16x4_t tmp = vpadd_u16(vget_low_u16(arg), vget_high_u16(arg)); |
tmp = vpadd_u16(tmp, tmp);
tmp = vpadd_u16(tmp, tmp);
1538 | return vget_lane_u16(tmp, 0); |
1539 | } |
1540 | |
1541 | template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> |
1542 | inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept |
1543 | { |
1544 | int16x4_t tmp = vpadd_s16(vget_low_s16(arg), vget_high_s16(arg)); |
tmp = vpadd_s16(tmp, tmp);
tmp = vpadd_s16(tmp, tmp);
1547 | return vget_lane_s16(tmp, 0); |
1548 | } |
1549 | |
1550 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
1551 | inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept |
1552 | { |
1553 | uint32x2_t tmp = vpadd_u32(vget_low_u32(arg), vget_high_u32(arg)); |
tmp = vpadd_u32(tmp, tmp);
1555 | return vget_lane_u32(tmp, 0); |
1556 | } |
1557 | |
1558 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
1559 | inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept |
1560 | { |
1561 | int32x2_t tmp = vpadd_s32(vget_low_s32(arg), vget_high_s32(arg)); |
tmp = vpadd_s32(tmp, tmp);
1563 | return vget_lane_s32(tmp, 0); |
1564 | } |
1565 | |
1566 | template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> |
1567 | inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept |
1568 | { |
1569 | return arg.get(0) + arg.get(1); |
1570 | } |
1571 | |
1572 | template <class A> |
1573 | inline float reduce_add(batch<float, A> const& arg, requires_arch<neon>) noexcept |
1574 | { |
1575 | float32x2_t tmp = vpadd_f32(vget_low_f32(arg), vget_high_f32(arg)); |
tmp = vpadd_f32(tmp, tmp);
1577 | return vget_lane_f32(tmp, 0); |
1578 | } |
1579 | |
1580 | /************** |
1581 | * reduce_max * |
1582 | **************/ |
1583 | |
// Using the generic implementation because ARM does not provide intrinsics
1585 | // for this operation |
1586 | |
1587 | /************** |
1588 | * reduce_min * |
1589 | **************/ |
1590 | |
// Using the generic implementation because ARM does not provide intrinsics
1592 | // for this operation |
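// A minimal sketch of what such a generic fallback boils down to for both
// reductions (illustrative only; the actual generic kernel used by xsimd
// may be structured differently):
//
//     template <class A, class T>
//     inline T reduce_max_sketch(batch<T, A> const& arg) noexcept
//     {
//         std::array<T, batch<T, A>::size> buffer;
//         arg.store_unaligned(buffer.data());
//         return *std::max_element(buffer.begin(), buffer.end());
//     }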
1593 | |
1594 | /********** |
1595 | * select * |
1596 | **********/ |
1597 | |
1598 | namespace wrap |
1599 | { |
inline uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) noexcept { return ::vbslq_u8(a, b, c); }
inline int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) noexcept { return ::vbslq_s8(a, b, c); }
inline uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) noexcept { return ::vbslq_u16(a, b, c); }
inline int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) noexcept { return ::vbslq_s16(a, b, c); }
inline uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) noexcept { return ::vbslq_u32(a, b, c); }
inline int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) noexcept { return ::vbslq_s32(a, b, c); }
inline uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) noexcept { return ::vbslq_u64(a, b, c); }
inline int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) noexcept { return ::vbslq_s64(a, b, c); }
inline float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) noexcept { return ::vbslq_f32(a, b, c); }
1609 | } |
1610 | |
1611 | namespace detail |
1612 | { |
1613 | template <class... T> |
1614 | struct neon_select_dispatcher_impl |
1615 | { |
1616 | using container_type = std::tuple<T (*)(comp_return_type<T>, T, T)...>; |
1617 | const container_type m_func; |
1618 | |
1619 | template <class U> |
1620 | U apply(comp_return_type<U> cond, U lhs, U rhs) const noexcept |
1621 | { |
1622 | using func_type = U (*)(comp_return_type<U>, U, U); |
1623 | auto func = xsimd::detail::get<func_type>(m_func); |
1624 | return func(cond, lhs, rhs); |
1625 | } |
1626 | }; |
1627 | |
1628 | using neon_select_dispatcher = neon_select_dispatcher_impl<uint8x16_t, int8x16_t, |
1629 | uint16x8_t, int16x8_t, |
1630 | uint32x4_t, int32x4_t, |
1631 | uint64x2_t, int64x2_t, |
1632 | float32x4_t>; |
1633 | } |
1634 | |
1635 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1636 | inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<neon>) noexcept |
1637 | { |
1638 | using bool_register_type = typename batch_bool<T, A>::register_type; |
1639 | using register_type = typename batch<T, A>::register_type; |
1640 | const detail::neon_select_dispatcher dispatcher = { |
std::make_tuple(wrap::vbslq_u8, wrap::vbslq_s8, wrap::vbslq_u16, wrap::vbslq_s16,
                wrap::vbslq_u32, wrap::vbslq_s32, wrap::vbslq_u64, wrap::vbslq_s64,
                wrap::vbslq_f32)
1644 | }; |
1645 | return dispatcher.apply(bool_register_type(cond), register_type(a), register_type(b)); |
1646 | } |
1647 | |
1648 | template <class A, class T, bool... b, detail::enable_neon_type_t<T> = 0> |
1649 | inline batch<T, A> select(batch_bool_constant<batch<T, A>, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<neon>) noexcept |
1650 | { |
1651 | return select(batch_bool<T, A> { b... }, true_br, false_br, neon {}); |
1652 | } |
1653 | |
1654 | /********** |
1655 | * zip_lo * |
1656 | **********/ |
1657 | |
1658 | template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> |
1659 | inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1660 | { |
1661 | uint8x8x2_t tmp = vzip_u8(vget_low_u8(lhs), vget_low_u8(rhs)); |
return vcombine_u8(tmp.val[0], tmp.val[1]);
1663 | } |
1664 | |
1665 | template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> |
1666 | inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1667 | { |
1668 | int8x8x2_t tmp = vzip_s8(vget_low_s8(lhs), vget_low_s8(rhs)); |
return vcombine_s8(tmp.val[0], tmp.val[1]);
1670 | } |
1671 | |
1672 | template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> |
1673 | inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1674 | { |
1675 | uint16x4x2_t tmp = vzip_u16(vget_low_u16(lhs), vget_low_u16(rhs)); |
return vcombine_u16(tmp.val[0], tmp.val[1]);
1677 | } |
1678 | |
1679 | template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> |
1680 | inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1681 | { |
1682 | int16x4x2_t tmp = vzip_s16(vget_low_s16(lhs), vget_low_s16(rhs)); |
return vcombine_s16(tmp.val[0], tmp.val[1]);
1684 | } |
1685 | |
1686 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
1687 | inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1688 | { |
1689 | uint32x2x2_t tmp = vzip_u32(vget_low_u32(lhs), vget_low_u32(rhs)); |
return vcombine_u32(tmp.val[0], tmp.val[1]);
1691 | } |
1692 | |
1693 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
1694 | inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1695 | { |
1696 | int32x2x2_t tmp = vzip_s32(vget_low_s32(lhs), vget_low_s32(rhs)); |
return vcombine_s32(tmp.val[0], tmp.val[1]);
1698 | } |
1699 | |
1700 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
1701 | inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1702 | { |
1703 | return vcombine_u64(vget_low_u64(lhs), vget_low_u64(rhs)); |
1704 | } |
1705 | |
1706 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
1707 | inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1708 | { |
1709 | return vcombine_s64(vget_low_s64(lhs), vget_low_s64(rhs)); |
1710 | } |
1711 | |
1712 | template <class A> |
1713 | inline batch<float, A> zip_lo(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept |
1714 | { |
1715 | float32x2x2_t tmp = vzip_f32(vget_low_f32(lhs), vget_low_f32(rhs)); |
return vcombine_f32(tmp.val[0], tmp.val[1]);
1717 | } |
1718 | |
1719 | /********** |
1720 | * zip_hi * |
1721 | **********/ |
1722 | |
1723 | template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> |
1724 | inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1725 | { |
1726 | uint8x8x2_t tmp = vzip_u8(vget_high_u8(lhs), vget_high_u8(rhs)); |
return vcombine_u8(tmp.val[0], tmp.val[1]);
1728 | } |
1729 | |
1730 | template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> |
1731 | inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1732 | { |
1733 | int8x8x2_t tmp = vzip_s8(vget_high_s8(lhs), vget_high_s8(rhs)); |
return vcombine_s8(tmp.val[0], tmp.val[1]);
1735 | } |
1736 | |
1737 | template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> |
1738 | inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1739 | { |
1740 | uint16x4x2_t tmp = vzip_u16(vget_high_u16(lhs), vget_high_u16(rhs)); |
return vcombine_u16(tmp.val[0], tmp.val[1]);
1742 | } |
1743 | |
1744 | template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> |
1745 | inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1746 | { |
1747 | int16x4x2_t tmp = vzip_s16(vget_high_s16(lhs), vget_high_s16(rhs)); |
return vcombine_s16(tmp.val[0], tmp.val[1]);
1749 | } |
1750 | |
1751 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
1752 | inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1753 | { |
1754 | uint32x2x2_t tmp = vzip_u32(vget_high_u32(lhs), vget_high_u32(rhs)); |
return vcombine_u32(tmp.val[0], tmp.val[1]);
1756 | } |
1757 | |
1758 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
1759 | inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1760 | { |
1761 | int32x2x2_t tmp = vzip_s32(vget_high_s32(lhs), vget_high_s32(rhs)); |
return vcombine_s32(tmp.val[0], tmp.val[1]);
1763 | } |
1764 | |
1765 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
1766 | inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1767 | { |
1768 | return vcombine_u64(vget_high_u64(lhs), vget_high_u64(rhs)); |
1769 | } |
1770 | |
1771 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
1772 | inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1773 | { |
1774 | return vcombine_s64(vget_high_s64(lhs), vget_high_s64(rhs)); |
1775 | } |
1776 | |
1777 | template <class A> |
1778 | inline batch<float, A> zip_hi(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept |
1779 | { |
1780 | float32x2x2_t tmp = vzip_f32(vget_high_f32(lhs), vget_high_f32(rhs)); |
return vcombine_f32(tmp.val[0], tmp.val[1]);
1782 | } |
1783 | |
1784 | /**************** |
1785 | * extract_pair * |
1786 | ****************/ |
1787 | |
1788 | namespace detail |
1789 | { |
1790 | template <class A, class T> |
inline batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& /*rhs*/, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
1792 | { |
1793 | assert(false && "extract_pair out of bounds" ); |
1794 | return batch<T, A> {}; |
1795 | } |
1796 | |
1797 | template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 1> = 0> |
inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1799 | { |
1800 | if (n == I) |
1801 | { |
1802 | return vextq_u8(rhs, lhs, I); |
1803 | } |
1804 | else |
1805 | { |
1806 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1807 | } |
1808 | } |
1809 | |
1810 | template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 1> = 0> |
inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1812 | { |
1813 | if (n == I) |
1814 | { |
1815 | return vextq_s8(rhs, lhs, I); |
1816 | } |
1817 | else |
1818 | { |
1819 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1820 | } |
1821 | } |
1822 | |
1823 | template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 2> = 0> |
inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1825 | { |
1826 | if (n == I) |
1827 | { |
1828 | return vextq_u16(rhs, lhs, I); |
1829 | } |
1830 | else |
1831 | { |
1832 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1833 | } |
1834 | } |
1835 | |
1836 | template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 2> = 0> |
inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1838 | { |
1839 | if (n == I) |
1840 | { |
1841 | return vextq_s16(rhs, lhs, I); |
1842 | } |
1843 | else |
1844 | { |
1845 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1846 | } |
1847 | } |
1848 | |
1849 | template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 4> = 0> |
inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1851 | { |
1852 | if (n == I) |
1853 | { |
1854 | return vextq_u32(rhs, lhs, I); |
1855 | } |
1856 | else |
1857 | { |
1858 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1859 | } |
1860 | } |
1861 | |
1862 | template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 4> = 0> |
inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1864 | { |
1865 | if (n == I) |
1866 | { |
1867 | return vextq_s32(rhs, lhs, I); |
1868 | } |
1869 | else |
1870 | { |
1871 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1872 | } |
1873 | } |
1874 | |
1875 | template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 8> = 0> |
inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1877 | { |
1878 | if (n == I) |
1879 | { |
1880 | return vextq_u64(rhs, lhs, I); |
1881 | } |
1882 | else |
1883 | { |
1884 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1885 | } |
1886 | } |
1887 | |
1888 | template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 8> = 0> |
inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1890 | { |
1891 | if (n == I) |
1892 | { |
1893 | return vextq_s64(rhs, lhs, I); |
1894 | } |
1895 | else |
1896 | { |
1897 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1898 | } |
1899 | } |
1900 | |
1901 | template <class A, size_t I, size_t... Is> |
inline batch<float, A> extract_pair(batch<float, A> const& lhs, batch<float, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1903 | { |
1904 | if (n == I) |
1905 | { |
1906 | return vextq_f32(rhs, lhs, I); |
1907 | } |
1908 | else |
1909 | { |
1910 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1911 | } |
1912 | } |
1913 | |
1914 | template <class A, class T, size_t... Is> |
inline batch<T, A> extract_pair_impl(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<0, Is...>) noexcept
1916 | { |
1917 | if (n == 0) |
1918 | { |
1919 | return rhs; |
1920 | } |
1921 | else |
1922 | { |
1923 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1924 | } |
1925 | } |
1926 | } |
1927 | |
1928 | template <class A, class T> |
inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, requires_arch<neon>) noexcept
1930 | { |
1931 | constexpr std::size_t size = batch<T, A>::size; |
1932 | assert(n < size && "index in bounds" ); |
1933 | return detail::extract_pair_impl(lhs, rhs, n, ::xsimd::detail::make_index_sequence<size>()); |
1934 | } |
1935 | |
1936 | /****************** |
1937 | * bitwise_lshift * |
1938 | ******************/ |
1939 | |
1940 | namespace detail |
1941 | { |
1942 | template <class A, class T> |
1943 | inline batch<T, A> bitwise_lshift(batch<T, A> const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept |
1944 | { |
1945 | assert(false && "bitwise_lshift out of bounds" ); |
1946 | return batch<T, A> {}; |
1947 | } |
1948 | |
1949 | template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 1> = 0> |
1950 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
1951 | { |
1952 | if (n == I) |
1953 | { |
1954 | return vshlq_n_u8(lhs, I); |
1955 | } |
1956 | else |
1957 | { |
1958 | return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
1959 | } |
1960 | } |
1961 | |
1962 | template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 1> = 0> |
1963 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
1964 | { |
1965 | if (n == I) |
1966 | { |
1967 | return vshlq_n_s8(lhs, I); |
1968 | } |
1969 | else |
1970 | { |
1971 | return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
1972 | } |
1973 | } |
1974 | |
1975 | template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 2> = 0> |
1976 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
1977 | { |
1978 | if (n == I) |
1979 | { |
1980 | return vshlq_n_u16(lhs, I); |
1981 | } |
1982 | else |
1983 | { |
1984 | return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
1985 | } |
1986 | } |
1987 | |
1988 | template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 2> = 0> |
1989 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
1990 | { |
1991 | if (n == I) |
1992 | { |
1993 | return vshlq_n_s16(lhs, I); |
1994 | } |
1995 | else |
1996 | { |
1997 | return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
1998 | } |
1999 | } |
2000 | |
2001 | template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 4> = 0> |
2002 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2003 | { |
2004 | if (n == I) |
2005 | { |
2006 | return vshlq_n_u32(lhs, I); |
2007 | } |
2008 | else |
2009 | { |
2010 | return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2011 | } |
2012 | } |
2013 | |
2014 | template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 4> = 0> |
2015 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2016 | { |
2017 | if (n == I) |
2018 | { |
2019 | return vshlq_n_s32(lhs, I); |
2020 | } |
2021 | else |
2022 | { |
2023 | return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2024 | } |
2025 | } |
2026 | |
2027 | template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 8> = 0> |
2028 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2029 | { |
2030 | if (n == I) |
2031 | { |
2032 | return vshlq_n_u64(lhs, I); |
2033 | } |
2034 | else |
2035 | { |
2036 | return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2037 | } |
2038 | } |
2039 | |
2040 | template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 8> = 0> |
2041 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2042 | { |
2043 | if (n == I) |
2044 | { |
2045 | return vshlq_n_s64(lhs, I); |
2046 | } |
2047 | else |
2048 | { |
2049 | return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2050 | } |
2051 | } |
2052 | |
2053 | template <class A, class T, int... Is> |
2054 | inline batch<T, A> bitwise_lshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept |
2055 | { |
2056 | if (n == 0) |
2057 | { |
2058 | return lhs; |
2059 | } |
2060 | else |
2061 | { |
2062 | return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2063 | } |
2064 | } |
2065 | } |
2066 | |
2067 | template <class A, class T> |
2068 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept |
2069 | { |
2070 | constexpr int size = sizeof(typename batch<T, A>::value_type) * 8; |
2071 | assert(0 <= n && n < size && "index in bounds" ); |
2072 | return detail::bitwise_lshift_impl(lhs, n, ::xsimd::detail::make_int_sequence<size>()); |
2073 | } |
2074 | |
2075 | template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> |
2076 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept |
2077 | { |
2078 | return vshlq_u8(lhs, rhs); |
2079 | } |
2080 | |
2081 | template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> |
2082 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
2083 | { |
2084 | return vshlq_s8(lhs, rhs); |
2085 | } |
2086 | |
2087 | template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> |
2088 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept |
2089 | { |
2090 | return vshlq_u16(lhs, rhs); |
2091 | } |
2092 | |
2093 | template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> |
2094 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
2095 | { |
2096 | return vshlq_s16(lhs, rhs); |
2097 | } |
2098 | |
2099 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
2100 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept |
2101 | { |
2102 | return vshlq_u32(lhs, rhs); |
2103 | } |
2104 | |
2105 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
2106 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
2107 | { |
2108 | return vshlq_s32(lhs, rhs); |
2109 | } |
2110 | |
2111 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
2112 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept |
2113 | { |
2114 | return vshlq_u64(lhs, rhs); |
2115 | } |
2116 | |
2117 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
2118 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
2119 | { |
2120 | return vshlq_s64(lhs, rhs); |
2121 | } |
2122 | |
2123 | /****************** |
2124 | * bitwise_rshift * |
2125 | ******************/ |
2126 | |
2127 | namespace detail |
2128 | { |
2129 | template <class A, class T> |
2130 | inline batch<T, A> bitwise_rshift(batch<T, A> const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept |
2131 | { |
2132 | assert(false && "bitwise_rshift out of bounds" ); |
2133 | return batch<T, A> {}; |
2134 | } |
2135 | |
2136 | template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 1> = 0> |
2137 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2138 | { |
2139 | if (n == I) |
2140 | { |
2141 | return vshrq_n_u8(lhs, I); |
2142 | } |
2143 | else |
2144 | { |
2145 | return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2146 | } |
2147 | } |
2148 | |
2149 | template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 1> = 0> |
2150 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2151 | { |
2152 | if (n == I) |
2153 | { |
2154 | return vshrq_n_s8(lhs, I); |
2155 | } |
2156 | else |
2157 | { |
2158 | return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2159 | } |
2160 | } |
2161 | |
2162 | template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 2> = 0> |
2163 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2164 | { |
2165 | if (n == I) |
2166 | { |
2167 | return vshrq_n_u16(lhs, I); |
2168 | } |
2169 | else |
2170 | { |
2171 | return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2172 | } |
2173 | } |
2174 | |
2175 | template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 2> = 0> |
2176 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2177 | { |
2178 | if (n == I) |
2179 | { |
2180 | return vshrq_n_s16(lhs, I); |
2181 | } |
2182 | else |
2183 | { |
2184 | return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2185 | } |
2186 | } |
2187 | |
2188 | template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 4> = 0> |
2189 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2190 | { |
2191 | if (n == I) |
2192 | { |
2193 | return vshrq_n_u32(lhs, I); |
2194 | } |
2195 | else |
2196 | { |
2197 | return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2198 | } |
2199 | } |
2200 | |
2201 | template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 4> = 0> |
2202 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2203 | { |
2204 | if (n == I) |
2205 | { |
2206 | return vshrq_n_s32(lhs, I); |
2207 | } |
2208 | else |
2209 | { |
2210 | return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2211 | } |
2212 | } |
2213 | |
2214 | template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 8> = 0> |
2215 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2216 | { |
2217 | if (n == I) |
2218 | { |
2219 | return vshrq_n_u64(lhs, I); |
2220 | } |
2221 | else |
2222 | { |
2223 | return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2224 | } |
2225 | } |
2226 | |
2227 | template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 8> = 0> |
2228 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2229 | { |
2230 | if (n == I) |
2231 | { |
2232 | return vshrq_n_s64(lhs, I); |
2233 | } |
2234 | else |
2235 | { |
2236 | return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2237 | } |
2238 | } |
2239 | |
2240 | template <class A, class T, int... Is> |
2241 | inline batch<T, A> bitwise_rshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept |
2242 | { |
2243 | if (n == 0) |
2244 | { |
2245 | return lhs; |
2246 | } |
2247 | else |
2248 | { |
2249 | return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2250 | } |
2251 | } |
2252 | } |
2253 | |
2254 | template <class A, class T> |
2255 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept |
2256 | { |
2257 | constexpr int size = sizeof(typename batch<T, A>::value_type) * 8; |
2258 | assert(0 <= n && n < size && "index in bounds" ); |
2259 | return detail::bitwise_rshift_impl(lhs, n, ::xsimd::detail::make_int_sequence<size>()); |
2260 | } |
2261 | |
2262 | template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> |
2263 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept |
2264 | { |
2265 | return vshlq_u8(lhs, vnegq_s8(rhs)); |
2266 | } |
2267 | |
2268 | template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> |
2269 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
2270 | { |
2271 | return vshlq_s8(lhs, vnegq_s8(rhs)); |
2272 | } |
2273 | |
2274 | template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> |
2275 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept |
2276 | { |
2277 | return vshlq_u16(lhs, vnegq_s16(rhs)); |
2278 | } |
2279 | |
2280 | template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> |
2281 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
2282 | { |
2283 | return vshlq_s16(lhs, vnegq_s16(rhs)); |
2284 | } |
2285 | |
2286 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
2287 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept |
2288 | { |
2289 | return vshlq_u32(lhs, vnegq_s32(rhs)); |
2290 | } |
2291 | |
2292 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
2293 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
2294 | { |
2295 | return vshlq_s32(lhs, vnegq_s32(rhs)); |
2296 | } |
2297 | |
2298 | // Overloads of bitwise shifts accepting two batches of uint64/int64 are not available with ARMv7 |
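// A possible ARMv7-compatible emulation (illustrative sketch only, not one
// of the dispatched kernels; the helper name is hypothetical): negate the
// per-lane shift amounts with vsubq_s64 and reuse the variable left-shift
// intrinsic, e.g.
//
//     inline uint64x2_t rshift_u64_sketch(uint64x2_t lhs, int64x2_t rhs) noexcept
//     {
//         return vshlq_u64(lhs, vsubq_s64(vdupq_n_s64(0), rhs));
//     }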
2299 | |
2300 | /******* |
2301 | * all * |
2302 | *******/ |
2303 | |
2304 | template <class A, class T, detail::enable_sized_t<T, 8> = 0> |
2305 | inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
2306 | { |
2307 | uint64x1_t tmp = vand_u64(vget_low_u64(arg), vget_high_u64(arg)); |
2308 | return vget_lane_u64(tmp, 0) == ~0ULL; |
2309 | } |
2310 | |
2311 | template <class A, class T, detail::enable_sized_t<T, 1> = 0> |
2312 | inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
2313 | { |
2314 | return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u8(arg)), neon {}); |
2315 | } |
2316 | |
2317 | template <class A, class T, detail::enable_sized_t<T, 2> = 0> |
2318 | inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
2319 | { |
2320 | return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u16(arg)), neon {}); |
2321 | } |
2322 | |
2323 | template <class A, class T, detail::enable_sized_t<T, 4> = 0> |
2324 | inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
2325 | { |
2326 | return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {}); |
2327 | } |
2328 | |
2329 | /******* |
2330 | * any * |
2331 | *******/ |
2332 | |
2333 | template <class A, class T, detail::enable_sized_t<T, 8> = 0> |
2334 | inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
2335 | { |
2336 | uint32x2_t tmp = vqmovn_u64(arg); |
2337 | return vget_lane_u64(vreinterpret_u64_u32(tmp), 0) != 0; |
2338 | } |
2339 | |
2340 | template <class A, class T, detail::enable_sized_t<T, 1> = 0> |
2341 | inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
2342 | { |
2343 | return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u8(arg)), neon {}); |
2344 | } |
2345 | |
2346 | template <class A, class T, detail::enable_sized_t<T, 2> = 0> |
2347 | inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
2348 | { |
2349 | return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u16(arg)), neon {}); |
2350 | } |
2351 | |
2352 | template <class A, class T, detail::enable_sized_t<T, 4> = 0> |
2353 | inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
2354 | { |
2355 | return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {}); |
2356 | } |
2357 | |
2358 | /**************** |
2359 | * bitwise_cast * |
2360 | ****************/ |
2361 | |
2362 | #define WRAP_CAST(SUFFIX, TYPE) \ |
2363 | namespace wrap \ |
2364 | { \ |
2365 | inline TYPE vreinterpretq_##SUFFIX##_u8(uint8x16_t a) noexcept \ |
2366 | { \ |
2367 | return ::vreinterpretq_##SUFFIX##_u8(a); \ |
2368 | } \ |
2369 | inline TYPE vreinterpretq_##SUFFIX##_s8(int8x16_t a) noexcept \ |
2370 | { \ |
2371 | return ::vreinterpretq_##SUFFIX##_s8(a); \ |
2372 | } \ |
2373 | inline TYPE vreinterpretq_##SUFFIX##_u16(uint16x8_t a) noexcept \ |
2374 | { \ |
2375 | return ::vreinterpretq_##SUFFIX##_u16(a); \ |
2376 | } \ |
2377 | inline TYPE vreinterpretq_##SUFFIX##_s16(int16x8_t a) noexcept \ |
2378 | { \ |
2379 | return ::vreinterpretq_##SUFFIX##_s16(a); \ |
2380 | } \ |
2381 | inline TYPE vreinterpretq_##SUFFIX##_u32(uint32x4_t a) noexcept \ |
2382 | { \ |
2383 | return ::vreinterpretq_##SUFFIX##_u32(a); \ |
2384 | } \ |
2385 | inline TYPE vreinterpretq_##SUFFIX##_s32(int32x4_t a) noexcept \ |
2386 | { \ |
2387 | return ::vreinterpretq_##SUFFIX##_s32(a); \ |
2388 | } \ |
2389 | inline TYPE vreinterpretq_##SUFFIX##_u64(uint64x2_t a) noexcept \ |
2390 | { \ |
2391 | return ::vreinterpretq_##SUFFIX##_u64(a); \ |
2392 | } \ |
2393 | inline TYPE vreinterpretq_##SUFFIX##_s64(int64x2_t a) noexcept \ |
2394 | { \ |
2395 | return ::vreinterpretq_##SUFFIX##_s64(a); \ |
2396 | } \ |
2397 | inline TYPE vreinterpretq_##SUFFIX##_f32(float32x4_t a) noexcept \ |
2398 | { \ |
2399 | return ::vreinterpretq_##SUFFIX##_f32(a); \ |
2400 | } \ |
2401 | } |
2402 | |
2403 | WRAP_CAST(u8, uint8x16_t) |
2404 | WRAP_CAST(s8, int8x16_t) |
2405 | WRAP_CAST(u16, uint16x8_t) |
2406 | WRAP_CAST(s16, int16x8_t) |
2407 | WRAP_CAST(u32, uint32x4_t) |
2408 | WRAP_CAST(s32, int32x4_t) |
2409 | WRAP_CAST(u64, uint64x2_t) |
2410 | WRAP_CAST(s64, int64x2_t) |
2411 | WRAP_CAST(f32, float32x4_t) |
2412 | |
2413 | #undef WRAP_CAST |
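// bitwise_cast is driven by a two-level table of the vreinterpretq wrappers
// above: the outer tuple is indexed by the destination register type and
// each inner bitwise_caster_impl by the source register type, so any of the
// nine NEON register types can be reinterpreted as any other.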
2414 | |
2415 | namespace detail |
2416 | { |
2417 | template <class R, class... T> |
2418 | struct bitwise_caster_impl |
2419 | { |
2420 | using container_type = std::tuple<R (*)(T)...>; |
2421 | container_type m_func; |
2422 | |
2423 | template <class U> |
2424 | R apply(U rhs) const noexcept |
2425 | { |
2426 | using func_type = R (*)(U); |
2427 | auto func = xsimd::detail::get<func_type>(m_func); |
2428 | return func(rhs); |
2429 | } |
2430 | }; |
2431 | |
2432 | template <class R, class... T> |
2433 | inline const bitwise_caster_impl<R, T...> make_bitwise_caster_impl(R (*... arg)(T)) noexcept |
2434 | { |
2435 | return { std::make_tuple(arg...) }; |
2436 | } |
2437 | |
2438 | template <class... T> |
2439 | struct type_list |
2440 | { |
2441 | }; |
2442 | |
2443 | template <class RTL, class TTL> |
2444 | struct bitwise_caster; |
2445 | |
2446 | template <class... R, class... T> |
2447 | struct bitwise_caster<type_list<R...>, type_list<T...>> |
2448 | { |
2449 | using container_type = std::tuple<bitwise_caster_impl<R, T...>...>; |
2450 | container_type m_caster; |
2451 | |
2452 | template <class V, class U> |
2453 | V apply(U rhs) const noexcept |
2454 | { |
2455 | using caster_type = bitwise_caster_impl<V, T...>; |
2456 | auto caster = xsimd::detail::get<caster_type>(m_caster); |
2457 | return caster.apply(rhs); |
2458 | } |
2459 | }; |
2460 | |
2461 | template <class... T> |
2462 | using bitwise_caster_t = bitwise_caster<type_list<T...>, type_list<T...>>; |
2463 | |
2464 | using neon_bitwise_caster = bitwise_caster_t<uint8x16_t, int8x16_t, |
2465 | uint16x8_t, int16x8_t, |
2466 | uint32x4_t, int32x4_t, |
2467 | uint64x2_t, int64x2_t, |
2468 | float32x4_t>; |
2469 | } |
2470 | |
2471 | template <class A, class T, class R> |
2472 | inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<neon>) noexcept |
2473 | { |
2474 | const detail::neon_bitwise_caster caster = { |
std::make_tuple(
    detail::make_bitwise_caster_impl(wrap::vreinterpretq_u8_u8, wrap::vreinterpretq_u8_s8, wrap::vreinterpretq_u8_u16, wrap::vreinterpretq_u8_s16,
                                     wrap::vreinterpretq_u8_u32, wrap::vreinterpretq_u8_s32, wrap::vreinterpretq_u8_u64, wrap::vreinterpretq_u8_s64,
                                     wrap::vreinterpretq_u8_f32),
    detail::make_bitwise_caster_impl(wrap::vreinterpretq_s8_u8, wrap::vreinterpretq_s8_s8, wrap::vreinterpretq_s8_u16, wrap::vreinterpretq_s8_s16,
                                     wrap::vreinterpretq_s8_u32, wrap::vreinterpretq_s8_s32, wrap::vreinterpretq_s8_u64, wrap::vreinterpretq_s8_s64,
                                     wrap::vreinterpretq_s8_f32),
    detail::make_bitwise_caster_impl(wrap::vreinterpretq_u16_u8, wrap::vreinterpretq_u16_s8, wrap::vreinterpretq_u16_u16, wrap::vreinterpretq_u16_s16,
                                     wrap::vreinterpretq_u16_u32, wrap::vreinterpretq_u16_s32, wrap::vreinterpretq_u16_u64, wrap::vreinterpretq_u16_s64,
                                     wrap::vreinterpretq_u16_f32),
    detail::make_bitwise_caster_impl(wrap::vreinterpretq_s16_u8, wrap::vreinterpretq_s16_s8, wrap::vreinterpretq_s16_u16, wrap::vreinterpretq_s16_s16,
                                     wrap::vreinterpretq_s16_u32, wrap::vreinterpretq_s16_s32, wrap::vreinterpretq_s16_u64, wrap::vreinterpretq_s16_s64,
                                     wrap::vreinterpretq_s16_f32),
    detail::make_bitwise_caster_impl(wrap::vreinterpretq_u32_u8, wrap::vreinterpretq_u32_s8, wrap::vreinterpretq_u32_u16, wrap::vreinterpretq_u32_s16,
                                     wrap::vreinterpretq_u32_u32, wrap::vreinterpretq_u32_s32, wrap::vreinterpretq_u32_u64, wrap::vreinterpretq_u32_s64,
                                     wrap::vreinterpretq_u32_f32),
    detail::make_bitwise_caster_impl(wrap::vreinterpretq_s32_u8, wrap::vreinterpretq_s32_s8, wrap::vreinterpretq_s32_u16, wrap::vreinterpretq_s32_s16,
                                     wrap::vreinterpretq_s32_u32, wrap::vreinterpretq_s32_s32, wrap::vreinterpretq_s32_u64, wrap::vreinterpretq_s32_s64,
                                     wrap::vreinterpretq_s32_f32),
    detail::make_bitwise_caster_impl(wrap::vreinterpretq_u64_u8, wrap::vreinterpretq_u64_s8, wrap::vreinterpretq_u64_u16, wrap::vreinterpretq_u64_s16,
                                     wrap::vreinterpretq_u64_u32, wrap::vreinterpretq_u64_s32, wrap::vreinterpretq_u64_u64, wrap::vreinterpretq_u64_s64,
                                     wrap::vreinterpretq_u64_f32),
    detail::make_bitwise_caster_impl(wrap::vreinterpretq_s64_u8, wrap::vreinterpretq_s64_s8, wrap::vreinterpretq_s64_u16, wrap::vreinterpretq_s64_s16,
                                     wrap::vreinterpretq_s64_u32, wrap::vreinterpretq_s64_s32, wrap::vreinterpretq_s64_u64, wrap::vreinterpretq_s64_s64,
                                     wrap::vreinterpretq_s64_f32),
    detail::make_bitwise_caster_impl(wrap::vreinterpretq_f32_u8, wrap::vreinterpretq_f32_s8, wrap::vreinterpretq_f32_u16, wrap::vreinterpretq_f32_s16,
                                     wrap::vreinterpretq_f32_u32, wrap::vreinterpretq_f32_s32, wrap::vreinterpretq_f32_u64, wrap::vreinterpretq_f32_s64,
                                     wrap::vreinterpretq_f32_f32))
2503 | }; |
2504 | using src_register_type = typename batch<T, A>::register_type; |
2505 | using dst_register_type = typename batch<R, A>::register_type; |
2506 | return caster.apply<dst_register_type>(src_register_type(arg)); |
2507 | } |
2508 | |
2509 | /********* |
2510 | * isnan * |
2511 | *********/ |
2512 | |
2513 | template <class A> |
2514 | inline batch_bool<float, A> isnan(batch<float, A> const& arg, requires_arch<neon>) noexcept |
2515 | { |
2516 | return !(arg == arg); |
2517 | } |
2518 | |
2519 | // slide_left |
2520 | namespace detail |
2521 | { |
2522 | template <size_t N> |
2523 | struct slider_left |
2524 | { |
2525 | template <class A, class T> |
2526 | inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept |
2527 | { |
const auto left = vdupq_n_u8(0);
2529 | const auto right = bitwise_cast<batch<uint8_t, A>>(x).data; |
2530 | const batch<uint8_t, A> res(vextq_u8(left, right, 16 - N)); |
2531 | return bitwise_cast<batch<T, A>>(res); |
2532 | } |
2533 | }; |
2534 | |
2535 | template <> |
2536 | struct slider_left<0> |
2537 | { |
2538 | template <class A, class T> |
2539 | inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept |
2540 | { |
2541 | return x; |
2542 | } |
2543 | }; |
2544 | } // namespace detail |
2545 | |
2546 | template <size_t N, class A, class T> |
2547 | inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<neon>) noexcept |
2548 | { |
2549 | return detail::slider_left<N> {}(x, A {}); |
2550 | } |
2551 | |
2552 | // slide_right |
2553 | namespace detail |
2554 | { |
2555 | template <size_t N> |
2556 | struct slider_right |
2557 | { |
2558 | template <class A, class T> |
2559 | inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept |
2560 | { |
2561 | const auto left = bitwise_cast<batch<uint8_t, A>>(x).data; |
const auto right = vdupq_n_u8(0);
2563 | const batch<uint8_t, A> res(vextq_u8(left, right, N)); |
2564 | return bitwise_cast<batch<T, A>>(res); |
2565 | } |
2566 | }; |
2567 | |
2568 | template <> |
2569 | struct slider_right<16> |
2570 | { |
2571 | template <class A, class T> |
2572 | inline batch<T, A> operator()(batch<T, A> const&, requires_arch<neon>) noexcept |
2573 | { |
2574 | return batch<T, A> {}; |
2575 | } |
2576 | }; |
2577 | } // namespace detail |
2578 | |
2579 | template <size_t N, class A, class T> |
2580 | inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<neon>) noexcept |
2581 | { |
2582 | return detail::slider_right<N> {}(x, A {}); |
2583 | } |
2584 | } |
2585 | |
2586 | template <class batch_type, typename batch_type::value_type... Values> |
2587 | struct batch_constant; |
2588 | |
2589 | namespace kernel |
2590 | { |
2591 | /*********** |
2592 | * swizzle * |
2593 | ***********/ |
2594 | |
2595 | template <class A, class T, class I, I... idx> |
2596 | inline batch<T, A> swizzle(batch<T, A> const& self, |
2597 | batch_constant<batch<I, A>, idx...>, |
2598 | requires_arch<neon>) noexcept |
2599 | { |
2600 | static_assert(batch<T, A>::size == sizeof...(idx), "valid swizzle indices" ); |
2601 | std::array<T, batch<T, A>::size> data; |
2602 | self.store_aligned(data.data()); |
2603 | return set(batch<T, A>(), A(), data[idx]...); |
2604 | } |
2605 | } |
2606 | } |
2607 | |
2608 | #undef WRAP_BINARY_INT_EXCLUDING_64 |
2609 | #undef WRAP_BINARY_INT |
2610 | #undef WRAP_BINARY_FLOAT |
2611 | #undef WRAP_UNARY_INT_EXCLUDING_64 |
2612 | #undef WRAP_UNARY_INT |
2613 | #undef WRAP_UNARY_FLOAT |
2614 | |
2615 | #endif |
2616 | |