1 | /*************************************************************************** |
2 | * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * |
3 | * Martin Renou * |
4 | * Copyright (c) QuantStack * |
5 | * Copyright (c) Serge Guelton * |
6 | * * |
7 | * Distributed under the terms of the BSD 3-Clause License. * |
8 | * * |
9 | * The full license is in the file LICENSE, distributed with this software. * |
10 | ****************************************************************************/ |
11 | |
12 | #ifndef XSIMD_NEON64_HPP |
13 | #define XSIMD_NEON64_HPP |
14 | |
15 | #include <complex> |
16 | #include <cstddef> |
17 | #include <tuple> |
18 | |
19 | #include "../types/xsimd_neon64_register.hpp" |
20 | #include "../types/xsimd_utils.hpp" |
21 | |
22 | namespace xsimd |
23 | { |
24 | template <class batch_type, bool... Values> |
25 | struct batch_bool_constant; |
26 | |
27 | namespace kernel |
28 | { |
29 | using namespace types; |
30 | |
31 | /******* |
32 | * all * |
33 | *******/ |
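        // A batch_bool lane is either all ones or all zeros, so `all` holds
        // exactly when the across-lanes minimum (vminvq_u32) is still all ones.
        // The 8-, 16- and 64-bit variants reinterpret the mask as uint32x4_t,
        // which preserves the all-ones/all-zeros pattern, and reuse that check.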
34 | |
35 | template <class A, class T, detail::enable_sized_t<T, 4> = 0> |
36 | inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept |
37 | { |
38 | return vminvq_u32(arg) == ~0U; |
39 | } |
40 | |
41 | template <class A, class T, detail::enable_sized_t<T, 1> = 0> |
42 | inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept |
43 | { |
44 | return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u8(arg)), neon64 {}); |
45 | } |
46 | |
47 | template <class A, class T, detail::enable_sized_t<T, 2> = 0> |
48 | inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept |
49 | { |
50 | return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u16(arg)), neon64 {}); |
51 | } |
52 | |
53 | template <class A, class T, detail::enable_sized_t<T, 8> = 0> |
54 | inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept |
55 | { |
56 | return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u64(arg)), neon64 {}); |
57 | } |
58 | |
59 | /******* |
60 | * any * |
61 | *******/ |
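        // Dual of `all`: the across-lanes maximum (vmaxvq_u32) is non-zero
        // exactly when at least one lane of the mask is set.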
62 | |
63 | template <class A, class T, detail::enable_sized_t<T, 4> = 0> |
64 | inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept |
65 | { |
66 | return vmaxvq_u32(arg) != 0; |
67 | } |
68 | |
69 | template <class A, class T, detail::enable_sized_t<T, 1> = 0> |
70 | inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept |
71 | { |
72 | return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u8(arg)), neon64 {}); |
73 | } |
74 | |
75 | template <class A, class T, detail::enable_sized_t<T, 2> = 0> |
76 | inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept |
77 | { |
78 | return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u16(arg)), neon64 {}); |
79 | } |
80 | |
81 | template <class A, class T, detail::enable_sized_t<T, 8> = 0> |
82 | inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept |
83 | { |
84 | return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u64(arg)), neon64 {}); |
85 | } |
86 | |
87 | /************* |
88 | * broadcast * |
89 | *************/ |
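        // The generic overload below simply forwards to the 32-bit NEON kernel;
        // only the double overload is A64-specific. Illustrative effect:
        // broadcasting 1.5 yields the register { 1.5, 1.5 } via vdupq_n_f64.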
90 | |
91 | // Required to avoid ambiguous call |
92 | template <class A, class T> |
93 | inline batch<T, A> broadcast(T val, requires_arch<neon64>) noexcept |
94 | { |
95 | return broadcast<neon64>(val, neon {}); |
96 | } |
97 | |
98 | template <class A> |
99 | inline batch<double, A> broadcast(double val, requires_arch<neon64>) noexcept |
100 | { |
            return vdupq_n_f64(val);
102 | } |
103 | |
104 | /******* |
105 | * set * |
106 | *******/ |
107 | |
108 | template <class A> |
109 | inline batch<double, A> set(batch<double, A> const&, requires_arch<neon64>, double d0, double d1) noexcept |
110 | { |
111 | return float64x2_t { d0, d1 }; |
112 | } |
113 | |
114 | template <class A> |
115 | inline batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<neon64>, bool b0, bool b1) noexcept |
116 | { |
117 | using register_type = typename batch_bool<double, A>::register_type; |
118 | using unsigned_type = as_unsigned_integer_t<double>; |
119 | return register_type { static_cast<unsigned_type>(b0 ? -1LL : 0LL), |
120 | static_cast<unsigned_type>(b1 ? -1LL : 0LL) }; |
121 | } |
122 | |
123 | /************* |
124 | * from_bool * |
125 | *************/ |
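        // The mask is ANDed with the bit pattern of 1.0, so true lanes keep the
        // representation of 1.0 and false lanes become +0.0 once reinterpreted
        // back to double.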
126 | |
127 | template <class A> |
128 | inline batch<double, A> from_bool(batch_bool<double, A> const& arg, requires_arch<neon64>) noexcept |
129 | { |
            return vreinterpretq_f64_u64(vandq_u64(arg, vreinterpretq_u64_f64(vdupq_n_f64(1.))));
131 | } |
132 | |
133 | /******** |
134 | * load * |
135 | ********/ |
136 | |
137 | template <class A> |
138 | inline batch<double, A> load_aligned(double const* src, convert<double>, requires_arch<neon64>) noexcept |
139 | { |
140 | return vld1q_f64(src); |
141 | } |
142 | |
143 | template <class A> |
144 | inline batch<double, A> load_unaligned(double const* src, convert<double>, requires_arch<neon64>) noexcept |
145 | { |
146 | return load_aligned<A>(src, convert<double>(), A {}); |
147 | } |
148 | |
149 | /********* |
150 | * store * |
151 | *********/ |
152 | |
153 | template <class A> |
154 | inline void store_aligned(double* dst, batch<double, A> const& src, requires_arch<neon64>) noexcept |
155 | { |
156 | vst1q_f64(dst, src); |
157 | } |
158 | |
159 | template <class A> |
160 | inline void store_unaligned(double* dst, batch<double, A> const& src, requires_arch<neon64>) noexcept |
161 | { |
162 | return store_aligned<A>(dst, src, A {}); |
163 | } |
164 | |
165 | /**************** |
166 | * load_complex * |
167 | ****************/ |
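        // vld2q_f64 loads two std::complex<double> (four doubles) and
        // de-interleaves them: even elements (the real parts) end up in val[0],
        // odd elements (the imaginary parts) in val[1].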
168 | |
169 | template <class A> |
170 | inline batch<std::complex<double>, A> load_complex_aligned(std::complex<double> const* mem, convert<std::complex<double>>, requires_arch<neon64>) noexcept |
171 | { |
172 | using real_batch = batch<double, A>; |
173 | const double* buf = reinterpret_cast<const double*>(mem); |
174 | float64x2x2_t tmp = vld2q_f64(buf); |
175 | real_batch real = tmp.val[0], |
176 | imag = tmp.val[1]; |
177 | return batch<std::complex<double>, A> { real, imag }; |
178 | } |
179 | |
180 | template <class A> |
181 | inline batch<std::complex<double>, A> load_complex_unaligned(std::complex<double> const* mem, convert<std::complex<double>> cvt, requires_arch<neon64>) noexcept |
182 | { |
183 | return load_complex_aligned<A>(mem, cvt, A {}); |
184 | } |
185 | |
186 | /***************** |
187 | * store_complex * |
188 | *****************/ |
189 | |
190 | template <class A> |
191 | inline void store_complex_aligned(std::complex<double>* dst, batch<std::complex<double>, A> const& src, requires_arch<neon64>) noexcept |
192 | { |
193 | float64x2x2_t tmp; |
194 | tmp.val[0] = src.real(); |
195 | tmp.val[1] = src.imag(); |
196 | double* buf = reinterpret_cast<double*>(dst); |
197 | vst2q_f64(buf, tmp); |
198 | } |
199 | |
200 | template <class A> |
201 | inline void store_complex_unaligned(std::complex<double>* dst, batch<std::complex<double>, A> const& src, requires_arch<neon64>) noexcept |
202 | { |
203 | store_complex_aligned(dst, src, A {}); |
204 | } |
205 | |
206 | /******* |
207 | * neg * |
208 | *******/ |
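        // Unsigned 64-bit negation reinterprets to signed, negates (two's
        // complement, i.e. modulo 2^64) and reinterprets back; signed 64-bit and
        // double lanes map directly to vnegq.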
209 | |
210 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
211 | inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
212 | { |
213 | return vreinterpretq_u64_s64(vnegq_s64(vreinterpretq_s64_u64(rhs))); |
214 | } |
215 | |
216 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
217 | inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
218 | { |
219 | return vnegq_s64(rhs); |
220 | } |
221 | |
222 | template <class A> |
223 | inline batch<double, A> neg(batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
224 | { |
225 | return vnegq_f64(rhs); |
226 | } |
227 | |
228 | /******* |
229 | * add * |
230 | *******/ |
231 | |
232 | template <class A> |
233 | inline batch<double, A> add(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
234 | { |
235 | return vaddq_f64(lhs, rhs); |
236 | } |
237 | |
238 | /******** |
239 | * sadd * |
240 | ********/ |
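        // Saturated addition of IEEE doubles cannot wrap (overflow goes to
        // +/-infinity), so it reduces to ordinary addition. The same reasoning
        // applies to ssub below.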
241 | |
242 | template <class A> |
243 | inline batch<double, A> sadd(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
244 | { |
245 | return add(lhs, rhs, neon64 {}); |
246 | } |
247 | |
248 | /******* |
249 | * sub * |
250 | *******/ |
251 | |
252 | template <class A> |
253 | inline batch<double, A> sub(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
254 | { |
255 | return vsubq_f64(lhs, rhs); |
256 | } |
257 | |
258 | /******** |
259 | * ssub * |
260 | ********/ |
261 | |
262 | template <class A> |
263 | inline batch<double, A> ssub(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
264 | { |
265 | return sub(lhs, rhs, neon64 {}); |
266 | } |
267 | |
268 | /******* |
269 | * mul * |
270 | *******/ |
271 | |
272 | template <class A> |
273 | inline batch<double, A> mul(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
274 | { |
275 | return vmulq_f64(lhs, rhs); |
276 | } |
277 | |
278 | /******* |
279 | * div * |
280 | *******/ |
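        // When XSIMD_FAST_INTEGER_DIVISION is defined, 64-bit integer division
        // goes through a double round-trip. This is fast but can be inexact once
        // values no longer fit in the 53-bit mantissa of a double, which is why
        // it is opt-in.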
281 | |
282 | #if defined(XSIMD_FAST_INTEGER_DIVISION) |
283 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
284 | inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
285 | { |
286 | return vcvtq_u64_f64(vcvtq_f64_u64(lhs) / vcvtq_f64_u64(rhs)); |
287 | } |
288 | |
289 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
290 | inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
291 | { |
292 | return vcvtq_s64_f64(vcvtq_f64_s64(lhs) / vcvtq_f64_s64(rhs)); |
293 | } |
294 | #endif |
295 | template <class A> |
296 | inline batch<double, A> div(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
297 | { |
298 | return vdivq_f64(lhs, rhs); |
299 | } |
300 | |
301 | /****** |
302 | * eq * |
303 | ******/ |
304 | |
305 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
306 | inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
307 | { |
308 | return vceqq_u64(lhs, rhs); |
309 | } |
310 | |
311 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
312 | inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
313 | { |
314 | return vceqq_s64(lhs, rhs); |
315 | } |
316 | |
317 | template <class A> |
318 | inline batch_bool<double, A> eq(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
319 | { |
320 | return vceqq_f64(lhs, rhs); |
321 | } |
322 | |
323 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
324 | inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon64>) noexcept |
325 | { |
326 | return vceqq_u64(lhs, rhs); |
327 | } |
328 | |
329 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
330 | inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon64>) noexcept |
331 | { |
332 | return vceqq_u64(lhs, rhs); |
333 | } |
334 | |
335 | template <class A> |
336 | inline batch_bool<double, A> eq(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept |
337 | { |
338 | return vceqq_u64(lhs, rhs); |
339 | } |
340 | |
341 | /************* |
342 | * fast_cast * |
343 | *************/ |
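        // A64 provides direct conversions between 64-bit integer and double
        // lanes (scvtf/ucvtf and fcvtzs/fcvtzu), which 32-bit NEON lacks.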
344 | namespace detail |
345 | { |
346 | template <class A> |
347 | inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<neon64>) noexcept |
348 | { |
349 | return vcvtq_f64_s64(x); |
350 | } |
351 | |
352 | template <class A> |
353 | inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<neon64>) noexcept |
354 | { |
355 | return vcvtq_f64_u64(x); |
356 | } |
357 | |
358 | template <class A> |
359 | inline batch<int64_t, A> fast_cast(batch<double, A> const& x, batch<int64_t, A> const&, requires_arch<neon64>) noexcept |
360 | { |
361 | return vcvtq_s64_f64(x); |
362 | } |
363 | |
364 | template <class A> |
365 | inline batch<uint64_t, A> fast_cast(batch<double, A> const& x, batch<uint64_t, A> const&, requires_arch<neon64>) noexcept |
366 | { |
367 | return vcvtq_u64_f64(x); |
368 | } |
369 | |
370 | } |
371 | |
372 | /****** |
373 | * lt * |
374 | ******/ |
375 | |
376 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
377 | inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
378 | { |
379 | return vcltq_u64(lhs, rhs); |
380 | } |
381 | |
382 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
383 | inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
384 | { |
385 | return vcltq_s64(lhs, rhs); |
386 | } |
387 | |
388 | template <class A> |
389 | inline batch_bool<double, A> lt(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
390 | { |
391 | return vcltq_f64(lhs, rhs); |
392 | } |
393 | |
394 | /****** |
395 | * le * |
396 | ******/ |
397 | |
398 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
399 | inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
400 | { |
401 | return vcleq_u64(lhs, rhs); |
402 | } |
403 | |
404 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
405 | inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
406 | { |
407 | return vcleq_s64(lhs, rhs); |
408 | } |
409 | |
410 | template <class A> |
411 | inline batch_bool<double, A> le(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
412 | { |
413 | return vcleq_f64(lhs, rhs); |
414 | } |
415 | |
416 | /****** |
417 | * gt * |
418 | ******/ |
419 | |
420 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
421 | inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
422 | { |
423 | return vcgtq_u64(lhs, rhs); |
424 | } |
425 | |
426 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
427 | inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
428 | { |
429 | return vcgtq_s64(lhs, rhs); |
430 | } |
431 | |
432 | template <class A> |
433 | inline batch_bool<double, A> gt(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
434 | { |
435 | return vcgtq_f64(lhs, rhs); |
436 | } |
437 | |
438 | /****** |
439 | * ge * |
440 | ******/ |
441 | |
442 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
443 | inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
444 | { |
445 | return vcgeq_u64(lhs, rhs); |
446 | } |
447 | |
448 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
449 | inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
450 | { |
451 | return vcgeq_s64(lhs, rhs); |
452 | } |
453 | |
454 | template <class A> |
455 | inline batch_bool<double, A> ge(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
456 | { |
457 | return vcgeq_f64(lhs, rhs); |
458 | } |
459 | |
460 | /******************* |
461 | * batch_bool_cast * |
462 | *******************/ |
463 | |
464 | template <class A, class T_out, class T_in> |
465 | inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon64>) noexcept |
466 | { |
467 | using register_type = typename batch_bool<T_out, A>::register_type; |
468 | return register_type(self); |
469 | } |
470 | |
471 | /*************** |
472 | * bitwise_and * |
473 | ***************/ |
474 | |
475 | template <class A> |
476 | inline batch<double, A> bitwise_and(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
477 | { |
478 | return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(lhs), |
479 | vreinterpretq_u64_f64(rhs))); |
480 | } |
481 | |
482 | template <class A> |
483 | inline batch_bool<double, A> bitwise_and(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept |
484 | { |
485 | return vandq_u64(lhs, rhs); |
486 | } |
487 | |
488 | /************** |
489 | * bitwise_or * |
490 | **************/ |
491 | |
492 | template <class A> |
493 | inline batch<double, A> bitwise_or(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
494 | { |
495 | return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(lhs), |
496 | vreinterpretq_u64_f64(rhs))); |
497 | } |
498 | |
499 | template <class A> |
500 | inline batch_bool<double, A> bitwise_or(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept |
501 | { |
502 | return vorrq_u64(lhs, rhs); |
503 | } |
504 | |
505 | /*************** |
506 | * bitwise_xor * |
507 | ***************/ |
508 | |
509 | template <class A> |
510 | inline batch<double, A> bitwise_xor(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
511 | { |
512 | return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(lhs), |
513 | vreinterpretq_u64_f64(rhs))); |
514 | } |
515 | |
516 | template <class A> |
517 | inline batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept |
518 | { |
519 | return veorq_u64(lhs, rhs); |
520 | } |
521 | |
522 | /******* |
523 | * neq * |
524 | *******/ |
525 | |
526 | template <class A> |
527 | inline batch_bool<double, A> neq(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept |
528 | { |
529 | return bitwise_xor(lhs, rhs, A {}); |
530 | } |
531 | |
532 | /*************** |
533 | * bitwise_not * |
534 | ***************/ |
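        // There is no vmvnq_u64, so the 64-bit NOT is performed on the
        // uint32x4_t view of the register.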
535 | |
536 | template <class A> |
537 | inline batch<double, A> bitwise_not(batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
538 | { |
539 | return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_f64(rhs))); |
540 | } |
541 | |
542 | template <class A> |
543 | inline batch_bool<double, A> bitwise_not(batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept |
544 | { |
            return detail::bitwise_not_u64(rhs);
546 | } |
547 | |
548 | /****************** |
549 | * bitwise_andnot * |
550 | ******************/ |
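        // vbicq_u64(a, b) computes a & ~b, i.e. lhs with the bits of rhs cleared.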
551 | |
552 | template <class A> |
553 | inline batch<double, A> bitwise_andnot(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
554 | { |
555 | return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(lhs), |
556 | vreinterpretq_u64_f64(rhs))); |
557 | } |
558 | |
559 | template <class A> |
560 | inline batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept |
561 | { |
562 | return vbicq_u64(lhs, rhs); |
563 | } |
564 | |
565 | /******* |
566 | * min * |
567 | *******/ |
568 | |
569 | template <class A> |
570 | inline batch<double, A> min(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
571 | { |
572 | return vminq_f64(lhs, rhs); |
573 | } |
574 | |
575 | /******* |
576 | * max * |
577 | *******/ |
578 | |
579 | template <class A> |
580 | inline batch<double, A> max(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
581 | { |
582 | return vmaxq_f64(lhs, rhs); |
583 | } |
584 | |
585 | /******* |
586 | * abs * |
587 | *******/ |
588 | |
589 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
590 | inline batch<T, A> abs(batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
591 | { |
592 | return rhs; |
593 | } |
594 | |
595 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
596 | inline batch<T, A> abs(batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
597 | { |
598 | return vabsq_s64(rhs); |
599 | } |
600 | |
601 | template <class A> |
602 | inline batch<double, A> abs(batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
603 | { |
604 | return vabsq_f64(rhs); |
605 | } |
606 | |
607 | template <class A> |
608 | inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self, |
609 | requires_arch<neon64>) noexcept |
610 | { |
611 | return vcvtnq_s32_f32(self); |
612 | } |
613 | |
614 | #if !defined(__GNUC__) |
615 | template <class A> |
616 | inline batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self, |
617 | requires_arch<neon64>) noexcept |
618 | { |
619 | return vcvtnq_s64_f64(self); |
620 | } |
621 | #endif |
622 | |
623 | /************** |
624 | * reciprocal * |
625 | **************/ |
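        // vrecpeq_f64 returns only a low-precision estimate; callers needing a
        // full-precision reciprocal are expected to refine it (e.g. with
        // vrecpsq_f64 Newton-Raphson steps) or simply divide.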
626 | |
627 | template <class A> |
628 | inline batch<double, A> |
629 | reciprocal(const batch<double, A>& x, |
630 | kernel::requires_arch<neon64>) noexcept |
631 | { |
632 | return vrecpeq_f64(x); |
633 | } |
634 | |
635 | /******** |
636 | * rsqrt * |
637 | ********/ |
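        // Likewise an estimate only: vrsqrteq_f64 gives an approximate 1/sqrt(x)
        // that can be refined with vrsqrtsq_f64 iterations if needed.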
638 | |
639 | template <class A> |
640 | inline batch<double, A> rsqrt(batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
641 | { |
642 | return vrsqrteq_f64(rhs); |
643 | } |
644 | |
645 | /******** |
646 | * sqrt * |
647 | ********/ |
648 | |
649 | template <class A> |
650 | inline batch<double, A> sqrt(batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
651 | { |
652 | return vsqrtq_f64(rhs); |
653 | } |
654 | |
655 | /******************** |
656 | * Fused operations * |
657 | ********************/ |
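        // vfmaq_f64(acc, x, y) computes acc + x * y, so fma maps directly and
        // fms(x, y, z) = x * y - z is expressed as fma with a negated accumulator.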
658 | |
659 | #ifdef __ARM_FEATURE_FMA |
660 | template <class A> |
661 | inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<neon64>) noexcept |
662 | { |
663 | return vfmaq_f64(z, x, y); |
664 | } |
665 | |
666 | template <class A> |
667 | inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<neon64>) noexcept |
668 | { |
669 | return vfmaq_f64(-z, x, y); |
670 | } |
671 | #endif |
672 | |
673 | /********* |
674 | * haddp * |
675 | *********/ |
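        // vpaddq_f64 adds adjacent pairs: the result is
        // { row[0][0] + row[0][1], row[1][0] + row[1][1] }.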
676 | |
677 | template <class A> |
678 | inline batch<double, A> haddp(const batch<double, A>* row, requires_arch<neon64>) noexcept |
679 | { |
680 | return vpaddq_f64(row[0], row[1]); |
681 | } |
682 | |
683 | /********** |
684 | * insert * |
685 | **********/ |
686 | |
687 | template <class A, size_t I> |
688 | inline batch<double, A> insert(batch<double, A> const& self, double val, index<I>, requires_arch<neon64>) noexcept |
689 | { |
690 | return vsetq_lane_f64(val, self, I); |
691 | } |
692 | |
693 | /****************** |
694 | * reducer macros * |
695 | ******************/ |
696 | |
697 | // Wrap reducer intrinsics so we can pass them as function pointers |
        // - OP: intrinsics name prefix, e.g., vaddvq
699 | |
700 | #define WRAP_REDUCER_INT_EXCLUDING_64(OP) \ |
701 | namespace wrap \ |
702 | { \ |
703 | inline uint8_t OP##_u8(uint8x16_t a) noexcept \ |
704 | { \ |
705 | return ::OP##_u8(a); \ |
706 | } \ |
707 | inline int8_t OP##_s8(int8x16_t a) noexcept \ |
708 | { \ |
709 | return ::OP##_s8(a); \ |
710 | } \ |
711 | inline uint16_t OP##_u16(uint16x8_t a) noexcept \ |
712 | { \ |
713 | return ::OP##_u16(a); \ |
714 | } \ |
715 | inline int16_t OP##_s16(int16x8_t a) noexcept \ |
716 | { \ |
717 | return ::OP##_s16(a); \ |
718 | } \ |
719 | inline uint32_t OP##_u32(uint32x4_t a) noexcept \ |
720 | { \ |
721 | return ::OP##_u32(a); \ |
722 | } \ |
723 | inline int32_t OP##_s32(int32x4_t a) noexcept \ |
724 | { \ |
725 | return ::OP##_s32(a); \ |
726 | } \ |
727 | } |
728 | |
729 | #define WRAP_REDUCER_INT(OP) \ |
730 | WRAP_REDUCER_INT_EXCLUDING_64(OP) \ |
731 | namespace wrap \ |
732 | { \ |
733 | inline uint64_t OP##_u64(uint64x2_t a) noexcept \ |
734 | { \ |
735 | return ::OP##_u64(a); \ |
736 | } \ |
737 | inline int64_t OP##_s64(int64x2_t a) noexcept \ |
738 | { \ |
739 | return ::OP##_s64(a); \ |
740 | } \ |
741 | } |
742 | |
743 | #define WRAP_REDUCER_FLOAT(OP) \ |
744 | namespace wrap \ |
745 | { \ |
746 | inline float OP##_f32(float32x4_t a) noexcept \ |
747 | { \ |
748 | return ::OP##_f32(a); \ |
749 | } \ |
750 | inline double OP##_f64(float64x2_t a) noexcept \ |
751 | { \ |
752 | return ::OP##_f64(a); \ |
753 | } \ |
754 | } |
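        // For instance, WRAP_REDUCER_FLOAT(vaddvq) defines wrap::vaddvq_f32 and
        // wrap::vaddvq_f64, each forwarding to the corresponding intrinsic; the
        // wrappers give the intrinsics an addressable function that can be
        // stored in the dispatch tuples below.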
755 | |
756 | namespace detail |
757 | { |
758 | template <class R> |
759 | struct reducer_return_type_impl; |
760 | |
761 | template <> |
762 | struct reducer_return_type_impl<uint8x16_t> |
763 | { |
764 | using type = uint8_t; |
765 | }; |
766 | |
767 | template <> |
768 | struct reducer_return_type_impl<int8x16_t> |
769 | { |
770 | using type = int8_t; |
771 | }; |
772 | |
773 | template <> |
774 | struct reducer_return_type_impl<uint16x8_t> |
775 | { |
776 | using type = uint16_t; |
777 | }; |
778 | |
779 | template <> |
780 | struct reducer_return_type_impl<int16x8_t> |
781 | { |
782 | using type = int16_t; |
783 | }; |
784 | |
785 | template <> |
786 | struct reducer_return_type_impl<uint32x4_t> |
787 | { |
788 | using type = uint32_t; |
789 | }; |
790 | |
791 | template <> |
792 | struct reducer_return_type_impl<int32x4_t> |
793 | { |
794 | using type = int32_t; |
795 | }; |
796 | |
797 | template <> |
798 | struct reducer_return_type_impl<uint64x2_t> |
799 | { |
800 | using type = uint64_t; |
801 | }; |
802 | |
803 | template <> |
804 | struct reducer_return_type_impl<int64x2_t> |
805 | { |
806 | using type = int64_t; |
807 | }; |
808 | |
809 | template <> |
810 | struct reducer_return_type_impl<float32x4_t> |
811 | { |
812 | using type = float; |
813 | }; |
814 | |
815 | template <> |
816 | struct reducer_return_type_impl<float64x2_t> |
817 | { |
818 | using type = double; |
819 | }; |
820 | |
821 | template <class R> |
822 | using reducer_return_type = typename reducer_return_type_impl<R>::type; |
823 | |
824 | template <class... T> |
825 | struct neon_reducer_dispatcher_impl : neon_dispatcher_base<reducer_return_type, T...> |
826 | { |
827 | }; |
828 | |
829 | using neon_reducer_dispatcher = neon_reducer_dispatcher_impl<uint8x16_t, int8x16_t, |
830 | uint16x8_t, int16x8_t, |
831 | uint32x4_t, int32x4_t, |
832 | uint64x2_t, int64x2_t, |
833 | float32x4_t, float64x2_t>; |
834 | template <class T> |
835 | using enable_neon64_type_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value, |
836 | int>::type; |
837 | } |
838 | |
839 | /************** |
840 | * reduce_add * |
841 | **************/ |
842 | |
843 | WRAP_REDUCER_INT(vaddvq) |
844 | WRAP_REDUCER_FLOAT(vaddvq) |
845 | |
846 | template <class A, class T, detail::enable_neon64_type_t<T> = 0> |
847 | inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon64>) noexcept |
848 | { |
849 | using register_type = typename batch<T, A>::register_type; |
850 | const detail::neon_reducer_dispatcher::unary dispatcher = { |
                std::make_tuple(wrap::vaddvq_u8, wrap::vaddvq_s8, wrap::vaddvq_u16, wrap::vaddvq_s16,
                                wrap::vaddvq_u32, wrap::vaddvq_s32, wrap::vaddvq_u64, wrap::vaddvq_s64,
                                wrap::vaddvq_f32, wrap::vaddvq_f64)
854 | }; |
855 | return dispatcher.apply(register_type(arg)); |
856 | } |
857 | |
858 | /************** |
859 | * reduce_max * |
860 | **************/ |
861 | |
862 | WRAP_REDUCER_INT_EXCLUDING_64(vmaxvq) |
863 | WRAP_REDUCER_FLOAT(vmaxvq) |
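        // There is no across-lanes max/min intrinsic for 64-bit lanes, so the
        // wrappers below emulate vmaxvq_u64/vmaxvq_s64 (and vminvq_* further
        // down) with a scalar std::max/std::min of the two extracted lanes.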
864 | |
865 | namespace wrap |
866 | { |
867 | inline uint64_t vmaxvq_u64(uint64x2_t a) noexcept |
868 | { |
869 | return std::max(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1)); |
870 | } |
871 | |
872 | inline int64_t vmaxvq_s64(int64x2_t a) noexcept |
873 | { |
874 | return std::max(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1)); |
875 | } |
876 | } |
877 | |
878 | template <class A, class T, detail::enable_neon64_type_t<T> = 0> |
879 | inline typename batch<T, A>::value_type reduce_max(batch<T, A> const& arg, requires_arch<neon64>) noexcept |
880 | { |
881 | using register_type = typename batch<T, A>::register_type; |
882 | const detail::neon_reducer_dispatcher::unary dispatcher = { |
                std::make_tuple(wrap::vmaxvq_u8, wrap::vmaxvq_s8, wrap::vmaxvq_u16, wrap::vmaxvq_s16,
                                wrap::vmaxvq_u32, wrap::vmaxvq_s32, wrap::vmaxvq_u64, wrap::vmaxvq_s64,
                                wrap::vmaxvq_f32, wrap::vmaxvq_f64)
886 | }; |
887 | return dispatcher.apply(register_type(arg)); |
888 | } |
889 | |
890 | /************** |
891 | * reduce_min * |
892 | **************/ |
893 | |
894 | WRAP_REDUCER_INT_EXCLUDING_64(vminvq) |
895 | WRAP_REDUCER_FLOAT(vminvq) |
896 | |
897 | namespace wrap |
898 | { |
899 | inline uint64_t vminvq_u64(uint64x2_t a) noexcept |
900 | { |
901 | return std::min(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1)); |
902 | } |
903 | |
904 | inline int64_t vminvq_s64(int64x2_t a) noexcept |
905 | { |
906 | return std::min(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1)); |
907 | } |
908 | } |
909 | |
910 | template <class A, class T, detail::enable_neon64_type_t<T> = 0> |
911 | inline typename batch<T, A>::value_type reduce_min(batch<T, A> const& arg, requires_arch<neon64>) noexcept |
912 | { |
913 | using register_type = typename batch<T, A>::register_type; |
914 | const detail::neon_reducer_dispatcher::unary dispatcher = { |
                std::make_tuple(wrap::vminvq_u8, wrap::vminvq_s8, wrap::vminvq_u16, wrap::vminvq_s16,
                                wrap::vminvq_u32, wrap::vminvq_s32, wrap::vminvq_u64, wrap::vminvq_s64,
                                wrap::vminvq_f32, wrap::vminvq_f64)
918 | }; |
919 | return dispatcher.apply(register_type(arg)); |
920 | } |
921 | |
922 | #undef WRAP_REDUCER_INT_EXCLUDING_64 |
923 | #undef WRAP_REDUCER_INT |
924 | #undef WRAP_REDUCER_FLOAT |
925 | |
926 | /********** |
927 | * select * |
928 | **********/ |
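        // vbslq_f64 is a bitwise select: result bits come from a where the
        // condition mask bits are set and from b elsewhere. Since batch_bool
        // lanes are all ones or all zeros, this selects whole lanes.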
929 | |
930 | template <class A> |
931 | inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& a, batch<double, A> const& b, requires_arch<neon64>) noexcept |
932 | { |
933 | return vbslq_f64(cond, a, b); |
934 | } |
935 | |
936 | template <class A, bool... b> |
937 | inline batch<double, A> select(batch_bool_constant<batch<double, A>, b...> const&, |
938 | batch<double, A> const& true_br, |
939 | batch<double, A> const& false_br, |
940 | requires_arch<neon64>) noexcept |
941 | { |
942 | return select(batch_bool<double, A> { b... }, true_br, false_br, neon64 {}); |
943 | } |
944 | /********** |
945 | * zip_lo * |
946 | **********/ |
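        // With two-lane vectors, zip_lo interleaves the low halves and returns
        // { lhs[0], rhs[0] }; zip_hi below returns { lhs[1], rhs[1] }.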
947 | |
948 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
949 | inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
950 | { |
951 | return vzip1q_u64(lhs, rhs); |
952 | } |
953 | |
954 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
955 | inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
956 | { |
957 | return vzip1q_s64(lhs, rhs); |
958 | } |
959 | |
960 | template <class A> |
961 | inline batch<double, A> zip_lo(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
962 | { |
963 | return vzip1q_f64(lhs, rhs); |
964 | } |
965 | |
966 | /********** |
967 | * zip_hi * |
968 | **********/ |
969 | |
970 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
971 | inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
972 | { |
973 | return vzip2q_u64(lhs, rhs); |
974 | } |
975 | |
976 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
977 | inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
978 | { |
979 | return vzip2q_s64(lhs, rhs); |
980 | } |
981 | |
982 | template <class A> |
983 | inline batch<double, A> zip_hi(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept |
984 | { |
985 | return vzip2q_f64(lhs, rhs); |
986 | } |
987 | |
988 | /**************** |
989 | * extract_pair * |
990 | ****************/ |
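        // extract_pair(lhs, rhs, n) returns vextq_f64(rhs, lhs, n), i.e. the
        // two-lane window starting at lane n of the concatenation of rhs and
        // lhs. vextq requires an immediate, so the helper below matches the
        // runtime n against each compile-time index of an index_sequence.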
991 | |
992 | namespace detail |
993 | { |
994 | template <class A, size_t I, size_t... Is> |
            inline batch<double, A> extract_pair(batch<double, A> const& lhs, batch<double, A> const& rhs, std::size_t n,
996 | ::xsimd::detail::index_sequence<I, Is...>) noexcept |
997 | { |
998 | if (n == I) |
999 | { |
1000 | return vextq_f64(rhs, lhs, I); |
1001 | } |
1002 | else |
1003 | { |
1004 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1005 | } |
1006 | } |
1007 | } |
1008 | |
1009 | template <class A> |
        inline batch<double, A> extract_pair(batch<double, A> const& lhs, batch<double, A> const& rhs, std::size_t n, requires_arch<neon64>) noexcept
1011 | { |
1012 | constexpr std::size_t size = batch<double, A>::size; |
            assert(n < size && "index in bounds");
1014 | return detail::extract_pair(lhs, rhs, n, ::xsimd::detail::make_index_sequence<size>()); |
1015 | } |
1016 | |
1017 | /****************** |
1018 | * bitwise_rshift * |
1019 | ******************/ |
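        // NEON has no variable right-shift instruction; shifting left (vshlq) by
        // a negated count performs the right shift, logical for unsigned lanes
        // and arithmetic for signed ones.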
1020 | |
1021 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
1022 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon64>) noexcept |
1023 | { |
1024 | return bitwise_rshift<A>(lhs, n, neon {}); |
1025 | } |
1026 | |
1027 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
1028 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon64>) noexcept |
1029 | { |
1030 | return vshlq_u64(lhs, vnegq_s64(rhs)); |
1031 | } |
1032 | |
1033 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
1034 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon64>) noexcept |
1035 | { |
1036 | return bitwise_rshift<A>(lhs, n, neon {}); |
1037 | } |
1038 | |
1039 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
1040 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept |
1041 | { |
1042 | return vshlq_s64(lhs, vnegq_s64(rhs)); |
1043 | } |
1044 | |
1045 | /**************** |
1046 | * bitwise_cast * |
1047 | ****************/ |
1048 | |
1049 | #define WRAP_CAST(SUFFIX, TYPE) \ |
1050 | namespace wrap \ |
1051 | { \ |
1052 | inline float64x2_t vreinterpretq_f64_##SUFFIX(TYPE a) noexcept \ |
1053 | { \ |
1054 | return ::vreinterpretq_f64_##SUFFIX(a); \ |
1055 | } \ |
1056 | inline TYPE vreinterpretq_##SUFFIX##_f64(float64x2_t a) noexcept \ |
1057 | { \ |
1058 | return ::vreinterpretq_##SUFFIX##_f64(a); \ |
1059 | } \ |
1060 | } |
1061 | |
1062 | WRAP_CAST(u8, uint8x16_t) |
1063 | WRAP_CAST(s8, int8x16_t) |
1064 | WRAP_CAST(u16, uint16x8_t) |
1065 | WRAP_CAST(s16, int16x8_t) |
1066 | WRAP_CAST(u32, uint32x4_t) |
1067 | WRAP_CAST(s32, int32x4_t) |
1068 | WRAP_CAST(u64, uint64x2_t) |
1069 | WRAP_CAST(s64, int64x2_t) |
1070 | WRAP_CAST(f32, float32x4_t) |
1071 | |
1072 | #undef WRAP_CAST |
1073 | |
1074 | template <class A, class T> |
1075 | inline batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept |
1076 | { |
1077 | using caster_type = detail::bitwise_caster_impl<float64x2_t, |
1078 | uint8x16_t, int8x16_t, |
1079 | uint16x8_t, int16x8_t, |
1080 | uint32x4_t, int32x4_t, |
1081 | uint64x2_t, int64x2_t, |
1082 | float32x4_t>; |
1083 | const caster_type caster = { |
                std::make_tuple(wrap::vreinterpretq_f64_u8, wrap::vreinterpretq_f64_s8, wrap::vreinterpretq_f64_u16, wrap::vreinterpretq_f64_s16,
                                wrap::vreinterpretq_f64_u32, wrap::vreinterpretq_f64_s32, wrap::vreinterpretq_f64_u64, wrap::vreinterpretq_f64_s64,
                                wrap::vreinterpretq_f64_f32)
1087 | }; |
1088 | using register_type = typename batch<T, A>::register_type; |
1089 | return caster.apply(register_type(arg)); |
1090 | } |
1091 | |
1092 | namespace detail |
1093 | { |
1094 | template <class S, class... R> |
1095 | struct bitwise_caster_neon64 |
1096 | { |
1097 | using container_type = std::tuple<R (*)(S)...>; |
1098 | container_type m_func; |
1099 | |
1100 | template <class V> |
1101 | V apply(float64x2_t rhs) const |
1102 | { |
1103 | using func_type = V (*)(float64x2_t); |
1104 | auto func = xsimd::detail::get<func_type>(m_func); |
1105 | return func(rhs); |
1106 | } |
1107 | }; |
1108 | } |
1109 | |
1110 | template <class A, class R> |
1111 | inline batch<R, A> bitwise_cast(batch<double, A> const& arg, batch<R, A> const&, requires_arch<neon64>) noexcept |
1112 | { |
1113 | using caster_type = detail::bitwise_caster_neon64<float64x2_t, |
1114 | uint8x16_t, int8x16_t, |
1115 | uint16x8_t, int16x8_t, |
1116 | uint32x4_t, int32x4_t, |
1117 | uint64x2_t, int64x2_t, |
1118 | float32x4_t>; |
1119 | const caster_type caster = { |
                std::make_tuple(wrap::vreinterpretq_u8_f64, wrap::vreinterpretq_s8_f64, wrap::vreinterpretq_u16_f64, wrap::vreinterpretq_s16_f64,
                                wrap::vreinterpretq_u32_f64, wrap::vreinterpretq_s32_f64, wrap::vreinterpretq_u64_f64, wrap::vreinterpretq_s64_f64,
                                wrap::vreinterpretq_f32_f64)
1123 | }; |
1124 | using src_register_type = typename batch<double, A>::register_type; |
1125 | using dst_register_type = typename batch<R, A>::register_type; |
1126 | return caster.apply<dst_register_type>(src_register_type(arg)); |
1127 | } |
1128 | |
1129 | template <class A> |
1130 | inline batch<double, A> bitwise_cast(batch<double, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept |
1131 | { |
1132 | return arg; |
1133 | } |
1134 | |
1135 | /********* |
1136 | * isnan * |
1137 | *********/ |
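        // NaN is the only value that compares unequal to itself, so the lanes of
        // !(arg == arg) are set exactly where arg is NaN.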
1138 | |
1139 | template <class A> |
1140 | inline batch_bool<double, A> isnan(batch<double, A> const& arg, requires_arch<neon64>) noexcept |
1141 | { |
1142 | return !(arg == arg); |
1143 | } |
1144 | } |
1145 | |
1146 | template <class batch_type, typename batch_type::value_type... Values> |
1147 | struct batch_constant; |
1148 | |
1149 | namespace kernel |
1150 | { |
1151 | /*********** |
1152 | * swizzle * |
1153 | ***********/ |
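        // All the swizzle overloads below ultimately go through the byte-level
        // table lookup vqtbl1q_u8: burst_index expands each wide-lane index into
        // the indices of its constituent bytes (e.g. a uint16 index j becomes
        // 2*j, 2*j + 1), and the result is reinterpreted back to the original
        // element type.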
1154 | |
1155 | namespace detail |
1156 | { |
1157 | using ::xsimd::batch_constant; |
1158 | using ::xsimd::detail::integer_sequence; |
1159 | using ::xsimd::detail::make_integer_sequence; |
1160 | |
1161 | template <class CB1, class CB2, class IS> |
1162 | struct index_burst_impl; |
1163 | |
1164 | template <class B1, class B2, typename B2::value_type... V, |
1165 | typename B2::value_type... incr> |
1166 | struct index_burst_impl<batch_constant<B1>, batch_constant<B2, V...>, |
1167 | integer_sequence<typename B2::value_type, incr...>> |
1168 | { |
1169 | using type = batch_constant<B2, V...>; |
1170 | }; |
1171 | |
1172 | template <class B1, typename B1::value_type V0, typename B1::value_type... V1, |
1173 | class B2, typename B2::value_type... V2, |
1174 | typename B2::value_type... incr> |
1175 | struct index_burst_impl<batch_constant<B1, V0, V1...>, batch_constant<B2, V2...>, |
1176 | integer_sequence<typename B2::value_type, incr...>> |
1177 | { |
1178 | using value_type = typename B2::value_type; |
1179 | using next_input = batch_constant<B1, V1...>; |
1180 | using next_output = batch_constant<B2, V2..., (V0 + incr)...>; |
1181 | using type = typename index_burst_impl<next_input, next_output, integer_sequence<value_type, incr...>>::type; |
1182 | }; |
1183 | |
1184 | template <class B, class T> |
1185 | struct index_burst; |
1186 | |
1187 | template <class B, typename B::value_type... V, class T> |
1188 | struct index_burst<batch_constant<B, V...>, T> |
1189 | { |
1190 | static constexpr size_t mul = sizeof(typename B::value_type) / sizeof(T); |
1191 | using input = batch_constant<B, (mul * V)...>; |
1192 | using output = batch_constant<batch<T, typename B::arch_type>>; |
1193 | using type = typename index_burst_impl<input, output, make_integer_sequence<T, mul>>::type; |
1194 | }; |
1195 | |
1196 | template <class B, class T> |
1197 | using index_burst_t = typename index_burst<B, T>::type; |
1198 | |
1199 | template <class T, class B> |
1200 | inline index_burst_t<B, T> burst_index(B) |
1201 | { |
1202 | return index_burst_t<B, T>(); |
1203 | } |
1204 | } |
1205 | |
1206 | template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7, |
1207 | uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15> |
1208 | inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, |
1209 | batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx, |
1210 | requires_arch<neon64>) noexcept |
1211 | { |
1212 | return vqtbl1q_u8(self, batch<uint8_t, A>(idx)); |
1213 | } |
1214 | |
1215 | template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7, |
1216 | uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15> |
1217 | inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, |
1218 | batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx, |
1219 | requires_arch<neon64>) noexcept |
1220 | { |
1221 | return vqtbl1q_s8(self, batch<uint8_t, A>(idx)); |
1222 | } |
1223 | |
1224 | template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7> |
1225 | inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, |
1226 | batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> idx, |
1227 | requires_arch<neon64>) noexcept |
1228 | { |
1229 | using batch_type = batch<uint8_t, A>; |
1230 | return vreinterpretq_u16_u8(swizzle<A>(batch_type(vreinterpretq_u8_u16(self)), detail::burst_index<uint8_t>(idx), A())); |
1231 | } |
1232 | |
1233 | template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7> |
1234 | inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, |
1235 | batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> idx, |
1236 | requires_arch<neon64>) noexcept |
1237 | { |
1238 | using batch_type = batch<int8_t, A>; |
1239 | return vreinterpretq_s16_s8(swizzle<A>(batch_type(vreinterpretq_s8_s16(self)), detail::burst_index<uint8_t>(idx), A())); |
1240 | } |
1241 | |
1242 | template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3> |
1243 | inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, |
1244 | batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx, |
1245 | requires_arch<neon64>) noexcept |
1246 | { |
1247 | using batch_type = batch<uint8_t, A>; |
1248 | return vreinterpretq_u32_u8(swizzle<A>(batch_type(vreinterpretq_u8_u32(self)), detail::burst_index<uint8_t>(idx), A())); |
1249 | } |
1250 | |
1251 | template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3> |
1252 | inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, |
1253 | batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx, |
1254 | requires_arch<neon64>) noexcept |
1255 | { |
1256 | using batch_type = batch<int8_t, A>; |
1257 | return vreinterpretq_s32_s8(swizzle<A>(batch_type(vreinterpretq_s8_s32(self)), detail::burst_index<uint8_t>(idx), A())); |
1258 | } |
1259 | |
1260 | template <class A, uint64_t V0, uint64_t V1> |
1261 | inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, |
1262 | batch_constant<batch<uint64_t, A>, V0, V1> idx, |
1263 | requires_arch<neon64>) noexcept |
1264 | { |
1265 | using batch_type = batch<uint8_t, A>; |
1266 | return vreinterpretq_u64_u8(swizzle<A>(batch_type(vreinterpretq_u8_u64(self)), detail::burst_index<uint8_t>(idx), A())); |
1267 | } |
1268 | |
1269 | template <class A, uint64_t V0, uint64_t V1> |
1270 | inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, |
1271 | batch_constant<batch<uint64_t, A>, V0, V1> idx, |
1272 | requires_arch<neon64>) noexcept |
1273 | { |
1274 | using batch_type = batch<int8_t, A>; |
1275 | return vreinterpretq_s64_s8(swizzle<A>(batch_type(vreinterpretq_s8_s64(self)), detail::burst_index<uint8_t>(idx), A())); |
1276 | } |
1277 | |
1278 | template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3> |
1279 | inline batch<float, A> swizzle(batch<float, A> const& self, |
1280 | batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx, |
1281 | requires_arch<neon64>) noexcept |
1282 | { |
1283 | using batch_type = batch<uint8_t, A>; |
1284 | return vreinterpretq_f32_u8(swizzle<A>(batch_type(vreinterpretq_u8_f32(self)), detail::burst_index<uint8_t>(idx), A())); |
1285 | } |
1286 | |
1287 | template <class A, uint64_t V0, uint64_t V1> |
1288 | inline batch<double, A> swizzle(batch<double, A> const& self, |
1289 | batch_constant<batch<uint64_t, A>, V0, V1> idx, |
1290 | requires_arch<neon64>) noexcept |
1291 | { |
1292 | using batch_type = batch<uint8_t, A>; |
1293 | return vreinterpretq_f64_u8(swizzle<A>(batch_type(vreinterpretq_u8_f64(self)), detail::burst_index<uint8_t>(idx), A())); |
1294 | } |
1295 | |
1296 | template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3> |
1297 | inline batch<std::complex<float>, A> swizzle(batch<std::complex<float>, A> const& self, |
1298 | batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx, |
1299 | requires_arch<neon64>) noexcept |
1300 | { |
1301 | return batch<std::complex<float>>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A())); |
1302 | } |
1303 | |
1304 | template <class A, uint64_t V0, uint64_t V1> |
1305 | inline batch<std::complex<double>, A> swizzle(batch<std::complex<double>, A> const& self, |
1306 | batch_constant<batch<uint64_t, A>, V0, V1> idx, |
1307 | requires_arch<neon64>) noexcept |
1308 | { |
1309 | return batch<std::complex<double>>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A())); |
1310 | } |
1311 | } |
1312 | } |
1313 | |
1314 | #endif |
1315 | |