1 | /*************************************************************************** |
2 | * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * |
3 | * Martin Renou * |
4 | * Copyright (c) QuantStack * |
5 | * Copyright (c) Serge Guelton * |
6 | * * |
7 | * Distributed under the terms of the BSD 3-Clause License. * |
8 | * * |
9 | * The full license is in the file LICENSE, distributed with this software. * |
10 | ****************************************************************************/ |
11 | |
12 | #ifndef XSIMD_NEON_HPP |
13 | #define XSIMD_NEON_HPP |
14 | |
15 | #include <algorithm> |
16 | #include <complex> |
17 | #include <tuple> |
18 | #include <type_traits> |
19 | |
20 | #include "../types/xsimd_neon_register.hpp" |
21 | #include "../types/xsimd_utils.hpp" |
22 | |
23 | // Wrap intrinsics so we can pass them as function pointers |
24 | // - OP: intrinsics name prefix, e.g., vorrq |
25 | // - RT: type traits to deduce intrinsics return types |
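// For instance, WRAP_BINARY_INT(vaddq, detail::identity_return_type) (used for add()
// below) generates wrap::vaddq_u8(a, b) { return ::vaddq_u8(a, b); } and the analogous
// _s8/_u16/_s16/_u32/_s32/_u64/_s64 overloads. On some compilers the raw NEON intrinsics
// cannot have their address taken, while these thin wrappers can be stored in the
// std::tuple of function pointers used by the dispatchers further down.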
26 | #define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ |
27 | namespace wrap \ |
28 | { \ |
29 | inline RT<uint8x16_t> OP##_u8(uint8x16_t a, uint8x16_t b) noexcept \ |
30 | { \ |
31 | return ::OP##_u8(a, b); \ |
32 | } \ |
33 | inline RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept \ |
34 | { \ |
35 | return ::OP##_s8(a, b); \ |
36 | } \ |
37 | inline RT<uint16x8_t> OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \ |
38 | { \ |
39 | return ::OP##_u16(a, b); \ |
40 | } \ |
41 | inline RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \ |
42 | { \ |
43 | return ::OP##_s16(a, b); \ |
44 | } \ |
45 | inline RT<uint32x4_t> OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \ |
46 | { \ |
47 | return ::OP##_u32(a, b); \ |
48 | } \ |
49 | inline RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \ |
50 | { \ |
51 | return ::OP##_s32(a, b); \ |
52 | } \ |
53 | } |
54 | |
55 | #define WRAP_BINARY_INT(OP, RT) \ |
56 | WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ |
57 | namespace wrap \ |
58 | { \ |
59 | inline RT<uint64x2_t> OP##_u64(uint64x2_t a, uint64x2_t b) noexcept \ |
60 | { \ |
61 | return ::OP##_u64(a, b); \ |
62 | } \ |
63 | inline RT<int64x2_t> OP##_s64(int64x2_t a, int64x2_t b) noexcept \ |
64 | { \ |
65 | return ::OP##_s64(a, b); \ |
66 | } \ |
67 | } |
68 | |
69 | #define WRAP_BINARY_FLOAT(OP, RT) \ |
70 | namespace wrap \ |
71 | { \ |
72 | inline RT<float32x4_t> OP##_f32(float32x4_t a, float32x4_t b) noexcept \ |
73 | { \ |
74 | return ::OP##_f32(a, b); \ |
75 | } \ |
76 | } |
77 | |
78 | #define WRAP_UNARY_INT_EXCLUDING_64(OP) \ |
79 | namespace wrap \ |
80 | { \ |
81 | inline uint8x16_t OP##_u8(uint8x16_t a) noexcept \ |
82 | { \ |
83 | return ::OP##_u8(a); \ |
84 | } \ |
85 | inline int8x16_t OP##_s8(int8x16_t a) noexcept \ |
86 | { \ |
87 | return ::OP##_s8(a); \ |
88 | } \ |
89 | inline uint16x8_t OP##_u16(uint16x8_t a) noexcept \ |
90 | { \ |
91 | return ::OP##_u16(a); \ |
92 | } \ |
93 | inline int16x8_t OP##_s16(int16x8_t a) noexcept \ |
94 | { \ |
95 | return ::OP##_s16(a); \ |
96 | } \ |
97 | inline uint32x4_t OP##_u32(uint32x4_t a) noexcept \ |
98 | { \ |
99 | return ::OP##_u32(a); \ |
100 | } \ |
101 | inline int32x4_t OP##_s32(int32x4_t a) noexcept \ |
102 | { \ |
103 | return ::OP##_s32(a); \ |
104 | } \ |
105 | } |
106 | |
107 | #define WRAP_UNARY_INT(OP) \ |
108 | WRAP_UNARY_INT_EXCLUDING_64(OP) \ |
109 | namespace wrap \ |
110 | { \ |
111 | inline uint64x2_t OP##_u64(uint64x2_t a) noexcept \ |
112 | { \ |
113 | return ::OP##_u64(a); \ |
114 | } \ |
115 | inline int64x2_t OP##_s64(int64x2_t a) noexcept \ |
116 | { \ |
117 | return ::OP##_s64(a); \ |
118 | } \ |
119 | } |
120 | |
121 | #define WRAP_UNARY_FLOAT(OP) \ |
122 | namespace wrap \ |
123 | { \ |
124 | inline float32x4_t OP##_f32(float32x4_t a) noexcept \ |
125 | { \ |
126 | return ::OP##_f32(a); \ |
127 | } \ |
128 | } |
129 | |
130 | // Dummy identity caster to ease coding |
131 | inline uint8x16_t vreinterpretq_u8_u8(uint8x16_t arg) noexcept { return arg; } |
132 | inline int8x16_t vreinterpretq_s8_s8(int8x16_t arg) noexcept { return arg; } |
133 | inline uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) noexcept { return arg; } |
134 | inline int16x8_t vreinterpretq_s16_s16(int16x8_t arg) noexcept { return arg; } |
135 | inline uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) noexcept { return arg; } |
136 | inline int32x4_t vreinterpretq_s32_s32(int32x4_t arg) noexcept { return arg; } |
137 | inline uint64x2_t vreinterpretq_u64_u64(uint64x2_t arg) noexcept { return arg; } |
138 | inline int64x2_t vreinterpretq_s64_s64(int64x2_t arg) noexcept { return arg; } |
139 | inline float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg; } |
140 | |
141 | namespace xsimd |
142 | { |
143 | template <class batch_type, bool... Values> |
144 | struct batch_bool_constant; |
145 | |
146 | namespace kernel |
147 | { |
148 | using namespace types; |
149 | |
150 | namespace detail |
151 | { |
152 | template <template <class> class return_type, class... T> |
153 | struct neon_dispatcher_base |
154 | { |
155 | struct unary |
156 | { |
157 | using container_type = std::tuple<return_type<T> (*)(T)...>; |
158 | const container_type m_func; |
159 | |
160 | template <class U> |
161 | return_type<U> apply(U rhs) const noexcept |
162 | { |
163 | using func_type = return_type<U> (*)(U); |
164 | auto func = xsimd::detail::get<func_type>(m_func); |
165 | return func(rhs); |
166 | } |
167 | }; |
168 | |
169 | struct binary |
170 | { |
171 | using container_type = std::tuple<return_type<T> (*)(T, T)...>; |
172 | const container_type m_func; |
173 | |
174 | template <class U> |
175 | return_type<U> apply(U lhs, U rhs) const noexcept |
176 | { |
177 | using func_type = return_type<U> (*)(U, U); |
178 | auto func = xsimd::detail::get<func_type>(m_func); |
179 | return func(lhs, rhs); |
180 | } |
181 | }; |
182 | }; |
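// Each dispatcher is initialized with one wrapper per register type (see e.g. add()
// below, where m_func receives wrap::vaddq_u8 ... wrap::vaddq_f32); apply() then uses
// xsimd::detail::get to pull out the tuple element whose function-pointer type matches
// the argument register type U, so the proper intrinsic is selected purely from the
// operand type.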
183 | |
184 | /*************************** |
185 | * arithmetic dispatchers * |
186 | ***************************/ |
187 | |
188 | template <class T> |
189 | using identity_return_type = T; |
190 | |
191 | template <class... T> |
192 | struct neon_dispatcher_impl : neon_dispatcher_base<identity_return_type, T...> |
193 | { |
194 | }; |
195 | |
196 | using neon_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t, |
197 | uint16x8_t, int16x8_t, |
198 | uint32x4_t, int32x4_t, |
199 | uint64x2_t, int64x2_t, |
200 | float32x4_t>; |
201 | |
202 | using excluding_int64_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t, |
203 | uint16x8_t, int16x8_t, |
204 | uint32x4_t, int32x4_t, |
205 | float32x4_t>; |
206 | |
207 | /************************** |
208 | * comparison dispatchers * |
209 | **************************/ |
210 | |
211 | template <class T> |
212 | struct comp_return_type_impl; |
213 | |
214 | template <> |
215 | struct comp_return_type_impl<uint8x16_t> |
216 | { |
217 | using type = uint8x16_t; |
218 | }; |
219 | |
220 | template <> |
221 | struct comp_return_type_impl<int8x16_t> |
222 | { |
223 | using type = uint8x16_t; |
224 | }; |
225 | |
226 | template <> |
227 | struct comp_return_type_impl<uint16x8_t> |
228 | { |
229 | using type = uint16x8_t; |
230 | }; |
231 | |
232 | template <> |
233 | struct comp_return_type_impl<int16x8_t> |
234 | { |
235 | using type = uint16x8_t; |
236 | }; |
237 | |
238 | template <> |
239 | struct comp_return_type_impl<uint32x4_t> |
240 | { |
241 | using type = uint32x4_t; |
242 | }; |
243 | |
244 | template <> |
245 | struct comp_return_type_impl<int32x4_t> |
246 | { |
247 | using type = uint32x4_t; |
248 | }; |
249 | |
250 | template <> |
251 | struct comp_return_type_impl<uint64x2_t> |
252 | { |
253 | using type = uint64x2_t; |
254 | }; |
255 | |
256 | template <> |
257 | struct comp_return_type_impl<int64x2_t> |
258 | { |
259 | using type = uint64x2_t; |
260 | }; |
261 | |
262 | template <> |
263 | struct comp_return_type_impl<float32x4_t> |
264 | { |
265 | using type = uint32x4_t; |
266 | }; |
267 | |
268 | template <class T> |
269 | using comp_return_type = typename comp_return_type_impl<T>::type; |
270 | |
271 | template <class... T> |
272 | struct neon_comp_dispatcher_impl : neon_dispatcher_base<comp_return_type, T...> |
273 | { |
274 | }; |
275 | |
276 | using excluding_int64_comp_dispatcher = neon_comp_dispatcher_impl<uint8x16_t, int8x16_t, |
277 | uint16x8_t, int16x8_t, |
278 | uint32x4_t, int32x4_t, |
279 | float32x4_t>; |
280 | |
281 | /************************************** |
282 | * enabling / disabling metafunctions * |
283 | **************************************/ |
284 | |
285 | template <class T> |
286 | using enable_neon_type_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value, |
287 | int>::type; |
288 | |
289 | template <class T> |
290 | using exclude_int64_neon_t |
291 | = typename std::enable_if<(std::is_integral<T>::value && sizeof(T) != 8) || std::is_same<T, float>::value, int>::type; |
292 | } |
293 | |
294 | /************* |
295 | * broadcast * |
296 | *************/ |
297 | |
298 | template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> |
299 | inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept |
300 | { |
return vdupq_n_u8(uint8_t(val));
302 | } |
303 | |
304 | template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> |
305 | inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept |
306 | { |
return vdupq_n_s8(int8_t(val));
308 | } |
309 | |
310 | template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> |
311 | inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept |
312 | { |
return vdupq_n_u16(uint16_t(val));
314 | } |
315 | |
316 | template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> |
317 | inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept |
318 | { |
return vdupq_n_s16(int16_t(val));
320 | } |
321 | |
322 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
323 | inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept |
324 | { |
return vdupq_n_u32(uint32_t(val));
326 | } |
327 | |
328 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
329 | inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept |
330 | { |
return vdupq_n_s32(int32_t(val));
332 | } |
333 | |
334 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
335 | inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept |
336 | { |
return vdupq_n_u64(uint64_t(val));
338 | } |
339 | |
340 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
341 | inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept |
342 | { |
return vdupq_n_s64(int64_t(val));
344 | } |
345 | |
346 | template <class A> |
347 | inline batch<float, A> broadcast(float val, requires_arch<neon>) noexcept |
348 | { |
return vdupq_n_f32(val);
350 | } |
351 | |
352 | /******* |
353 | * set * |
354 | *******/ |
355 | |
356 | template <class A, class T, class... Args, detail::enable_integral_t<T> = 0> |
357 | inline batch<T, A> set(batch<T, A> const&, requires_arch<neon>, Args... args) noexcept |
358 | { |
359 | return xsimd::types::detail::neon_vector_type<T> { args... }; |
360 | } |
361 | |
362 | template <class A, class T, class... Args, detail::enable_integral_t<T> = 0> |
363 | inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<neon>, Args... args) noexcept |
364 | { |
365 | using register_type = typename batch_bool<T, A>::register_type; |
366 | using unsigned_type = as_unsigned_integer_t<T>; |
367 | return register_type { static_cast<unsigned_type>(args ? -1LL : 0LL)... }; |
368 | } |
369 | |
370 | template <class A> |
371 | inline batch<float, A> set(batch<float, A> const&, requires_arch<neon>, float f0, float f1, float f2, float f3) noexcept |
372 | { |
373 | return float32x4_t { f0, f1, f2, f3 }; |
374 | } |
375 | |
376 | template <class A> |
377 | inline batch<std::complex<float>, A> set(batch<std::complex<float>, A> const&, requires_arch<neon>, |
378 | std::complex<float> c0, std::complex<float> c1, |
379 | std::complex<float> c2, std::complex<float> c3) noexcept |
380 | { |
381 | return batch<std::complex<float>>(float32x4_t { c0.real(), c1.real(), c2.real(), c3.real() }, |
382 | float32x4_t { c0.imag(), c1.imag(), c2.imag(), c3.imag() }); |
383 | } |
384 | |
385 | template <class A, class... Args> |
386 | inline batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<neon>, Args... args) noexcept |
387 | { |
388 | using register_type = typename batch_bool<float, A>::register_type; |
389 | using unsigned_type = as_unsigned_integer_t<float>; |
390 | return register_type { static_cast<unsigned_type>(args ? -1LL : 0LL)... }; |
391 | } |
392 | |
393 | /************* |
394 | * from_bool * |
395 | *************/ |
396 | |
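// A batch_bool lane is either all ones or all zeros; from_bool masks it with 1
// (or with the bit pattern of 1.0f for float) so true lanes become exactly 1 and
// false lanes become 0.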
397 | template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> |
398 | inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
399 | { |
return vandq_u8(arg, vdupq_n_u8(1));
401 | } |
402 | |
403 | template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> |
404 | inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
405 | { |
return vandq_s8(reinterpret_cast<int8x16_t>(arg.data), vdupq_n_s8(1));
407 | } |
408 | |
409 | template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> |
410 | inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
411 | { |
return vandq_u16(arg, vdupq_n_u16(1));
413 | } |
414 | |
415 | template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> |
416 | inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
417 | { |
return vandq_s16(reinterpret_cast<int16x8_t>(arg.data), vdupq_n_s16(1));
419 | } |
420 | |
421 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
422 | inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
423 | { |
return vandq_u32(arg, vdupq_n_u32(1));
425 | } |
426 | |
427 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
428 | inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
429 | { |
return vandq_s32(reinterpret_cast<int32x4_t>(arg.data), vdupq_n_s32(1));
431 | } |
432 | |
433 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
434 | inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
435 | { |
return vandq_u64(arg, vdupq_n_u64(1));
437 | } |
438 | |
439 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
440 | inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
441 | { |
return vandq_s64(reinterpret_cast<int64x2_t>(arg.data), vdupq_n_s64(1));
443 | } |
444 | |
445 | template <class A> |
446 | inline batch<float, A> from_bool(batch_bool<float, A> const& arg, requires_arch<neon>) noexcept |
447 | { |
return vreinterpretq_f32_u32(vandq_u32(arg, vreinterpretq_u32_f32(vdupq_n_f32(1.f))));
449 | } |
450 | |
451 | /******** |
452 | * load * |
453 | ********/ |
454 | |
455 | template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> |
456 | inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept |
457 | { |
458 | return vld1q_u8((uint8_t*)src); |
459 | } |
460 | |
461 | template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> |
462 | inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept |
463 | { |
464 | return vld1q_s8((int8_t*)src); |
465 | } |
466 | |
467 | template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> |
468 | inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept |
469 | { |
470 | return vld1q_u16((uint16_t*)src); |
471 | } |
472 | template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> |
473 | inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept |
474 | { |
475 | return vld1q_s16((int16_t*)src); |
476 | } |
477 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
478 | inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept |
479 | { |
480 | return vld1q_u32((uint32_t*)src); |
481 | } |
482 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
483 | inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept |
484 | { |
485 | return vld1q_s32((int32_t*)src); |
486 | } |
487 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
488 | inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept |
489 | { |
490 | return vld1q_u64((uint64_t*)src); |
491 | } |
492 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
493 | inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept |
494 | { |
495 | return vld1q_s64((int64_t*)src); |
496 | } |
497 | |
498 | template <class A> |
499 | inline batch<float, A> load_aligned(float const* src, convert<float>, requires_arch<neon>) noexcept |
500 | { |
501 | return vld1q_f32(src); |
502 | } |
503 | |
504 | template <class A, class T> |
505 | inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept |
506 | { |
507 | return load_aligned<A>(src, convert<T>(), A {}); |
508 | } |
509 | |
510 | /********* |
511 | * store * |
512 | *********/ |
513 | |
514 | template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> |
515 | inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept |
516 | { |
517 | vst1q_u8((uint8_t*)dst, src); |
518 | } |
519 | |
520 | template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> |
521 | inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept |
522 | { |
523 | vst1q_s8((int8_t*)dst, src); |
524 | } |
525 | |
526 | template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> |
527 | inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept |
528 | { |
529 | vst1q_u16((uint16_t*)dst, src); |
530 | } |
531 | |
532 | template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> |
533 | inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept |
534 | { |
535 | vst1q_s16((int16_t*)dst, src); |
536 | } |
537 | |
538 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
539 | inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept |
540 | { |
541 | vst1q_u32((uint32_t*)dst, src); |
542 | } |
543 | |
544 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
545 | inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept |
546 | { |
547 | vst1q_s32((int32_t*)dst, src); |
548 | } |
549 | |
550 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
551 | inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept |
552 | { |
553 | vst1q_u64((uint64_t*)dst, src); |
554 | } |
555 | |
556 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
557 | inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept |
558 | { |
559 | vst1q_s64((int64_t*)dst, src); |
560 | } |
561 | |
562 | template <class A> |
563 | inline void store_aligned(float* dst, batch<float, A> const& src, requires_arch<neon>) noexcept |
564 | { |
565 | vst1q_f32(dst, src); |
566 | } |
567 | |
568 | template <class A, class T> |
569 | inline void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept |
570 | { |
571 | store_aligned<A>(dst, src, A {}); |
572 | } |
573 | |
574 | /**************** |
575 | * load_complex * |
576 | ****************/ |
577 | |
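// vld2q_f32 performs a de-interleaving load: from memory laid out as
// (re0, im0, re1, im1, ...) it returns val[0] = real parts and val[1] = imaginary
// parts, which matches the split real/imaginary storage of batch<std::complex<float>>.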
578 | template <class A> |
579 | inline batch<std::complex<float>, A> load_complex_aligned(std::complex<float> const* mem, convert<std::complex<float>>, requires_arch<neon>) noexcept |
580 | { |
581 | using real_batch = batch<float, A>; |
582 | const float* buf = reinterpret_cast<const float*>(mem); |
583 | float32x4x2_t tmp = vld2q_f32(buf); |
584 | real_batch real = tmp.val[0], |
585 | imag = tmp.val[1]; |
586 | return batch<std::complex<float>, A> { real, imag }; |
587 | } |
588 | |
589 | template <class A> |
590 | inline batch<std::complex<float>, A> load_complex_unaligned(std::complex<float> const* mem, convert<std::complex<float>> cvt, requires_arch<neon>) noexcept |
591 | { |
592 | return load_complex_aligned<A>(mem, cvt, A {}); |
593 | } |
594 | |
595 | /***************** |
596 | * store_complex * |
597 | *****************/ |
598 | |
599 | template <class A> |
600 | inline void store_complex_aligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept |
601 | { |
602 | float32x4x2_t tmp; |
603 | tmp.val[0] = src.real(); |
604 | tmp.val[1] = src.imag(); |
605 | float* buf = reinterpret_cast<float*>(dst); |
606 | vst2q_f32(buf, tmp); |
607 | } |
608 | |
609 | template <class A> |
610 | inline void store_complex_unaligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept |
611 | { |
612 | store_complex_aligned(dst, src, A {}); |
613 | } |
614 | |
615 | /******* |
616 | * neg * |
617 | *******/ |
618 | |
619 | template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> |
620 | inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept |
621 | { |
622 | return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(rhs))); |
623 | } |
624 | |
625 | template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> |
626 | inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept |
627 | { |
628 | return vnegq_s8(rhs); |
629 | } |
630 | |
631 | template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> |
632 | inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept |
633 | { |
634 | return vreinterpretq_u16_s16(vnegq_s16(vreinterpretq_s16_u16(rhs))); |
635 | } |
636 | |
637 | template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> |
638 | inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept |
639 | { |
640 | return vnegq_s16(rhs); |
641 | } |
642 | |
643 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
644 | inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept |
645 | { |
646 | return vreinterpretq_u32_s32(vnegq_s32(vreinterpretq_s32_u32(rhs))); |
647 | } |
648 | |
649 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
650 | inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept |
651 | { |
652 | return vnegq_s32(rhs); |
653 | } |
654 | |
655 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
656 | inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept |
657 | { |
658 | return batch<T, A> { -rhs.get(0), -rhs.get(1) }; |
659 | } |
660 | |
661 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
662 | inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept |
663 | { |
664 | return batch<T, A> { -rhs.get(0), -rhs.get(1) }; |
665 | } |
666 | |
667 | template <class A> |
668 | inline batch<float, A> neg(batch<float, A> const& rhs, requires_arch<neon>) noexcept |
669 | { |
670 | return vnegq_f32(rhs); |
671 | } |
672 | |
673 | /******* |
674 | * add * |
675 | *******/ |
676 | |
677 | WRAP_BINARY_INT(vaddq, detail::identity_return_type) |
678 | WRAP_BINARY_FLOAT(vaddq, detail::identity_return_type) |
679 | |
680 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
681 | inline batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
682 | { |
683 | using register_type = typename batch<T, A>::register_type; |
684 | const detail::neon_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vaddq_u8, wrap::vaddq_s8, wrap::vaddq_u16, wrap::vaddq_s16,
                wrap::vaddq_u32, wrap::vaddq_s32, wrap::vaddq_u64, wrap::vaddq_s64,
                wrap::vaddq_f32)
688 | }; |
689 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
690 | } |
691 | |
692 | /******** |
693 | * sadd * |
694 | ********/ |
695 | |
696 | WRAP_BINARY_INT(vqaddq, detail::identity_return_type) |
697 | |
698 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
699 | inline batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
700 | { |
701 | using register_type = typename batch<T, A>::register_type; |
702 | const detail::neon_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vqaddq_u8, wrap::vqaddq_s8, wrap::vqaddq_u16, wrap::vqaddq_s16,
                wrap::vqaddq_u32, wrap::vqaddq_s32, wrap::vqaddq_u64, wrap::vqaddq_s64,
                wrap::vaddq_f32)
706 | }; |
707 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
708 | } |
709 | |
710 | /******* |
711 | * sub * |
712 | *******/ |
713 | |
714 | WRAP_BINARY_INT(vsubq, detail::identity_return_type) |
715 | WRAP_BINARY_FLOAT(vsubq, detail::identity_return_type) |
716 | |
717 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
718 | inline batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
719 | { |
720 | using register_type = typename batch<T, A>::register_type; |
721 | const detail::neon_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vsubq_u8, wrap::vsubq_s8, wrap::vsubq_u16, wrap::vsubq_s16,
                wrap::vsubq_u32, wrap::vsubq_s32, wrap::vsubq_u64, wrap::vsubq_s64,
                wrap::vsubq_f32)
725 | }; |
726 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
727 | } |
728 | |
729 | /******** |
730 | * ssub * |
731 | ********/ |
732 | |
733 | WRAP_BINARY_INT(vqsubq, detail::identity_return_type) |
734 | |
735 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
736 | inline batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
737 | { |
738 | using register_type = typename batch<T, A>::register_type; |
739 | const detail::neon_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vqsubq_u8, wrap::vqsubq_s8, wrap::vqsubq_u16, wrap::vqsubq_s16,
                wrap::vqsubq_u32, wrap::vqsubq_s32, wrap::vqsubq_u64, wrap::vqsubq_s64,
                wrap::vsubq_f32)
743 | }; |
744 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
745 | } |
746 | |
747 | /******* |
748 | * mul * |
749 | *******/ |
750 | |
751 | WRAP_BINARY_INT_EXCLUDING_64(vmulq, detail::identity_return_type) |
752 | WRAP_BINARY_FLOAT(vmulq, detail::identity_return_type) |
753 | |
754 | template <class A, class T, detail::exclude_int64_neon_t<T> = 0> |
755 | inline batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
756 | { |
757 | using register_type = typename batch<T, A>::register_type; |
758 | const detail::excluding_int64_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vmulq_u8, wrap::vmulq_s8, wrap::vmulq_u16, wrap::vmulq_s16,
                wrap::vmulq_u32, wrap::vmulq_s32, wrap::vmulq_f32)
761 | }; |
762 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
763 | } |
764 | |
765 | /******* |
766 | * div * |
767 | *******/ |
768 | |
769 | #if defined(XSIMD_FAST_INTEGER_DIVISION) |
770 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
771 | inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
772 | { |
773 | return vcvtq_s32_f32(vcvtq_f32_s32(lhs) / vcvtq_f32_s32(rhs)); |
774 | } |
775 | |
776 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
777 | inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
778 | { |
779 | return vcvtq_u32_f32(vcvtq_f32_u32(lhs) / vcvtq_f32_u32(rhs)); |
780 | } |
781 | #endif |
782 | |
783 | template <class A> |
784 | inline batch<float, A> div(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept |
785 | { |
786 | // from stackoverflow & https://projectne10.github.io/Ne10/doc/NE10__divc_8neon_8c_source.html |
787 | // get an initial estimate of 1/b. |
788 | float32x4_t rcp = reciprocal(rhs); |
789 | |
790 | // use a couple Newton-Raphson steps to refine the estimate. Depending on your |
791 | // application's accuracy requirements, you may be able to get away with only |
792 | // one refinement (instead of the two used here). Be sure to test! |
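// vrecpsq_f32(b, x) evaluates (2 - b * x), so each line below is one
// Newton-Raphson iteration x <- x * (2 - b * x) on the reciprocal estimate.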
793 | rcp = vmulq_f32(vrecpsq_f32(rhs, rcp), rcp); |
794 | rcp = vmulq_f32(vrecpsq_f32(rhs, rcp), rcp); |
795 | |
796 | // and finally, compute a / b = a * (1 / b) |
797 | return vmulq_f32(lhs, rcp); |
798 | } |
799 | |
800 | /****** |
801 | * eq * |
802 | ******/ |
803 | |
804 | WRAP_BINARY_INT_EXCLUDING_64(vceqq, detail::comp_return_type) |
805 | WRAP_BINARY_FLOAT(vceqq, detail::comp_return_type) |
806 | |
807 | template <class A, class T, detail::exclude_int64_neon_t<T> = 0> |
808 | inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
809 | { |
810 | using register_type = typename batch<T, A>::register_type; |
811 | const detail::excluding_int64_comp_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vceqq_u8, wrap::vceqq_s8, wrap::vceqq_u16, wrap::vceqq_s16,
                wrap::vceqq_u32, wrap::vceqq_s32, wrap::vceqq_f32)
814 | }; |
815 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
816 | } |
817 | |
818 | template <class A, class T, detail::exclude_int64_neon_t<T> = 0> |
819 | inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept |
820 | { |
821 | using register_type = typename batch_bool<T, A>::register_type; |
822 | using dispatcher_type = detail::neon_comp_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary; |
823 | const dispatcher_type dispatcher = { |
std::make_tuple(wrap::vceqq_u8, wrap::vceqq_u16, wrap::vceqq_u32)
825 | }; |
826 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
827 | } |
828 | |
829 | template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> |
830 | inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
831 | { |
832 | return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) }); |
833 | } |
834 | |
835 | template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> |
836 | inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept |
837 | { |
838 | return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) }); |
839 | } |
840 | |
841 | /************* |
842 | * fast_cast * |
843 | *************/ |
844 | |
845 | namespace detail |
846 | { |
847 | template <class A> |
848 | inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept |
849 | { |
850 | return vcvtq_f32_s32(self); |
851 | } |
852 | |
853 | template <class A> |
854 | inline batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept |
855 | { |
856 | return vcvtq_f32_u32(self); |
857 | } |
858 | |
859 | template <class A> |
860 | inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<neon>) noexcept |
861 | { |
862 | return vcvtq_s32_f32(self); |
863 | } |
864 | |
865 | template <class A> |
866 | inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<neon>) noexcept |
867 | { |
868 | return vcvtq_u32_f32(self); |
869 | } |
870 | |
871 | } |
872 | |
873 | /****** |
874 | * lt * |
875 | ******/ |
876 | |
877 | WRAP_BINARY_INT_EXCLUDING_64(vcltq, detail::comp_return_type) |
878 | WRAP_BINARY_FLOAT(vcltq, detail::comp_return_type) |
879 | |
880 | template <class A, class T, detail::exclude_int64_neon_t<T> = 0> |
881 | inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
882 | { |
883 | using register_type = typename batch<T, A>::register_type; |
884 | const detail::excluding_int64_comp_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vcltq_u8, wrap::vcltq_s8, wrap::vcltq_u16, wrap::vcltq_s16,
                wrap::vcltq_u32, wrap::vcltq_s32, wrap::vcltq_f32)
887 | }; |
888 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
889 | } |
890 | |
891 | template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> |
892 | inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
893 | { |
894 | return batch_bool<T, A>({ lhs.get(0) < rhs.get(0), lhs.get(1) < rhs.get(1) }); |
895 | } |
896 | |
897 | /****** |
898 | * le * |
899 | ******/ |
900 | |
901 | WRAP_BINARY_INT_EXCLUDING_64(vcleq, detail::comp_return_type) |
902 | WRAP_BINARY_FLOAT(vcleq, detail::comp_return_type) |
903 | |
904 | template <class A, class T, detail::exclude_int64_neon_t<T> = 0> |
905 | inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
906 | { |
907 | using register_type = typename batch<T, A>::register_type; |
908 | const detail::excluding_int64_comp_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vcleq_u8, wrap::vcleq_s8, wrap::vcleq_u16, wrap::vcleq_s16,
                wrap::vcleq_u32, wrap::vcleq_s32, wrap::vcleq_f32)
911 | }; |
912 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
913 | } |
914 | |
915 | template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> |
916 | inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
917 | { |
918 | return batch_bool<T, A>({ lhs.get(0) <= rhs.get(0), lhs.get(1) <= rhs.get(1) }); |
919 | } |
920 | |
921 | /****** |
922 | * gt * |
923 | ******/ |
924 | |
925 | WRAP_BINARY_INT_EXCLUDING_64(vcgtq, detail::comp_return_type) |
926 | WRAP_BINARY_FLOAT(vcgtq, detail::comp_return_type) |
927 | |
928 | template <class A, class T, detail::exclude_int64_neon_t<T> = 0> |
929 | inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
930 | { |
931 | using register_type = typename batch<T, A>::register_type; |
932 | const detail::excluding_int64_comp_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vcgtq_u8, wrap::vcgtq_s8, wrap::vcgtq_u16, wrap::vcgtq_s16,
                wrap::vcgtq_u32, wrap::vcgtq_s32, wrap::vcgtq_f32)
935 | }; |
936 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
937 | } |
938 | |
939 | template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> |
940 | inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
941 | { |
942 | return batch_bool<T, A>({ lhs.get(0) > rhs.get(0), lhs.get(1) > rhs.get(1) }); |
943 | } |
944 | |
945 | /****** |
946 | * ge * |
947 | ******/ |
948 | |
949 | WRAP_BINARY_INT_EXCLUDING_64(vcgeq, detail::comp_return_type) |
950 | WRAP_BINARY_FLOAT(vcgeq, detail::comp_return_type) |
951 | |
952 | template <class A, class T, detail::exclude_int64_neon_t<T> = 0> |
953 | inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
954 | { |
955 | using register_type = typename batch<T, A>::register_type; |
956 | const detail::excluding_int64_comp_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vcgeq_u8, wrap::vcgeq_s8, wrap::vcgeq_u16, wrap::vcgeq_s16,
                wrap::vcgeq_u32, wrap::vcgeq_s32, wrap::vcgeq_f32)
959 | }; |
960 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
961 | } |
962 | |
963 | template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> |
964 | inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
965 | { |
966 | return batch_bool<T, A>({ lhs.get(0) >= rhs.get(0), lhs.get(1) >= rhs.get(1) }); |
967 | } |
968 | |
969 | /******************* |
970 | * batch_bool_cast * |
971 | *******************/ |
972 | |
973 | template <class A, class T_out, class T_in> |
974 | inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon>) noexcept |
975 | { |
976 | using register_type = typename batch_bool<T_out, A>::register_type; |
977 | return register_type(self); |
978 | } |
979 | |
980 | /*************** |
981 | * bitwise_and * |
982 | ***************/ |
983 | |
984 | WRAP_BINARY_INT(vandq, detail::identity_return_type) |
985 | |
986 | namespace detail |
987 | { |
988 | inline float32x4_t bitwise_and_f32(float32x4_t lhs, float32x4_t rhs) noexcept |
989 | { |
return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(lhs),
                                       vreinterpretq_u32_f32(rhs)));
992 | } |
993 | |
994 | template <class V> |
995 | V bitwise_and_neon(V const& lhs, V const& rhs) |
996 | { |
997 | const neon_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vandq_u8, wrap::vandq_s8, wrap::vandq_u16, wrap::vandq_s16,
                wrap::vandq_u32, wrap::vandq_s32, wrap::vandq_u64, wrap::vandq_s64,
                bitwise_and_f32)
1001 | }; |
1002 | return dispatcher.apply(lhs, rhs); |
1003 | } |
1004 | } |
1005 | |
1006 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1007 | inline batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1008 | { |
1009 | using register_type = typename batch<T, A>::register_type; |
1010 | return detail::bitwise_and_neon(register_type(lhs), register_type(rhs)); |
1011 | } |
1012 | |
1013 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1014 | inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept |
1015 | { |
1016 | using register_type = typename batch_bool<T, A>::register_type; |
1017 | return detail::bitwise_and_neon(register_type(lhs), register_type(rhs)); |
1018 | } |
1019 | |
1020 | /************** |
1021 | * bitwise_or * |
1022 | **************/ |
1023 | |
1024 | WRAP_BINARY_INT(vorrq, detail::identity_return_type) |
1025 | |
1026 | namespace detail |
1027 | { |
1028 | inline float32x4_t bitwise_or_f32(float32x4_t lhs, float32x4_t rhs) noexcept |
1029 | { |
return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(lhs),
                                       vreinterpretq_u32_f32(rhs)));
1032 | } |
1033 | |
1034 | template <class V> |
1035 | inline V bitwise_or_neon(V const& lhs, V const& rhs) noexcept |
1036 | { |
1037 | const neon_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vorrq_u8, wrap::vorrq_s8, wrap::vorrq_u16, wrap::vorrq_s16,
                wrap::vorrq_u32, wrap::vorrq_s32, wrap::vorrq_u64, wrap::vorrq_s64,
                bitwise_or_f32)
1041 | }; |
1042 | return dispatcher.apply(lhs, rhs); |
1043 | } |
1044 | } |
1045 | |
1046 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1047 | inline batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1048 | { |
1049 | using register_type = typename batch<T, A>::register_type; |
1050 | return detail::bitwise_or_neon(register_type(lhs), register_type(rhs)); |
1051 | } |
1052 | |
1053 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1054 | inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept |
1055 | { |
1056 | using register_type = typename batch_bool<T, A>::register_type; |
1057 | return detail::bitwise_or_neon(register_type(lhs), register_type(rhs)); |
1058 | } |
1059 | |
1060 | /*************** |
1061 | * bitwise_xor * |
1062 | ***************/ |
1063 | |
1064 | WRAP_BINARY_INT(veorq, detail::identity_return_type) |
1065 | |
1066 | namespace detail |
1067 | { |
1068 | inline float32x4_t bitwise_xor_f32(float32x4_t lhs, float32x4_t rhs) noexcept |
1069 | { |
return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(lhs),
                                       vreinterpretq_u32_f32(rhs)));
1072 | } |
1073 | |
1074 | template <class V> |
1075 | inline V bitwise_xor_neon(V const& lhs, V const& rhs) noexcept |
1076 | { |
1077 | const neon_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::veorq_u8, wrap::veorq_s8, wrap::veorq_u16, wrap::veorq_s16,
                wrap::veorq_u32, wrap::veorq_s32, wrap::veorq_u64, wrap::veorq_s64,
                bitwise_xor_f32)
1081 | }; |
1082 | return dispatcher.apply(lhs, rhs); |
1083 | } |
1084 | } |
1085 | |
1086 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1087 | inline batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1088 | { |
1089 | using register_type = typename batch<T, A>::register_type; |
1090 | return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs)); |
1091 | } |
1092 | |
1093 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1094 | inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept |
1095 | { |
1096 | using register_type = typename batch_bool<T, A>::register_type; |
1097 | return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs)); |
1098 | } |
1099 | |
1100 | /******* |
1101 | * neq * |
1102 | *******/ |
1103 | |
1104 | template <class A, class T> |
1105 | inline batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept |
1106 | { |
1107 | return bitwise_xor(lhs, rhs, A {}); |
1108 | } |
1109 | |
1110 | /*************** |
1111 | * bitwise_not * |
1112 | ***************/ |
1113 | |
1114 | WRAP_UNARY_INT_EXCLUDING_64(vmvnq) |
1115 | |
1116 | namespace detail |
1117 | { |
1118 | inline int64x2_t bitwise_not_s64(int64x2_t arg) noexcept |
1119 | { |
return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(arg)));
1121 | } |
1122 | |
1123 | inline uint64x2_t bitwise_not_u64(uint64x2_t arg) noexcept |
1124 | { |
return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(arg)));
1126 | } |
1127 | |
1128 | inline float32x4_t bitwise_not_f32(float32x4_t arg) noexcept |
1129 | { |
return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg)));
1131 | } |
1132 | |
1133 | template <class V> |
1134 | inline V bitwise_not_neon(V const& arg) noexcept |
1135 | { |
1136 | const neon_dispatcher::unary dispatcher = { |
std::make_tuple(wrap::vmvnq_u8, wrap::vmvnq_s8, wrap::vmvnq_u16, wrap::vmvnq_s16,
                wrap::vmvnq_u32, wrap::vmvnq_s32,
                bitwise_not_u64, bitwise_not_s64,
                bitwise_not_f32)
1141 | }; |
1142 | return dispatcher.apply(arg); |
1143 | } |
1144 | } |
1145 | |
1146 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1147 | inline batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<neon>) noexcept |
1148 | { |
1149 | using register_type = typename batch<T, A>::register_type; |
1150 | return detail::bitwise_not_neon(register_type(arg)); |
1151 | } |
1152 | |
1153 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1154 | inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
1155 | { |
1156 | using register_type = typename batch_bool<T, A>::register_type; |
1157 | return detail::bitwise_not_neon(register_type(arg)); |
1158 | } |
1159 | |
1160 | /****************** |
1161 | * bitwise_andnot * |
1162 | ******************/ |
1163 | |
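// vbicq (bit clear) computes lhs & ~rhs, which is exactly the and-not operation
// implemented here.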
1164 | WRAP_BINARY_INT(vbicq, detail::identity_return_type) |
1165 | |
1166 | namespace detail |
1167 | { |
1168 | inline float32x4_t bitwise_andnot_f32(float32x4_t lhs, float32x4_t rhs) noexcept |
1169 | { |
return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs)));
1171 | } |
1172 | |
1173 | template <class V> |
1174 | inline V bitwise_andnot_neon(V const& lhs, V const& rhs) noexcept |
1175 | { |
1176 | const detail::neon_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vbicq_u8, wrap::vbicq_s8, wrap::vbicq_u16, wrap::vbicq_s16,
                wrap::vbicq_u32, wrap::vbicq_s32, wrap::vbicq_u64, wrap::vbicq_s64,
                bitwise_andnot_f32)
1180 | }; |
1181 | return dispatcher.apply(lhs, rhs); |
1182 | } |
1183 | } |
1184 | |
1185 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1186 | inline batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1187 | { |
1188 | using register_type = typename batch<T, A>::register_type; |
1189 | return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs)); |
1190 | } |
1191 | |
1192 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1193 | inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept |
1194 | { |
1195 | using register_type = typename batch_bool<T, A>::register_type; |
1196 | return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs)); |
1197 | } |
1198 | |
1199 | /******* |
1200 | * min * |
1201 | *******/ |
1202 | |
1203 | WRAP_BINARY_INT_EXCLUDING_64(vminq, detail::identity_return_type) |
1204 | WRAP_BINARY_FLOAT(vminq, detail::identity_return_type) |
1205 | |
1206 | template <class A, class T, detail::exclude_int64_neon_t<T> = 0> |
1207 | inline batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1208 | { |
1209 | using register_type = typename batch<T, A>::register_type; |
1210 | const detail::excluding_int64_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vminq_u8, wrap::vminq_s8, wrap::vminq_u16, wrap::vminq_s16,
                wrap::vminq_u32, wrap::vminq_s32, wrap::vminq_f32)
1213 | }; |
1214 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
1215 | } |
1216 | |
1217 | template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> |
1218 | inline batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1219 | { |
1220 | return { std::min(lhs.get(0), rhs.get(0)), std::min(lhs.get(1), rhs.get(1)) }; |
1221 | } |
1222 | |
1223 | /******* |
1224 | * max * |
1225 | *******/ |
1226 | |
1227 | WRAP_BINARY_INT_EXCLUDING_64(vmaxq, detail::identity_return_type) |
1228 | WRAP_BINARY_FLOAT(vmaxq, detail::identity_return_type) |
1229 | |
1230 | template <class A, class T, detail::exclude_int64_neon_t<T> = 0> |
1231 | inline batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1232 | { |
1233 | using register_type = typename batch<T, A>::register_type; |
1234 | const detail::excluding_int64_dispatcher::binary dispatcher = { |
std::make_tuple(wrap::vmaxq_u8, wrap::vmaxq_s8, wrap::vmaxq_u16, wrap::vmaxq_s16,
                wrap::vmaxq_u32, wrap::vmaxq_s32, wrap::vmaxq_f32)
1237 | }; |
1238 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
1239 | } |
1240 | |
1241 | template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> |
1242 | inline batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1243 | { |
1244 | return { std::max(lhs.get(0), rhs.get(0)), std::max(lhs.get(1), rhs.get(1)) }; |
1245 | } |
1246 | |
1247 | /******* |
1248 | * abs * |
1249 | *******/ |
1250 | |
1251 | namespace wrap |
1252 | { |
inline int8x16_t vabsq_s8(int8x16_t a) noexcept { return ::vabsq_s8(a); }
inline int16x8_t vabsq_s16(int16x8_t a) noexcept { return ::vabsq_s16(a); }
inline int32x4_t vabsq_s32(int32x4_t a) noexcept { return ::vabsq_s32(a); }
1256 | } |
1257 | WRAP_UNARY_FLOAT(vabsq) |
1258 | |
1259 | namespace detail |
1260 | { |
1261 | inline uint8x16_t abs_u8(uint8x16_t arg) noexcept |
1262 | { |
1263 | return arg; |
1264 | } |
1265 | |
1266 | inline uint16x8_t abs_u16(uint16x8_t arg) noexcept |
1267 | { |
1268 | return arg; |
1269 | } |
1270 | |
1271 | inline uint32x4_t abs_u32(uint32x4_t arg) noexcept |
1272 | { |
1273 | return arg; |
1274 | } |
1275 | } |
1276 | |
1277 | template <class A, class T, detail::exclude_int64_neon_t<T> = 0> |
1278 | inline batch<T, A> abs(batch<T, A> const& arg, requires_arch<neon>) noexcept |
1279 | { |
1280 | using register_type = typename batch<T, A>::register_type; |
1281 | const detail::excluding_int64_dispatcher::unary dispatcher = { |
std::make_tuple(detail::abs_u8, wrap::vabsq_s8, detail::abs_u16, wrap::vabsq_s16,
                detail::abs_u32, wrap::vabsq_s32, wrap::vabsq_f32)
1284 | }; |
1285 | return dispatcher.apply(register_type(arg)); |
1286 | } |
1287 | |
1288 | /******** |
1289 | * rsqrt * |
1290 | ********/ |
1291 | |
1292 | template <class A> |
1293 | inline batch<float, A> rsqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept |
1294 | { |
1295 | return vrsqrteq_f32(arg); |
1296 | } |
1297 | |
1298 | /******** |
1299 | * sqrt * |
1300 | ********/ |
1301 | |
1302 | template <class A> |
1303 | inline batch<float, A> sqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept |
1304 | { |
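// vrsqrteq_f32 gives a rough estimate of 1/sqrt(arg); vrsqrtsq_f32(a, b) returns
// (3 - a * b) / 2, so each multiplication by it performs one Newton-Raphson
// refinement of that estimate. sqrt(arg) is then recovered as arg * (1/sqrt(arg)),
// and the final select returns exactly 0 for a zero input, where the reciprocal
// estimate would otherwise be infinite.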
1305 | batch<float, A> sqrt_reciprocal = vrsqrteq_f32(arg); |
1306 | // one iter |
1307 | sqrt_reciprocal = sqrt_reciprocal * batch<float, A>(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal)); |
1308 | batch<float, A> sqrt_approx = arg * sqrt_reciprocal * batch<float, A>(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal)); |
1309 | batch<float, A> zero(0.f); |
1310 | return select(arg == zero, zero, sqrt_approx); |
1311 | } |
1312 | |
1313 | /******************** |
1314 | * Fused operations * |
1315 | ********************/ |
1316 | |
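// vfmaq_f32(a, b, c) computes a + b * c in a single fused operation, so passing z
// or -z as the accumulator yields fma(x, y, z) = x * y + z and
// fms(x, y, z) = x * y - z.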
1317 | #ifdef __ARM_FEATURE_FMA |
1318 | template <class A> |
1319 | inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept |
1320 | { |
1321 | return vfmaq_f32(z, x, y); |
1322 | } |
1323 | |
1324 | template <class A> |
1325 | inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept |
1326 | { |
1327 | return vfmaq_f32(-z, x, y); |
1328 | } |
1329 | #endif |
1330 | |
1331 | /********* |
1332 | * haddp * |
1333 | *********/ |
1334 | |
1335 | template <class A> |
1336 | inline batch<float, A> haddp(const batch<float, A>* row, requires_arch<neon>) noexcept |
1337 | { |
1338 | // row = (a,b,c,d) |
1339 | float32x2_t tmp1, tmp2, tmp3; |
1340 | // tmp1 = (a0 + a2, a1 + a3) |
1341 | tmp1 = vpadd_f32(vget_low_f32(row[0]), vget_high_f32(row[0])); |
1342 | // tmp2 = (b0 + b2, b1 + b3) |
1343 | tmp2 = vpadd_f32(vget_low_f32(row[1]), vget_high_f32(row[1])); |
1344 | // tmp1 = (a0..3, b0..3) |
tmp1 = vpadd_f32(tmp1, tmp2);
1346 | // tmp2 = (c0 + c2, c1 + c3) |
1347 | tmp2 = vpadd_f32(vget_low_f32(row[2]), vget_high_f32(row[2])); |
1348 | // tmp3 = (d0 + d2, d1 + d3) |
1349 | tmp3 = vpadd_f32(vget_low_f32(row[3]), vget_high_f32(row[3])); |
// tmp2 = (c0..3, d0..3)
tmp2 = vpadd_f32(tmp2, tmp3);
1352 | // return = (a0..3, b0..3, c0..3, d0..3) |
return vcombine_f32(tmp1, tmp2);
1354 | } |
1355 | |
1356 | /************** |
1357 | * reciprocal * |
1358 | **************/ |
1359 | |
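// vrecpeq_f32 only returns a coarse estimate of 1/x; callers that need more
// accuracy (such as div() above) refine it with vrecpsq_f32 Newton-Raphson steps.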
1360 | template <class A> |
1361 | inline batch<float, A> |
1362 | reciprocal(const batch<float, A>& x, |
1363 | kernel::requires_arch<neon>) noexcept |
1364 | { |
1365 | return vrecpeq_f32(x); |
1366 | } |
1367 | |
1368 | /********** |
1369 | * insert * |
1370 | **********/ |
1371 | |
1372 | template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 1> = 0> |
1373 | inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept |
1374 | { |
1375 | return vsetq_lane_u8(val, self, I); |
1376 | } |
1377 | |
1378 | template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 1> = 0> |
1379 | inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept |
1380 | { |
1381 | return vsetq_lane_s8(val, self, I); |
1382 | } |
1383 | |
1384 | template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 2> = 0> |
1385 | inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept |
1386 | { |
1387 | return vsetq_lane_u16(val, self, I); |
1388 | } |
1389 | |
1390 | template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 2> = 0> |
inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1392 | { |
1393 | return vsetq_lane_s16(val, self, I); |
1394 | } |
1395 | |
1396 | template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 4> = 0> |
1397 | inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept |
1398 | { |
1399 | return vsetq_lane_u32(val, self, I); |
1400 | } |
1401 | |
1402 | template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 4> = 0> |
1403 | inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept |
1404 | { |
1405 | return vsetq_lane_s32(val, self, I); |
1406 | } |
1407 | |
1408 | template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 8> = 0> |
1409 | inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept |
1410 | { |
1411 | return vsetq_lane_u64(val, self, I); |
1412 | } |
1413 | |
1414 | template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 8> = 0> |
1415 | inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept |
1416 | { |
1417 | return vsetq_lane_s64(val, self, I); |
1418 | } |
1419 | |
1420 | template <class A, size_t I> |
1421 | inline batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<neon>) noexcept |
1422 | { |
1423 | return vsetq_lane_f32(val, self, I); |
1424 | } |
1425 | |
1426 | /******************** |
1427 | * nearbyint_as_int * |
1428 | *******************/ |
1429 | |
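// Rounds to the nearest integer with ties to even, e.g. 0.5 -> 0, 1.5 -> 2,
// 2.5 -> 2 and -0.5 -> 0, i.e. the usual default rounding expected of nearbyint.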
1430 | template <class A> |
1431 | inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self, |
1432 | requires_arch<neon>) noexcept |
1433 | { |
1434 | /* origin: https://github.com/DLTcollab/sse2neon/blob/cad518a93b326f0f644b7972d488d04eaa2b0475/sse2neon.h#L4028-L4047 */ |
1435 | // Contributors to this work are: |
1436 | // John W. Ratcliff <jratcliffscarab@gmail.com> |
1437 | // Brandon Rowlett <browlett@nvidia.com> |
1438 | // Ken Fast <kfast@gdeb.com> |
1439 | // Eric van Beurden <evanbeurden@nvidia.com> |
1440 | // Alexander Potylitsin <apotylitsin@nvidia.com> |
1441 | // Hasindu Gamaarachchi <hasindu2008@gmail.com> |
1442 | // Jim Huang <jserv@biilabs.io> |
1443 | // Mark Cheng <marktwtn@biilabs.io> |
1444 | // Malcolm James MacLeod <malcolm@gulden.com> |
1445 | // Devin Hussey (easyaspi314) <husseydevin@gmail.com> |
1446 | // Sebastian Pop <spop@amazon.com> |
1447 | // Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com> |
1448 | // Danila Kutenin <danilak@google.com> |
1449 | // François Turban (JishinMaster) <francois.turban@gmail.com> |
1450 | // Pei-Hsuan Hung <afcidk@gmail.com> |
1451 | // Yang-Hao Yuan <yanghau@biilabs.io> |
1452 | // Syoyo Fujita <syoyo@lighttransport.com> |
1453 | // Brecht Van Lommel <brecht@blender.org> |
1454 | |
1455 | /* |
1456 | * sse2neon is freely redistributable under the MIT License. |
1457 | * |
1458 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
1459 | * of this software and associated documentation files (the "Software"), to deal |
1460 | * in the Software without restriction, including without limitation the rights |
1461 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
1462 | * copies of the Software, and to permit persons to whom the Software is |
1463 | * furnished to do so, subject to the following conditions: |
1464 | * |
1465 | * The above copyright notice and this permission notice shall be included in |
1466 | * all copies or substantial portions of the Software. |
1467 | * |
1468 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
1469 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
1470 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
1471 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
1472 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
1473 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
1474 | * SOFTWARE. |
1475 | */ |
1476 | |
const auto signmask = vdupq_n_u32(0x80000000);
1478 | const auto half = vbslq_f32(signmask, self, |
vdupq_n_f32(0.5f)); /* +/- 0.5 */
1480 | const auto r_normal = vcvtq_s32_f32(vaddq_f32( |
1481 | self, half)); /* round to integer: [a + 0.5]*/ |
1482 | const auto r_trunc = vcvtq_s32_f32(self); /* truncate to integer: [a] */ |
1483 | const auto plusone = vreinterpretq_s32_u32(vshrq_n_u32( |
1484 | vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ |
1485 | const auto r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), |
vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
1487 | const auto delta = vsubq_f32( |
1488 | self, |
1489 | vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ |
1490 | const auto is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ |
1491 | return vbslq_s32(is_delta_half, r_even, r_normal); |
1492 | } |
1493 | |
1494 | /************** |
1495 | * reduce_add * |
1496 | **************/ |
1497 | |
1498 | namespace detail |
1499 | { |
1500 | template <class T, class A, class V> |
1501 | inline T sum_batch(V const& arg) noexcept |
1502 | { |
1503 | T res = T(0); |
1504 | for (std::size_t i = 0; i < batch<T, A>::size; ++i) |
1505 | { |
1506 | res += arg[i]; |
1507 | } |
1508 | return res; |
1509 | } |
1510 | } |
1511 | |
1512 | template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> |
1513 | inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept |
1514 | { |
1515 | uint8x8_t tmp = vpadd_u8(vget_low_u8(arg), vget_high_u8(arg)); |
tmp = vpadd_u8(tmp, tmp);
tmp = vpadd_u8(tmp, tmp);
tmp = vpadd_u8(tmp, tmp);
1519 | return vget_lane_u8(tmp, 0); |
1520 | } |
1521 | |
1522 | template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> |
1523 | inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept |
1524 | { |
1525 | int8x8_t tmp = vpadd_s8(vget_low_s8(arg), vget_high_s8(arg)); |
tmp = vpadd_s8(tmp, tmp);
tmp = vpadd_s8(tmp, tmp);
tmp = vpadd_s8(tmp, tmp);
1529 | return vget_lane_s8(tmp, 0); |
1530 | } |
1531 | |
1532 | template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> |
1533 | inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept |
1534 | { |
1535 | uint16x4_t tmp = vpadd_u16(vget_low_u16(arg), vget_high_u16(arg)); |
tmp = vpadd_u16(tmp, tmp);
tmp = vpadd_u16(tmp, tmp);
1538 | return vget_lane_u16(tmp, 0); |
1539 | } |
1540 | |
1541 | template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> |
1542 | inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept |
1543 | { |
1544 | int16x4_t tmp = vpadd_s16(vget_low_s16(arg), vget_high_s16(arg)); |
tmp = vpadd_s16(tmp, tmp);
tmp = vpadd_s16(tmp, tmp);
1547 | return vget_lane_s16(tmp, 0); |
1548 | } |
1549 | |
1550 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
1551 | inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept |
1552 | { |
1553 | uint32x2_t tmp = vpadd_u32(vget_low_u32(arg), vget_high_u32(arg)); |
tmp = vpadd_u32(tmp, tmp);
1555 | return vget_lane_u32(tmp, 0); |
1556 | } |
1557 | |
1558 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
1559 | inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept |
1560 | { |
1561 | int32x2_t tmp = vpadd_s32(vget_low_s32(arg), vget_high_s32(arg)); |
tmp = vpadd_s32(tmp, tmp);
1563 | return vget_lane_s32(tmp, 0); |
1564 | } |
1565 | |
1566 | template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> |
1567 | inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept |
1568 | { |
1569 | return arg.get(0) + arg.get(1); |
1570 | } |
1571 | |
1572 | template <class A> |
1573 | inline float reduce_add(batch<float, A> const& arg, requires_arch<neon>) noexcept |
1574 | { |
1575 | float32x2_t tmp = vpadd_f32(vget_low_f32(arg), vget_high_f32(arg)); |
tmp = vpadd_f32(tmp, tmp);
1577 | return vget_lane_f32(tmp, 0); |
1578 | } |
1579 | |
1580 | /************** |
1581 | * reduce_max * |
1582 | **************/ |
1583 | |
// Using the generic implementation because ARM does not provide intrinsics
1585 | // for this operation |
1586 | |
1587 | /************** |
1588 | * reduce_min * |
1589 | **************/ |
1590 | |
// Using the generic implementation because ARM does not provide intrinsics
1592 | // for this operation |
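// A minimal sketch of what such a generic fallback boils down to for both
// reductions (illustrative only; the actual generic kernel used by xsimd
// may be structured differently):
//
//     template <class A, class T>
//     inline T reduce_max_sketch(batch<T, A> const& arg) noexcept
//     {
//         std::array<T, batch<T, A>::size> buffer;
//         arg.store_unaligned(buffer.data());
//         return *std::max_element(buffer.begin(), buffer.end());
//     }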
1593 | |
1594 | /********** |
1595 | * select * |
1596 | **********/ |
1597 | |
1598 | namespace wrap |
1599 | { |
inline uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) noexcept { return ::vbslq_u8(a, b, c); }
inline int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) noexcept { return ::vbslq_s8(a, b, c); }
inline uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) noexcept { return ::vbslq_u16(a, b, c); }
inline int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) noexcept { return ::vbslq_s16(a, b, c); }
inline uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) noexcept { return ::vbslq_u32(a, b, c); }
inline int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) noexcept { return ::vbslq_s32(a, b, c); }
inline uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) noexcept { return ::vbslq_u64(a, b, c); }
inline int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) noexcept { return ::vbslq_s64(a, b, c); }
inline float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) noexcept { return ::vbslq_f32(a, b, c); }
1609 | } |
1610 | |
1611 | namespace detail |
1612 | { |
1613 | template <class... T> |
1614 | struct neon_select_dispatcher_impl |
1615 | { |
1616 | using container_type = std::tuple<T (*)(comp_return_type<T>, T, T)...>; |
1617 | const container_type m_func; |
1618 | |
1619 | template <class U> |
1620 | U apply(comp_return_type<U> cond, U lhs, U rhs) const noexcept |
1621 | { |
1622 | using func_type = U (*)(comp_return_type<U>, U, U); |
1623 | auto func = xsimd::detail::get<func_type>(m_func); |
1624 | return func(cond, lhs, rhs); |
1625 | } |
1626 | }; |
1627 | |
1628 | using neon_select_dispatcher = neon_select_dispatcher_impl<uint8x16_t, int8x16_t, |
1629 | uint16x8_t, int16x8_t, |
1630 | uint32x4_t, int32x4_t, |
1631 | uint64x2_t, int64x2_t, |
1632 | float32x4_t>; |
1633 | } |
1634 | |
1635 | template <class A, class T, detail::enable_neon_type_t<T> = 0> |
1636 | inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<neon>) noexcept |
1637 | { |
1638 | using bool_register_type = typename batch_bool<T, A>::register_type; |
1639 | using register_type = typename batch<T, A>::register_type; |
1640 | const detail::neon_select_dispatcher dispatcher = { |
std::make_tuple(wrap::vbslq_u8, wrap::vbslq_s8, wrap::vbslq_u16, wrap::vbslq_s16,
                wrap::vbslq_u32, wrap::vbslq_s32, wrap::vbslq_u64, wrap::vbslq_s64,
                wrap::vbslq_f32)
1644 | }; |
1645 | return dispatcher.apply(bool_register_type(cond), register_type(a), register_type(b)); |
1646 | } |
1647 | |
1648 | template <class A, class T, bool... b, detail::enable_neon_type_t<T> = 0> |
1649 | inline batch<T, A> select(batch_bool_constant<batch<T, A>, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<neon>) noexcept |
1650 | { |
1651 | return select(batch_bool<T, A> { b... }, true_br, false_br, neon {}); |
1652 | } |
1653 | |
1654 | /********** |
1655 | * zip_lo * |
1656 | **********/ |
1657 | |
1658 | template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> |
1659 | inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1660 | { |
1661 | uint8x8x2_t tmp = vzip_u8(vget_low_u8(lhs), vget_low_u8(rhs)); |
return vcombine_u8(tmp.val[0], tmp.val[1]);
1663 | } |
1664 | |
1665 | template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> |
1666 | inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1667 | { |
1668 | int8x8x2_t tmp = vzip_s8(vget_low_s8(lhs), vget_low_s8(rhs)); |
return vcombine_s8(tmp.val[0], tmp.val[1]);
1670 | } |
1671 | |
1672 | template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> |
1673 | inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1674 | { |
1675 | uint16x4x2_t tmp = vzip_u16(vget_low_u16(lhs), vget_low_u16(rhs)); |
return vcombine_u16(tmp.val[0], tmp.val[1]);
1677 | } |
1678 | |
1679 | template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> |
1680 | inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1681 | { |
1682 | int16x4x2_t tmp = vzip_s16(vget_low_s16(lhs), vget_low_s16(rhs)); |
return vcombine_s16(tmp.val[0], tmp.val[1]);
1684 | } |
1685 | |
1686 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
1687 | inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1688 | { |
1689 | uint32x2x2_t tmp = vzip_u32(vget_low_u32(lhs), vget_low_u32(rhs)); |
return vcombine_u32(tmp.val[0], tmp.val[1]);
1691 | } |
1692 | |
1693 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
1694 | inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1695 | { |
1696 | int32x2x2_t tmp = vzip_s32(vget_low_s32(lhs), vget_low_s32(rhs)); |
return vcombine_s32(tmp.val[0], tmp.val[1]);
1698 | } |
1699 | |
1700 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
1701 | inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1702 | { |
1703 | return vcombine_u64(vget_low_u64(lhs), vget_low_u64(rhs)); |
1704 | } |
1705 | |
1706 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
1707 | inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1708 | { |
1709 | return vcombine_s64(vget_low_s64(lhs), vget_low_s64(rhs)); |
1710 | } |
1711 | |
1712 | template <class A> |
1713 | inline batch<float, A> zip_lo(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept |
1714 | { |
1715 | float32x2x2_t tmp = vzip_f32(vget_low_f32(lhs), vget_low_f32(rhs)); |
return vcombine_f32(tmp.val[0], tmp.val[1]);
1717 | } |
1718 | |
1719 | /********** |
1720 | * zip_hi * |
1721 | **********/ |
1722 | |
1723 | template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> |
1724 | inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1725 | { |
1726 | uint8x8x2_t tmp = vzip_u8(vget_high_u8(lhs), vget_high_u8(rhs)); |
return vcombine_u8(tmp.val[0], tmp.val[1]);
1728 | } |
1729 | |
1730 | template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> |
1731 | inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1732 | { |
1733 | int8x8x2_t tmp = vzip_s8(vget_high_s8(lhs), vget_high_s8(rhs)); |
return vcombine_s8(tmp.val[0], tmp.val[1]);
1735 | } |
1736 | |
1737 | template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> |
1738 | inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1739 | { |
1740 | uint16x4x2_t tmp = vzip_u16(vget_high_u16(lhs), vget_high_u16(rhs)); |
return vcombine_u16(tmp.val[0], tmp.val[1]);
1742 | } |
1743 | |
1744 | template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> |
1745 | inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1746 | { |
1747 | int16x4x2_t tmp = vzip_s16(vget_high_s16(lhs), vget_high_s16(rhs)); |
return vcombine_s16(tmp.val[0], tmp.val[1]);
1749 | } |
1750 | |
1751 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
1752 | inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1753 | { |
1754 | uint32x2x2_t tmp = vzip_u32(vget_high_u32(lhs), vget_high_u32(rhs)); |
return vcombine_u32(tmp.val[0], tmp.val[1]);
1756 | } |
1757 | |
1758 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
1759 | inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1760 | { |
1761 | int32x2x2_t tmp = vzip_s32(vget_high_s32(lhs), vget_high_s32(rhs)); |
return vcombine_s32(tmp.val[0], tmp.val[1]);
1763 | } |
1764 | |
1765 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
1766 | inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1767 | { |
1768 | return vcombine_u64(vget_high_u64(lhs), vget_high_u64(rhs)); |
1769 | } |
1770 | |
1771 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
1772 | inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
1773 | { |
1774 | return vcombine_s64(vget_high_s64(lhs), vget_high_s64(rhs)); |
1775 | } |
1776 | |
1777 | template <class A> |
1778 | inline batch<float, A> zip_hi(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept |
1779 | { |
1780 | float32x2x2_t tmp = vzip_f32(vget_high_f32(lhs), vget_high_f32(rhs)); |
return vcombine_f32(tmp.val[0], tmp.val[1]);
1782 | } |
1783 | |
1784 | /**************** |
1785 | * extract_pair * |
1786 | ****************/ |
1787 | |
1788 | namespace detail |
1789 | { |
1790 | template <class A, class T> |
inline batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& /*rhs*/, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
1792 | { |
1793 | assert(false && "extract_pair out of bounds" ); |
1794 | return batch<T, A> {}; |
1795 | } |
1796 | |
1797 | template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 1> = 0> |
inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1799 | { |
1800 | if (n == I) |
1801 | { |
1802 | return vextq_u8(rhs, lhs, I); |
1803 | } |
1804 | else |
1805 | { |
1806 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1807 | } |
1808 | } |
1809 | |
1810 | template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 1> = 0> |
inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1812 | { |
1813 | if (n == I) |
1814 | { |
1815 | return vextq_s8(rhs, lhs, I); |
1816 | } |
1817 | else |
1818 | { |
1819 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1820 | } |
1821 | } |
1822 | |
1823 | template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 2> = 0> |
inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1825 | { |
1826 | if (n == I) |
1827 | { |
1828 | return vextq_u16(rhs, lhs, I); |
1829 | } |
1830 | else |
1831 | { |
1832 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1833 | } |
1834 | } |
1835 | |
1836 | template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 2> = 0> |
inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1838 | { |
1839 | if (n == I) |
1840 | { |
1841 | return vextq_s16(rhs, lhs, I); |
1842 | } |
1843 | else |
1844 | { |
1845 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1846 | } |
1847 | } |
1848 | |
1849 | template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 4> = 0> |
inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1851 | { |
1852 | if (n == I) |
1853 | { |
1854 | return vextq_u32(rhs, lhs, I); |
1855 | } |
1856 | else |
1857 | { |
1858 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1859 | } |
1860 | } |
1861 | |
1862 | template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 4> = 0> |
inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1864 | { |
1865 | if (n == I) |
1866 | { |
1867 | return vextq_s32(rhs, lhs, I); |
1868 | } |
1869 | else |
1870 | { |
1871 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1872 | } |
1873 | } |
1874 | |
1875 | template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 8> = 0> |
inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1877 | { |
1878 | if (n == I) |
1879 | { |
1880 | return vextq_u64(rhs, lhs, I); |
1881 | } |
1882 | else |
1883 | { |
1884 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1885 | } |
1886 | } |
1887 | |
1888 | template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 8> = 0> |
inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1890 | { |
1891 | if (n == I) |
1892 | { |
1893 | return vextq_s64(rhs, lhs, I); |
1894 | } |
1895 | else |
1896 | { |
1897 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1898 | } |
1899 | } |
1900 | |
1901 | template <class A, size_t I, size_t... Is> |
inline batch<float, A> extract_pair(batch<float, A> const& lhs, batch<float, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1903 | { |
1904 | if (n == I) |
1905 | { |
1906 | return vextq_f32(rhs, lhs, I); |
1907 | } |
1908 | else |
1909 | { |
1910 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1911 | } |
1912 | } |
1913 | |
1914 | template <class A, class T, size_t... Is> |
inline batch<T, A> extract_pair_impl(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<0, Is...>) noexcept
1916 | { |
1917 | if (n == 0) |
1918 | { |
1919 | return rhs; |
1920 | } |
1921 | else |
1922 | { |
1923 | return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); |
1924 | } |
1925 | } |
1926 | } |
1927 | |
1928 | template <class A, class T> |
inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, requires_arch<neon>) noexcept
1930 | { |
1931 | constexpr std::size_t size = batch<T, A>::size; |
1932 | assert(n < size && "index in bounds" ); |
1933 | return detail::extract_pair_impl(lhs, rhs, n, ::xsimd::detail::make_index_sequence<size>()); |
1934 | } |
1935 | |
1936 | /****************** |
1937 | * bitwise_lshift * |
1938 | ******************/ |
1939 | |
1940 | namespace detail |
1941 | { |
1942 | template <class A, class T> |
1943 | inline batch<T, A> bitwise_lshift(batch<T, A> const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept |
1944 | { |
1945 | assert(false && "bitwise_lshift out of bounds" ); |
1946 | return batch<T, A> {}; |
1947 | } |
1948 | |
1949 | template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 1> = 0> |
1950 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
1951 | { |
1952 | if (n == I) |
1953 | { |
1954 | return vshlq_n_u8(lhs, I); |
1955 | } |
1956 | else |
1957 | { |
1958 | return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
1959 | } |
1960 | } |
1961 | |
1962 | template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 1> = 0> |
1963 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
1964 | { |
1965 | if (n == I) |
1966 | { |
1967 | return vshlq_n_s8(lhs, I); |
1968 | } |
1969 | else |
1970 | { |
1971 | return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
1972 | } |
1973 | } |
1974 | |
1975 | template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 2> = 0> |
1976 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
1977 | { |
1978 | if (n == I) |
1979 | { |
1980 | return vshlq_n_u16(lhs, I); |
1981 | } |
1982 | else |
1983 | { |
1984 | return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
1985 | } |
1986 | } |
1987 | |
1988 | template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 2> = 0> |
1989 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
1990 | { |
1991 | if (n == I) |
1992 | { |
1993 | return vshlq_n_s16(lhs, I); |
1994 | } |
1995 | else |
1996 | { |
1997 | return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
1998 | } |
1999 | } |
2000 | |
2001 | template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 4> = 0> |
2002 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2003 | { |
2004 | if (n == I) |
2005 | { |
2006 | return vshlq_n_u32(lhs, I); |
2007 | } |
2008 | else |
2009 | { |
2010 | return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2011 | } |
2012 | } |
2013 | |
2014 | template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 4> = 0> |
2015 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2016 | { |
2017 | if (n == I) |
2018 | { |
2019 | return vshlq_n_s32(lhs, I); |
2020 | } |
2021 | else |
2022 | { |
2023 | return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2024 | } |
2025 | } |
2026 | |
2027 | template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 8> = 0> |
2028 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2029 | { |
2030 | if (n == I) |
2031 | { |
2032 | return vshlq_n_u64(lhs, I); |
2033 | } |
2034 | else |
2035 | { |
2036 | return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2037 | } |
2038 | } |
2039 | |
2040 | template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 8> = 0> |
2041 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2042 | { |
2043 | if (n == I) |
2044 | { |
2045 | return vshlq_n_s64(lhs, I); |
2046 | } |
2047 | else |
2048 | { |
2049 | return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2050 | } |
2051 | } |
2052 | |
2053 | template <class A, class T, int... Is> |
2054 | inline batch<T, A> bitwise_lshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept |
2055 | { |
2056 | if (n == 0) |
2057 | { |
2058 | return lhs; |
2059 | } |
2060 | else |
2061 | { |
2062 | return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2063 | } |
2064 | } |
2065 | } |
2066 | |
2067 | template <class A, class T> |
2068 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept |
2069 | { |
2070 | constexpr int size = sizeof(typename batch<T, A>::value_type) * 8; |
2071 | assert(0 <= n && n < size && "index in bounds" ); |
2072 | return detail::bitwise_lshift_impl(lhs, n, ::xsimd::detail::make_int_sequence<size>()); |
2073 | } |
2074 | |
2075 | template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> |
2076 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept |
2077 | { |
2078 | return vshlq_u8(lhs, rhs); |
2079 | } |
2080 | |
2081 | template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> |
2082 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
2083 | { |
2084 | return vshlq_s8(lhs, rhs); |
2085 | } |
2086 | |
2087 | template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> |
2088 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept |
2089 | { |
2090 | return vshlq_u16(lhs, rhs); |
2091 | } |
2092 | |
2093 | template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> |
2094 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
2095 | { |
2096 | return vshlq_s16(lhs, rhs); |
2097 | } |
2098 | |
2099 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
2100 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept |
2101 | { |
2102 | return vshlq_u32(lhs, rhs); |
2103 | } |
2104 | |
2105 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
2106 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
2107 | { |
2108 | return vshlq_s32(lhs, rhs); |
2109 | } |
2110 | |
2111 | template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> |
2112 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept |
2113 | { |
2114 | return vshlq_u64(lhs, rhs); |
2115 | } |
2116 | |
2117 | template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> |
2118 | inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
2119 | { |
2120 | return vshlq_s64(lhs, rhs); |
2121 | } |
2122 | |
2123 | /****************** |
2124 | * bitwise_rshift * |
2125 | ******************/ |
2126 | |
2127 | namespace detail |
2128 | { |
2129 | template <class A, class T> |
2130 | inline batch<T, A> bitwise_rshift(batch<T, A> const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept |
2131 | { |
2132 | assert(false && "bitwise_rshift out of bounds" ); |
2133 | return batch<T, A> {}; |
2134 | } |
2135 | |
2136 | template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 1> = 0> |
2137 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2138 | { |
2139 | if (n == I) |
2140 | { |
2141 | return vshrq_n_u8(lhs, I); |
2142 | } |
2143 | else |
2144 | { |
2145 | return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2146 | } |
2147 | } |
2148 | |
2149 | template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 1> = 0> |
2150 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2151 | { |
2152 | if (n == I) |
2153 | { |
2154 | return vshrq_n_s8(lhs, I); |
2155 | } |
2156 | else |
2157 | { |
2158 | return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2159 | } |
2160 | } |
2161 | |
2162 | template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 2> = 0> |
2163 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2164 | { |
2165 | if (n == I) |
2166 | { |
2167 | return vshrq_n_u16(lhs, I); |
2168 | } |
2169 | else |
2170 | { |
2171 | return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2172 | } |
2173 | } |
2174 | |
2175 | template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 2> = 0> |
2176 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2177 | { |
2178 | if (n == I) |
2179 | { |
2180 | return vshrq_n_s16(lhs, I); |
2181 | } |
2182 | else |
2183 | { |
2184 | return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2185 | } |
2186 | } |
2187 | |
2188 | template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 4> = 0> |
2189 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2190 | { |
2191 | if (n == I) |
2192 | { |
2193 | return vshrq_n_u32(lhs, I); |
2194 | } |
2195 | else |
2196 | { |
2197 | return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2198 | } |
2199 | } |
2200 | |
2201 | template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 4> = 0> |
2202 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2203 | { |
2204 | if (n == I) |
2205 | { |
2206 | return vshrq_n_s32(lhs, I); |
2207 | } |
2208 | else |
2209 | { |
2210 | return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2211 | } |
2212 | } |
2213 | |
2214 | template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 8> = 0> |
2215 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2216 | { |
2217 | if (n == I) |
2218 | { |
2219 | return vshrq_n_u64(lhs, I); |
2220 | } |
2221 | else |
2222 | { |
2223 | return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2224 | } |
2225 | } |
2226 | |
2227 | template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 8> = 0> |
2228 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept |
2229 | { |
2230 | if (n == I) |
2231 | { |
2232 | return vshrq_n_s64(lhs, I); |
2233 | } |
2234 | else |
2235 | { |
2236 | return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2237 | } |
2238 | } |
2239 | |
2240 | template <class A, class T, int... Is> |
2241 | inline batch<T, A> bitwise_rshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept |
2242 | { |
2243 | if (n == 0) |
2244 | { |
2245 | return lhs; |
2246 | } |
2247 | else |
2248 | { |
2249 | return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); |
2250 | } |
2251 | } |
2252 | } |
2253 | |
2254 | template <class A, class T> |
2255 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept |
2256 | { |
2257 | constexpr int size = sizeof(typename batch<T, A>::value_type) * 8; |
2258 | assert(0 <= n && n < size && "index in bounds" ); |
2259 | return detail::bitwise_rshift_impl(lhs, n, ::xsimd::detail::make_int_sequence<size>()); |
2260 | } |
2261 | |
2262 | template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> |
2263 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept |
2264 | { |
2265 | return vshlq_u8(lhs, vnegq_s8(rhs)); |
2266 | } |
2267 | |
2268 | template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> |
2269 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
2270 | { |
2271 | return vshlq_s8(lhs, vnegq_s8(rhs)); |
2272 | } |
2273 | |
2274 | template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> |
2275 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept |
2276 | { |
2277 | return vshlq_u16(lhs, vnegq_s16(rhs)); |
2278 | } |
2279 | |
2280 | template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> |
2281 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
2282 | { |
2283 | return vshlq_s16(lhs, vnegq_s16(rhs)); |
2284 | } |
2285 | |
2286 | template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> |
2287 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept |
2288 | { |
2289 | return vshlq_u32(lhs, vnegq_s32(rhs)); |
2290 | } |
2291 | |
2292 | template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> |
2293 | inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
2294 | { |
2295 | return vshlq_s32(lhs, vnegq_s32(rhs)); |
2296 | } |
2297 | |
2298 | // Overloads of bitwise shifts accepting two batches of uint64/int64 are not available with ARMv7 |
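// A possible ARMv7-compatible emulation (illustrative sketch only, not one
// of the dispatched kernels; the helper name is hypothetical): negate the
// per-lane shift amounts with vsubq_s64 and reuse the variable left-shift
// intrinsic, e.g.
//
//     inline uint64x2_t rshift_u64_sketch(uint64x2_t lhs, int64x2_t rhs) noexcept
//     {
//         return vshlq_u64(lhs, vsubq_s64(vdupq_n_s64(0), rhs));
//     }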
2299 | |
2300 | /******* |
2301 | * all * |
2302 | *******/ |
2303 | |
2304 | template <class A, class T, detail::enable_sized_t<T, 8> = 0> |
2305 | inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
2306 | { |
2307 | uint64x1_t tmp = vand_u64(vget_low_u64(arg), vget_high_u64(arg)); |
2308 | return vget_lane_u64(tmp, 0) == ~0ULL; |
2309 | } |
2310 | |
2311 | template <class A, class T, detail::enable_sized_t<T, 1> = 0> |
2312 | inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
2313 | { |
2314 | return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u8(arg)), neon {}); |
2315 | } |
2316 | |
2317 | template <class A, class T, detail::enable_sized_t<T, 2> = 0> |
2318 | inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
2319 | { |
2320 | return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u16(arg)), neon {}); |
2321 | } |
2322 | |
2323 | template <class A, class T, detail::enable_sized_t<T, 4> = 0> |
2324 | inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
2325 | { |
2326 | return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {}); |
2327 | } |
2328 | |
2329 | /******* |
2330 | * any * |
2331 | *******/ |
2332 | |
2333 | template <class A, class T, detail::enable_sized_t<T, 8> = 0> |
2334 | inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
2335 | { |
2336 | uint32x2_t tmp = vqmovn_u64(arg); |
2337 | return vget_lane_u64(vreinterpret_u64_u32(tmp), 0) != 0; |
2338 | } |
2339 | |
2340 | template <class A, class T, detail::enable_sized_t<T, 1> = 0> |
2341 | inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
2342 | { |
2343 | return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u8(arg)), neon {}); |
2344 | } |
2345 | |
2346 | template <class A, class T, detail::enable_sized_t<T, 2> = 0> |
2347 | inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
2348 | { |
2349 | return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u16(arg)), neon {}); |
2350 | } |
2351 | |
2352 | template <class A, class T, detail::enable_sized_t<T, 4> = 0> |
2353 | inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept |
2354 | { |
2355 | return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {}); |
2356 | } |
2357 | |
2358 | /**************** |
2359 | * bitwise_cast * |
2360 | ****************/ |
2361 | |
2362 | #define WRAP_CAST(SUFFIX, TYPE) \ |
2363 | namespace wrap \ |
2364 | { \ |
2365 | inline TYPE vreinterpretq_##SUFFIX##_u8(uint8x16_t a) noexcept \ |
2366 | { \ |
2367 | return ::vreinterpretq_##SUFFIX##_u8(a); \ |
2368 | } \ |
2369 | inline TYPE vreinterpretq_##SUFFIX##_s8(int8x16_t a) noexcept \ |
2370 | { \ |
2371 | return ::vreinterpretq_##SUFFIX##_s8(a); \ |
2372 | } \ |
2373 | inline TYPE vreinterpretq_##SUFFIX##_u16(uint16x8_t a) noexcept \ |
2374 | { \ |
2375 | return ::vreinterpretq_##SUFFIX##_u16(a); \ |
2376 | } \ |
2377 | inline TYPE vreinterpretq_##SUFFIX##_s16(int16x8_t a) noexcept \ |
2378 | { \ |
2379 | return ::vreinterpretq_##SUFFIX##_s16(a); \ |
2380 | } \ |
2381 | inline TYPE vreinterpretq_##SUFFIX##_u32(uint32x4_t a) noexcept \ |
2382 | { \ |
2383 | return ::vreinterpretq_##SUFFIX##_u32(a); \ |
2384 | } \ |
2385 | inline TYPE vreinterpretq_##SUFFIX##_s32(int32x4_t a) noexcept \ |
2386 | { \ |
2387 | return ::vreinterpretq_##SUFFIX##_s32(a); \ |
2388 | } \ |
2389 | inline TYPE vreinterpretq_##SUFFIX##_u64(uint64x2_t a) noexcept \ |
2390 | { \ |
2391 | return ::vreinterpretq_##SUFFIX##_u64(a); \ |
2392 | } \ |
2393 | inline TYPE vreinterpretq_##SUFFIX##_s64(int64x2_t a) noexcept \ |
2394 | { \ |
2395 | return ::vreinterpretq_##SUFFIX##_s64(a); \ |
2396 | } \ |
2397 | inline TYPE vreinterpretq_##SUFFIX##_f32(float32x4_t a) noexcept \ |
2398 | { \ |
2399 | return ::vreinterpretq_##SUFFIX##_f32(a); \ |
2400 | } \ |
2401 | } |
2402 | |
2403 | WRAP_CAST(u8, uint8x16_t) |
2404 | WRAP_CAST(s8, int8x16_t) |
2405 | WRAP_CAST(u16, uint16x8_t) |
2406 | WRAP_CAST(s16, int16x8_t) |
2407 | WRAP_CAST(u32, uint32x4_t) |
2408 | WRAP_CAST(s32, int32x4_t) |
2409 | WRAP_CAST(u64, uint64x2_t) |
2410 | WRAP_CAST(s64, int64x2_t) |
2411 | WRAP_CAST(f32, float32x4_t) |
2412 | |
2413 | #undef WRAP_CAST |
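// bitwise_cast is driven by a two-level table of the vreinterpretq wrappers
// above: the outer tuple is indexed by the destination register type and
// each inner bitwise_caster_impl by the source register type, so any of the
// nine NEON register types can be reinterpreted as any other.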
2414 | |
2415 | namespace detail |
2416 | { |
2417 | template <class R, class... T> |
2418 | struct bitwise_caster_impl |
2419 | { |
2420 | using container_type = std::tuple<R (*)(T)...>; |
2421 | container_type m_func; |
2422 | |
2423 | template <class U> |
2424 | R apply(U rhs) const noexcept |
2425 | { |
2426 | using func_type = R (*)(U); |
2427 | auto func = xsimd::detail::get<func_type>(m_func); |
2428 | return func(rhs); |
2429 | } |
2430 | }; |
2431 | |
2432 | template <class R, class... T> |
2433 | inline const bitwise_caster_impl<R, T...> make_bitwise_caster_impl(R (*... arg)(T)) noexcept |
2434 | { |
2435 | return { std::make_tuple(arg...) }; |
2436 | } |
2437 | |
2438 | template <class... T> |
2439 | struct type_list |
2440 | { |
2441 | }; |
2442 | |
2443 | template <class RTL, class TTL> |
2444 | struct bitwise_caster; |
2445 | |
2446 | template <class... R, class... T> |
2447 | struct bitwise_caster<type_list<R...>, type_list<T...>> |
2448 | { |
2449 | using container_type = std::tuple<bitwise_caster_impl<R, T...>...>; |
2450 | container_type m_caster; |
2451 | |
2452 | template <class V, class U> |
2453 | V apply(U rhs) const noexcept |
2454 | { |
2455 | using caster_type = bitwise_caster_impl<V, T...>; |
2456 | auto caster = xsimd::detail::get<caster_type>(m_caster); |
2457 | return caster.apply(rhs); |
2458 | } |
2459 | }; |
2460 | |
2461 | template <class... T> |
2462 | using bitwise_caster_t = bitwise_caster<type_list<T...>, type_list<T...>>; |
2463 | |
2464 | using neon_bitwise_caster = bitwise_caster_t<uint8x16_t, int8x16_t, |
2465 | uint16x8_t, int16x8_t, |
2466 | uint32x4_t, int32x4_t, |
2467 | uint64x2_t, int64x2_t, |
2468 | float32x4_t>; |
2469 | } |
2470 | |
2471 | template <class A, class T, class R> |
2472 | inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<neon>) noexcept |
2473 | { |
2474 | const detail::neon_bitwise_caster caster = { |
std::make_tuple(
    detail::make_bitwise_caster_impl(wrap::vreinterpretq_u8_u8, wrap::vreinterpretq_u8_s8, wrap::vreinterpretq_u8_u16, wrap::vreinterpretq_u8_s16,
                                     wrap::vreinterpretq_u8_u32, wrap::vreinterpretq_u8_s32, wrap::vreinterpretq_u8_u64, wrap::vreinterpretq_u8_s64,
                                     wrap::vreinterpretq_u8_f32),
    detail::make_bitwise_caster_impl(wrap::vreinterpretq_s8_u8, wrap::vreinterpretq_s8_s8, wrap::vreinterpretq_s8_u16, wrap::vreinterpretq_s8_s16,
                                     wrap::vreinterpretq_s8_u32, wrap::vreinterpretq_s8_s32, wrap::vreinterpretq_s8_u64, wrap::vreinterpretq_s8_s64,
                                     wrap::vreinterpretq_s8_f32),
    detail::make_bitwise_caster_impl(wrap::vreinterpretq_u16_u8, wrap::vreinterpretq_u16_s8, wrap::vreinterpretq_u16_u16, wrap::vreinterpretq_u16_s16,
                                     wrap::vreinterpretq_u16_u32, wrap::vreinterpretq_u16_s32, wrap::vreinterpretq_u16_u64, wrap::vreinterpretq_u16_s64,
                                     wrap::vreinterpretq_u16_f32),
    detail::make_bitwise_caster_impl(wrap::vreinterpretq_s16_u8, wrap::vreinterpretq_s16_s8, wrap::vreinterpretq_s16_u16, wrap::vreinterpretq_s16_s16,
                                     wrap::vreinterpretq_s16_u32, wrap::vreinterpretq_s16_s32, wrap::vreinterpretq_s16_u64, wrap::vreinterpretq_s16_s64,
                                     wrap::vreinterpretq_s16_f32),
    detail::make_bitwise_caster_impl(wrap::vreinterpretq_u32_u8, wrap::vreinterpretq_u32_s8, wrap::vreinterpretq_u32_u16, wrap::vreinterpretq_u32_s16,
                                     wrap::vreinterpretq_u32_u32, wrap::vreinterpretq_u32_s32, wrap::vreinterpretq_u32_u64, wrap::vreinterpretq_u32_s64,
                                     wrap::vreinterpretq_u32_f32),
    detail::make_bitwise_caster_impl(wrap::vreinterpretq_s32_u8, wrap::vreinterpretq_s32_s8, wrap::vreinterpretq_s32_u16, wrap::vreinterpretq_s32_s16,
                                     wrap::vreinterpretq_s32_u32, wrap::vreinterpretq_s32_s32, wrap::vreinterpretq_s32_u64, wrap::vreinterpretq_s32_s64,
                                     wrap::vreinterpretq_s32_f32),
    detail::make_bitwise_caster_impl(wrap::vreinterpretq_u64_u8, wrap::vreinterpretq_u64_s8, wrap::vreinterpretq_u64_u16, wrap::vreinterpretq_u64_s16,
                                     wrap::vreinterpretq_u64_u32, wrap::vreinterpretq_u64_s32, wrap::vreinterpretq_u64_u64, wrap::vreinterpretq_u64_s64,
                                     wrap::vreinterpretq_u64_f32),
    detail::make_bitwise_caster_impl(wrap::vreinterpretq_s64_u8, wrap::vreinterpretq_s64_s8, wrap::vreinterpretq_s64_u16, wrap::vreinterpretq_s64_s16,
                                     wrap::vreinterpretq_s64_u32, wrap::vreinterpretq_s64_s32, wrap::vreinterpretq_s64_u64, wrap::vreinterpretq_s64_s64,
                                     wrap::vreinterpretq_s64_f32),
    detail::make_bitwise_caster_impl(wrap::vreinterpretq_f32_u8, wrap::vreinterpretq_f32_s8, wrap::vreinterpretq_f32_u16, wrap::vreinterpretq_f32_s16,
                                     wrap::vreinterpretq_f32_u32, wrap::vreinterpretq_f32_s32, wrap::vreinterpretq_f32_u64, wrap::vreinterpretq_f32_s64,
                                     wrap::vreinterpretq_f32_f32))
2503 | }; |
2504 | using src_register_type = typename batch<T, A>::register_type; |
2505 | using dst_register_type = typename batch<R, A>::register_type; |
2506 | return caster.apply<dst_register_type>(src_register_type(arg)); |
2507 | } |
2508 | |
2509 | /********* |
2510 | * isnan * |
2511 | *********/ |
2512 | |
2513 | template <class A> |
2514 | inline batch_bool<float, A> isnan(batch<float, A> const& arg, requires_arch<neon>) noexcept |
2515 | { |
2516 | return !(arg == arg); |
2517 | } |
2518 | |
2519 | // slide_left |
2520 | namespace detail |
2521 | { |
2522 | template <size_t N> |
2523 | struct slider_left |
2524 | { |
2525 | template <class A, class T> |
2526 | inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept |
2527 | { |
const auto left = vdupq_n_u8(0);
2529 | const auto right = bitwise_cast<batch<uint8_t, A>>(x).data; |
2530 | const batch<uint8_t, A> res(vextq_u8(left, right, 16 - N)); |
2531 | return bitwise_cast<batch<T, A>>(res); |
2532 | } |
2533 | }; |
2534 | |
2535 | template <> |
2536 | struct slider_left<0> |
2537 | { |
2538 | template <class A, class T> |
2539 | inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept |
2540 | { |
2541 | return x; |
2542 | } |
2543 | }; |
2544 | } // namespace detail |
2545 | |
2546 | template <size_t N, class A, class T> |
2547 | inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<neon>) noexcept |
2548 | { |
2549 | return detail::slider_left<N> {}(x, A {}); |
2550 | } |
2551 | |
2552 | // slide_right |
2553 | namespace detail |
2554 | { |
2555 | template <size_t N> |
2556 | struct slider_right |
2557 | { |
2558 | template <class A, class T> |
2559 | inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept |
2560 | { |
2561 | const auto left = bitwise_cast<batch<uint8_t, A>>(x).data; |
const auto right = vdupq_n_u8(0);
2563 | const batch<uint8_t, A> res(vextq_u8(left, right, N)); |
2564 | return bitwise_cast<batch<T, A>>(res); |
2565 | } |
2566 | }; |
2567 | |
2568 | template <> |
2569 | struct slider_right<16> |
2570 | { |
2571 | template <class A, class T> |
2572 | inline batch<T, A> operator()(batch<T, A> const&, requires_arch<neon>) noexcept |
2573 | { |
2574 | return batch<T, A> {}; |
2575 | } |
2576 | }; |
2577 | } // namespace detail |
2578 | |
2579 | template <size_t N, class A, class T> |
2580 | inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<neon>) noexcept |
2581 | { |
2582 | return detail::slider_right<N> {}(x, A {}); |
2583 | } |
2584 | } |
2585 | |
2586 | template <class batch_type, typename batch_type::value_type... Values> |
2587 | struct batch_constant; |
2588 | |
2589 | namespace kernel |
2590 | { |
2591 | /*********** |
2592 | * swizzle * |
2593 | ***********/ |
2594 | |
2595 | template <class A, class T, class I, I... idx> |
2596 | inline batch<T, A> swizzle(batch<T, A> const& self, |
2597 | batch_constant<batch<I, A>, idx...>, |
2598 | requires_arch<neon>) noexcept |
2599 | { |
2600 | static_assert(batch<T, A>::size == sizeof...(idx), "valid swizzle indices" ); |
2601 | std::array<T, batch<T, A>::size> data; |
2602 | self.store_aligned(data.data()); |
2603 | return set(batch<T, A>(), A(), data[idx]...); |
2604 | } |
2605 | } |
2606 | } |
2607 | |
2608 | #undef WRAP_BINARY_INT_EXCLUDING_64 |
2609 | #undef WRAP_BINARY_INT |
2610 | #undef WRAP_BINARY_FLOAT |
2611 | #undef WRAP_UNARY_INT_EXCLUDING_64 |
2612 | #undef WRAP_UNARY_INT |
2613 | #undef WRAP_UNARY_FLOAT |
2614 | |
2615 | #endif |
2616 | |