#ifndef SIMDJSON_ARM64_SIMD_H
#define SIMDJSON_ARM64_SIMD_H

#include "simdjson/base.h"
#include "simdjson/internal/simdprune_tables.h"
#include "simdjson/arm64/bitmanipulation.h"
#include <type_traits>


namespace simdjson {
namespace SIMDJSON_IMPLEMENTATION {
namespace {
namespace simd {

#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
namespace {
// Start of private section with Visual Studio workaround


/**
 * make_uint8x16_t initializes a SIMD register (uint8x16_t).
 * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...}
 * is not recognized under Visual Studio! This is a workaround.
 * Using a std::initializer_list<uint8_t> as a parameter resulted in
 * inefficient code. With the current approach, if the parameters are
 * compile-time constants,
 * GNU GCC compiles it to ldr, the same as uint8x16_t x = {1,2,3...}.
 * You should not use this function except for compile-time constants:
 * it is not efficient.
 */
simdjson_inline uint8x16_t make_uint8x16_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
                                           uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8,
                                           uint8_t x9, uint8_t x10, uint8_t x11, uint8_t x12,
                                           uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) {
  // Doing a load like so ends up generating worse code.
  // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
  //                      x9, x10,x11,x12,x13,x14,x15,x16};
  // return vld1q_u8(array);
  uint8x16_t x{};
  // incredibly, Visual Studio does not allow x[0] = x1
  x = vsetq_lane_u8(x1, x, 0);
  x = vsetq_lane_u8(x2, x, 1);
  x = vsetq_lane_u8(x3, x, 2);
  x = vsetq_lane_u8(x4, x, 3);
  x = vsetq_lane_u8(x5, x, 4);
  x = vsetq_lane_u8(x6, x, 5);
  x = vsetq_lane_u8(x7, x, 6);
  x = vsetq_lane_u8(x8, x, 7);
  x = vsetq_lane_u8(x9, x, 8);
  x = vsetq_lane_u8(x10, x, 9);
  x = vsetq_lane_u8(x11, x, 10);
  x = vsetq_lane_u8(x12, x, 11);
  x = vsetq_lane_u8(x13, x, 12);
  x = vsetq_lane_u8(x14, x, 13);
  x = vsetq_lane_u8(x15, x, 14);
  x = vsetq_lane_u8(x16, x, 15);
  return x;
}

simdjson_inline uint8x8_t make_uint8x8_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
                                         uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8) {
  uint8x8_t x{};
  x = vset_lane_u8(x1, x, 0);
  x = vset_lane_u8(x2, x, 1);
  x = vset_lane_u8(x3, x, 2);
  x = vset_lane_u8(x4, x, 3);
  x = vset_lane_u8(x5, x, 4);
  x = vset_lane_u8(x6, x, 5);
  x = vset_lane_u8(x7, x, 6);
  x = vset_lane_u8(x8, x, 7);
  return x;
}

// We have to do the same work for make_int8x16_t
simdjson_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_t x4,
                                         int8_t x5, int8_t x6, int8_t x7, int8_t x8,
                                         int8_t x9, int8_t x10, int8_t x11, int8_t x12,
                                         int8_t x13, int8_t x14, int8_t x15, int8_t x16) {
  // Doing a load like so ends up generating worse code.
  // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
  //                     x9, x10,x11,x12,x13,x14,x15,x16};
  // return vld1q_s8(array);
  int8x16_t x{};
  // incredibly, Visual Studio does not allow x[0] = x1
  x = vsetq_lane_s8(x1, x, 0);
  x = vsetq_lane_s8(x2, x, 1);
  x = vsetq_lane_s8(x3, x, 2);
  x = vsetq_lane_s8(x4, x, 3);
  x = vsetq_lane_s8(x5, x, 4);
  x = vsetq_lane_s8(x6, x, 5);
  x = vsetq_lane_s8(x7, x, 6);
  x = vsetq_lane_s8(x8, x, 7);
  x = vsetq_lane_s8(x9, x, 8);
  x = vsetq_lane_s8(x10, x, 9);
  x = vsetq_lane_s8(x11, x, 10);
  x = vsetq_lane_s8(x12, x, 11);
  x = vsetq_lane_s8(x13, x, 12);
  x = vsetq_lane_s8(x14, x, 13);
  x = vsetq_lane_s8(x15, x, 14);
  x = vsetq_lane_s8(x16, x, 15);
  return x;
}

// End of private section with Visual Studio workaround
} // namespace
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO


template<typename T>
struct simd8;

//
// Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
//
template<typename T, typename Mask=simd8<bool>>
struct base_u8 {
  uint8x16_t value;
  static const int SIZE = sizeof(value);

  // Conversion from/to SIMD register
  simdjson_inline base_u8(const uint8x16_t _value) : value(_value) {}
  simdjson_inline operator const uint8x16_t&() const { return this->value; }
  simdjson_inline operator uint8x16_t&() { return this->value; }

  // Bit operations
  simdjson_inline simd8<T> operator|(const simd8<T> other) const { return vorrq_u8(*this, other); }
  simdjson_inline simd8<T> operator&(const simd8<T> other) const { return vandq_u8(*this, other); }
  simdjson_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); }
  simdjson_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); }
  simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
  simdjson_inline simd8<T>& operator|=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
  simdjson_inline simd8<T>& operator&=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
  simdjson_inline simd8<T>& operator^=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }

  friend simdjson_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return vceqq_u8(lhs, rhs); }

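  // Bytes shifted back by N across the chunk boundary: vextq_u8 takes 16 consecutive bytes
  // starting at offset 16 - N of the concatenation {prev_chunk, *this}, i.e. the last N bytes
  // of the previous chunk followed by the first 16 - N bytes of this one.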
  template<int N=1>
  simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
    return vextq_u8(prev_chunk, *this, 16 - N);
  }
};

// SIMD byte mask type (returned by things like eq and gt)
template<>
struct simd8<bool>: base_u8<bool> {
  typedef uint16_t bitmask_t;
  typedef uint32_t bitmask2_t;

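  // -(!!_value) is 0xFF when _value is true and 0x00 when it is false, so every lane is
  // filled with the corresponding all-ones or all-zeroes mask byte.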
  static simdjson_inline simd8<bool> splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); }

  simdjson_inline simd8(const uint8x16_t _value) : base_u8<bool>(_value) {}
  // False constructor
  simdjson_inline simd8() : simd8(vdupq_n_u8(0)) {}
  // Splat constructor
  simdjson_inline simd8(bool _value) : simd8(splat(_value)) {}

  // We return uint32_t instead of uint16_t because that seems to be more efficient for most
  // purposes (cutting it down to uint16_t costs performance in some compilers).
  simdjson_inline uint32_t to_bitmask() const {
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
    const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                                0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
#else
    const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
#endif
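    // Each true lane keeps only its positional bit weight; three pairwise additions then fold
    // the 16 weighted bytes so that bytes 0 and 1 of tmp hold bits 0-7 and 8-15 of the
    // movemask, read back below as a little-endian uint16_t.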
    auto minput = *this & bit_mask;
    uint8x16_t tmp = vpaddq_u8(minput, minput);
    tmp = vpaddq_u8(tmp, tmp);
    tmp = vpaddq_u8(tmp, tmp);
    return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
  }
  simdjson_inline bool any() const { return vmaxvq_u8(*this) != 0; }
};

// Unsigned bytes
template<>
struct simd8<uint8_t>: base_u8<uint8_t> {
  static simdjson_inline uint8x16_t splat(uint8_t _value) { return vmovq_n_u8(_value); }
  static simdjson_inline uint8x16_t zero() { return vdupq_n_u8(0); }
  static simdjson_inline uint8x16_t load(const uint8_t* values) { return vld1q_u8(values); }

  simdjson_inline simd8(const uint8x16_t _value) : base_u8<uint8_t>(_value) {}
  // Zero constructor
  simdjson_inline simd8() : simd8(zero()) {}
  // Array constructor
  simdjson_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
  // Splat constructor
  simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
  // Member-by-member initialization
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
  simdjson_inline simd8(
    uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
    uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
  ) : simd8(make_uint8x16_t(
    v0, v1, v2, v3, v4, v5, v6, v7,
    v8, v9, v10,v11,v12,v13,v14,v15
  )) {}
#else
  simdjson_inline simd8(
    uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
    uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
  ) : simd8(uint8x16_t{
    v0, v1, v2, v3, v4, v5, v6, v7,
    v8, v9, v10,v11,v12,v13,v14,v15
  }) {}
#endif

  // Repeat 16 values as many times as necessary (usually for lookup tables)
  simdjson_inline static simd8<uint8_t> repeat_16(
    uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
    uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
  ) {
    return simd8<uint8_t>(
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10,v11,v12,v13,v14,v15
    );
  }

  // Store to array
  simdjson_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }

  // Saturated math
  simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return vqaddq_u8(*this, other); }
  simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return vqsubq_u8(*this, other); }

  // Addition/subtraction are the same for signed and unsigned
  simdjson_inline simd8<uint8_t> operator+(const simd8<uint8_t> other) const { return vaddq_u8(*this, other); }
  simdjson_inline simd8<uint8_t> operator-(const simd8<uint8_t> other) const { return vsubq_u8(*this, other); }
  simdjson_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other) { *this = *this + other; return *this; }
  simdjson_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other) { *this = *this - other; return *this; }

  // Order-specific operations
  simdjson_inline uint8_t max_val() const { return vmaxvq_u8(*this); }
  simdjson_inline uint8_t min_val() const { return vminvq_u8(*this); }
  simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return vmaxq_u8(*this, other); }
  simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return vminq_u8(*this, other); }
  simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return vcleq_u8(*this, other); }
  simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return vcgeq_u8(*this, other); }
  simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return vcltq_u8(*this, other); }
  simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return vcgtq_u8(*this, other); }
  // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's.
  simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this > other); }
  // Same as <, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's.
  simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this < other); }

  // Bit-specific operations
  simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return vtstq_u8(*this, bits); }
  simdjson_inline bool any_bits_set_anywhere() const { return this->max_val() != 0; }
  simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set_anywhere(); }
  template<int N>
  simdjson_inline simd8<uint8_t> shr() const { return vshrq_n_u8(*this, N); }
  template<int N>
  simdjson_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); }

  // Perform a lookup assuming each byte value is between 0 and 15 (undefined behavior for out of range values)
  template<typename L>
  simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return lookup_table.apply_lookup_16_to(*this);
  }
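  // Illustrative use (names are placeholders, not library constants): given
  //   simd8<uint8_t> nibbles = input & 0x0F;
  //   simd8<uint8_t> classes = nibbles.lookup_16(table);
  // lane i of classes is table[nibbles[i]], the usual way bytes are classified through small
  // per-nibble tables.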


  // Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a bitset).
  // Passing a 0 value for mask would be equivalent to writing out every byte to output.
  // Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
  // get written.
  // Design consideration: it seems like a function with the
  // signature simd8<L> compress(uint16_t mask) would be
  // sensible, but the AVX ISA makes this kind of approach difficult.
  template<typename L>
  simdjson_inline void compress(uint16_t mask, L * output) const {
    using internal::thintable_epi8;
    using internal::BitsSetTable256mul2;
    using internal::pshufb_combine_table;
    // this particular implementation was inspired by work done by @animetosho
    // we do it in two steps, first 8 bytes and then second 8 bytes
    uint8_t mask1 = uint8_t(mask); // least significant 8 bits
    uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
    // next line just loads the 64-bit values thintable_epi8[mask1] and
    // thintable_epi8[mask2] into a 128-bit register, using only
    // two instructions on most compilers.
    uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
    uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
    // we increment by 0x08 the second half of the mask
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
    uint8x16_t inc = make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
#else
    uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
#endif
    shufmask = vaddq_u8(shufmask, inc);
    // this is the version "nearly pruned"
    uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
    // we still need to put the two halves together.
    // we compute the popcount of the first half:
    int pop1 = BitsSetTable256mul2[mask1];
    // then load the corresponding mask: it writes only the first pop1 bytes
    // from the first 8 bytes, and then fills in with the bytes from
    // the second 8 bytes, plus some filling at the end.
    uint8x16_t compactmask = vld1q_u8(reinterpret_cast<const uint8_t *>(pshufb_combine_table + pop1 * 8));
    uint8x16_t answer = vqtbl1q_u8(pruned, compactmask);
    vst1q_u8(reinterpret_cast<uint8_t*>(output), answer);
  }
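
  // Illustrative sketch (hypothetical buffers, not library code): with
  //   uint8_t in[16] = {'a','b','c','d', /* ... */}, out[16];
  //   simd8<uint8_t>(in).compress(0x0002, out);
  // bit 1 of the mask is set, so byte 1 ('b') is dropped and out begins 'a','c','d',...;
  // only the first 16 - count_ones(mask) bytes of out are meaningful.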

  // Copies all bytes corresponding to a 0 in the low half of the mask (interpreted as a
  // bitset) to output1, then those corresponding to a 0 in the high half to output2.
  template<typename L>
  simdjson_inline void compress_halves(uint16_t mask, L *output1, L *output2) const {
    using internal::thintable_epi8;
    uint8_t mask1 = uint8_t(mask); // least significant 8 bits
    uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
    uint8x8_t compactmask1 = vcreate_u8(thintable_epi8[mask1]);
    uint8x8_t compactmask2 = vcreate_u8(thintable_epi8[mask2]);
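    // thintable_epi8[m] packs, into the low bytes of a 64-bit word, the indices of the zero
    // bits of m, so each 8-byte table lookup below gathers the kept bytes to the front.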
    // we increment by 0x08 the second half of the mask
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
    uint8x8_t inc = make_uint8x8_t(0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
#else
    uint8x8_t inc = {0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
#endif
    compactmask2 = vadd_u8(compactmask2, inc);
    // store each result (with the second store possibly overlapping the first)
    vst1_u8((uint8_t*)output1, vqtbl1_u8(*this, compactmask1));
    vst1_u8((uint8_t*)output2, vqtbl1_u8(*this, compactmask2));
  }

  template<typename L>
  simdjson_inline simd8<L> lookup_16(
      L replace0, L replace1, L replace2, L replace3,
      L replace4, L replace5, L replace6, L replace7,
      L replace8, L replace9, L replace10, L replace11,
      L replace12, L replace13, L replace14, L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
      replace0, replace1, replace2, replace3,
      replace4, replace5, replace6, replace7,
      replace8, replace9, replace10, replace11,
      replace12, replace13, replace14, replace15
    ));
  }

  template<typename T>
  simdjson_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) {
    return vqtbl1q_u8(*this, simd8<uint8_t>(original));
  }
};

// Signed bytes
template<>
struct simd8<int8_t> {
  int8x16_t value;

  static simdjson_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); }
  static simdjson_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
  static simdjson_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); }

  // Conversion from/to SIMD register
  simdjson_inline simd8(const int8x16_t _value) : value{_value} {}
  simdjson_inline operator const int8x16_t&() const { return this->value; }
  simdjson_inline operator int8x16_t&() { return this->value; }

  // Zero constructor
  simdjson_inline simd8() : simd8(zero()) {}
  // Splat constructor
  simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdjson_inline simd8(const int8_t* values) : simd8(load(values)) {}
  // Member-by-member initialization
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
  simdjson_inline simd8(
    int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
    int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
  ) : simd8(make_int8x16_t(
    v0, v1, v2, v3, v4, v5, v6, v7,
    v8, v9, v10,v11,v12,v13,v14,v15
  )) {}
#else
  simdjson_inline simd8(
    int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
    int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
  ) : simd8(int8x16_t{
    v0, v1, v2, v3, v4, v5, v6, v7,
    v8, v9, v10,v11,v12,v13,v14,v15
  }) {}
#endif
  // Repeat 16 values as many times as necessary (usually for lookup tables)
  simdjson_inline static simd8<int8_t> repeat_16(
    int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
    int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
  ) {
    return simd8<int8_t>(
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10,v11,v12,v13,v14,v15
    );
  }

  // Store to array
  simdjson_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); }

  // Explicit conversion to/from unsigned
  //
  // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type.
  // In theory, we could check this occurrence with std::is_same and std::enable_if, but that
  // is relatively ugly and hard to read.
#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO
  simdjson_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
#endif
  simdjson_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }

  // Math
  simdjson_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); }
  simdjson_inline simd8<int8_t> operator-(const simd8<int8_t> other) const { return vsubq_s8(*this, other); }
  simdjson_inline simd8<int8_t>& operator+=(const simd8<int8_t> other) { *this = *this + other; return *this; }
  simdjson_inline simd8<int8_t>& operator-=(const simd8<int8_t> other) { *this = *this - other; return *this; }

  // Order-sensitive comparisons
  simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return vmaxq_s8(*this, other); }
  simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return vminq_s8(*this, other); }
  simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return vcgtq_s8(*this, other); }
  simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return vcltq_s8(*this, other); }
  simdjson_inline simd8<bool> operator==(const simd8<int8_t> other) const { return vceqq_s8(*this, other); }

  template<int N=1>
  simdjson_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const {
    return vextq_s8(prev_chunk, *this, 16 - N);
  }

  // Perform a lookup assuming no index value is larger than 15
  template<typename L>
  simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return lookup_table.apply_lookup_16_to(*this);
  }
  template<typename L>
  simdjson_inline simd8<L> lookup_16(
      L replace0, L replace1, L replace2, L replace3,
      L replace4, L replace5, L replace6, L replace7,
      L replace8, L replace9, L replace10, L replace11,
      L replace12, L replace13, L replace14, L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
      replace0, replace1, replace2, replace3,
      replace4, replace5, replace6, replace7,
      replace8, replace9, replace10, replace11,
      replace12, replace13, replace14, replace15
    ));
  }

  template<typename T>
  simdjson_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) {
    return vqtbl1q_s8(*this, simd8<uint8_t>(original));
  }
};

template<typename T>
struct simd8x64 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
  static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
  const simd8<T> chunks[NUM_CHUNKS];

  simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
  simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
  simd8x64() = delete; // no default constructor allowed

  simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
  simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}

  simdjson_inline void store(T ptr[64]) const {
    this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
    this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
    this->chunks[2].store(ptr+sizeof(simd8<T>)*2);
    this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
  }

  simdjson_inline simd8<T> reduce_or() const {
    return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
  }


  simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
    uint64_t popcounts = vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0);
    // compute the prefix sum of the popcounts of each byte
    uint64_t offsets = popcounts * 0x0101010101010101;
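    // Byte k of offsets is the number of kept bytes in 8-byte groups 0 through k, i.e. the
    // output position where group k+1 must start; the top byte is the total number kept.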
    this->chunks[0].compress_halves(uint16_t(mask), output, &output[popcounts & 0xFF]);
    this->chunks[1].compress_halves(uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF], &output[(offsets >> 16) & 0xFF]);
    this->chunks[2].compress_halves(uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF], &output[(offsets >> 32) & 0xFF]);
    this->chunks[3].compress_halves(uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF], &output[(offsets >> 48) & 0xFF]);
    return offsets >> 56;
  }

  simdjson_inline uint64_t to_bitmask() const {
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
    const uint8x16_t bit_mask = make_uint8x16_t(
      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
    );
#else
    const uint8x16_t bit_mask = {
      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
    };
#endif
    // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
    uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
    uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);
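    // Two more pairwise additions fold each chunk's 16 weighted bytes into two, leaving the
    // 64-bit movemask (chunk 0's low byte first) in the lower 8 bytes of sum0.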
    sum0 = vpaddq_u8(sum0, sum1);
    sum0 = vpaddq_u8(sum0, sum0);
    return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
  }

  simdjson_inline uint64_t eq(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(
      this->chunks[0] == mask,
      this->chunks[1] == mask,
      this->chunks[2] == mask,
      this->chunks[3] == mask
    ).to_bitmask();
  }
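  // Illustrative use (assumes buf points to at least 64 readable bytes):
  //   simd8x64<uint8_t> block(buf);
  //   uint64_t quote_bits = block.eq('"'); // bit i is set when buf[i] == '"'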

  simdjson_inline uint64_t lteq(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(
      this->chunks[0] <= mask,
      this->chunks[1] <= mask,
      this->chunks[2] <= mask,
      this->chunks[3] <= mask
    ).to_bitmask();
  }
}; // struct simd8x64<T>

} // namespace simd
} // unnamed namespace
} // namespace SIMDJSON_IMPLEMENTATION
} // namespace simdjson

#endif // SIMDJSON_ARM64_SIMD_H