#ifndef SIMDJSON_ARM64_SIMD_H
#define SIMDJSON_ARM64_SIMD_H

#include "simdjson/base.h"
#include "simdjson/internal/simdprune_tables.h"
#include "simdjson/arm64/bitmanipulation.h"
#include <type_traits>

namespace simdjson {
namespace SIMDJSON_IMPLEMENTATION {
namespace {
namespace simd {
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
namespace {
// Start of private section with Visual Studio workaround

/**
 * make_uint8x16_t initializes a SIMD register (uint8x16_t).
 * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...}
 * is not recognized under Visual Studio! This is a workaround.
 * Using a std::initializer_list<uint8_t> as a parameter resulted in
 * inefficient code. With the current approach, if the parameters are
 * compile-time constants, GNU GCC compiles it to ldr, the same as
 * uint8x16_t x = {1,2,3...}.
 * You should not use this function except for compile-time constants:
 * it is not efficient.
 */
simdjson_inline uint8x16_t make_uint8x16_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
                                           uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8,
                                           uint8_t x9, uint8_t x10, uint8_t x11, uint8_t x12,
                                           uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) {
  // Doing a load like so ends up generating worse code.
  // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
  //                      x9, x10, x11, x12, x13, x14, x15, x16};
  // return vld1q_u8(array);
  uint8x16_t x{};
  // incredibly, Visual Studio does not allow x[0] = x1
  x = vsetq_lane_u8(x1, x, 0);
  x = vsetq_lane_u8(x2, x, 1);
  x = vsetq_lane_u8(x3, x, 2);
  x = vsetq_lane_u8(x4, x, 3);
  x = vsetq_lane_u8(x5, x, 4);
  x = vsetq_lane_u8(x6, x, 5);
  x = vsetq_lane_u8(x7, x, 6);
  x = vsetq_lane_u8(x8, x, 7);
  x = vsetq_lane_u8(x9, x, 8);
  x = vsetq_lane_u8(x10, x, 9);
  x = vsetq_lane_u8(x11, x, 10);
  x = vsetq_lane_u8(x12, x, 11);
  x = vsetq_lane_u8(x13, x, 12);
  x = vsetq_lane_u8(x14, x, 13);
  x = vsetq_lane_u8(x15, x, 14);
  x = vsetq_lane_u8(x16, x, 15);
  return x;
}

simdjson_inline uint8x8_t make_uint8x8_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
                                         uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8) {
  uint8x8_t x{};
  x = vset_lane_u8(x1, x, 0);
  x = vset_lane_u8(x2, x, 1);
  x = vset_lane_u8(x3, x, 2);
  x = vset_lane_u8(x4, x, 3);
  x = vset_lane_u8(x5, x, 4);
  x = vset_lane_u8(x6, x, 5);
  x = vset_lane_u8(x7, x, 6);
  x = vset_lane_u8(x8, x, 7);
  return x;
}

// We have to do the same work for make_int8x16_t
simdjson_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_t x4,
                                         int8_t x5, int8_t x6, int8_t x7, int8_t x8,
                                         int8_t x9, int8_t x10, int8_t x11, int8_t x12,
                                         int8_t x13, int8_t x14, int8_t x15, int8_t x16) {
  // Doing a load like so ends up generating worse code.
  // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
  //                     x9, x10, x11, x12, x13, x14, x15, x16};
  // return vld1q_s8(array);
  int8x16_t x{};
  // incredibly, Visual Studio does not allow x[0] = x1
  x = vsetq_lane_s8(x1, x, 0);
  x = vsetq_lane_s8(x2, x, 1);
  x = vsetq_lane_s8(x3, x, 2);
  x = vsetq_lane_s8(x4, x, 3);
  x = vsetq_lane_s8(x5, x, 4);
  x = vsetq_lane_s8(x6, x, 5);
  x = vsetq_lane_s8(x7, x, 6);
  x = vsetq_lane_s8(x8, x, 7);
  x = vsetq_lane_s8(x9, x, 8);
  x = vsetq_lane_s8(x10, x, 9);
  x = vsetq_lane_s8(x11, x, 10);
  x = vsetq_lane_s8(x12, x, 11);
  x = vsetq_lane_s8(x13, x, 12);
  x = vsetq_lane_s8(x14, x, 13);
  x = vsetq_lane_s8(x15, x, 14);
  x = vsetq_lane_s8(x16, x, 15);
  return x;
}

// End of private section with Visual Studio workaround
} // namespace
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO

  template<typename T>
  struct simd8;

  //
  // Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
  //
  template<typename T, typename Mask=simd8<bool>>
  struct base_u8 {
    uint8x16_t value;
    static const int SIZE = sizeof(value);

    // Conversion from/to SIMD register
    simdjson_inline base_u8(const uint8x16_t _value) : value(_value) {}
    simdjson_inline operator const uint8x16_t&() const { return this->value; }
    simdjson_inline operator uint8x16_t&() { return this->value; }

    // Bit operations
    simdjson_inline simd8<T> operator|(const simd8<T> other) const { return vorrq_u8(*this, other); }
    simdjson_inline simd8<T> operator&(const simd8<T> other) const { return vandq_u8(*this, other); }
    simdjson_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); }
    simdjson_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); }
    simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
    simdjson_inline simd8<T>& operator|=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
    simdjson_inline simd8<T>& operator&=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
    simdjson_inline simd8<T>& operator^=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }

    friend simdjson_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return vceqq_u8(lhs, rhs); }

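    // Each output byte is the input byte N positions earlier: the first N bytes
    // of the result come from the end of prev_chunk, the rest from the start of
    // *this. This lets callers pair each byte with its predecessor(s) across
    // 16-byte chunk boundaries.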
    template<int N=1>
    simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
      return vextq_u8(prev_chunk, *this, 16 - N);
    }
  };

  // SIMD byte mask type (returned by things like eq and gt)
  template<>
  struct simd8<bool>: base_u8<bool> {
    typedef uint16_t bitmask_t;
    typedef uint32_t bitmask2_t;

    static simdjson_inline simd8<bool> splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); }

    simdjson_inline simd8(const uint8x16_t _value) : base_u8<bool>(_value) {}
    // False constructor
    simdjson_inline simd8() : simd8(vdupq_n_u8(0)) {}
    // Splat constructor
    simdjson_inline simd8(bool _value) : simd8(splat(_value)) {}

    // We return uint32_t instead of uint16_t because that seems to be more efficient for most
    // purposes (cutting it down to uint16_t costs performance in some compilers).
    simdjson_inline uint32_t to_bitmask() const {
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
      const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                                                  0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
#else
      const uint8x16_t bit_mask = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                                   0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
#endif
      auto minput = *this & bit_mask;
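      // After the AND, lane i contributes bit (i % 8) to its byte of the mask.
      // The bits within each 8-lane group are disjoint, so the three pairwise
      // adds below act as ORs, folding the 16 bytes down until the full 16-bit
      // mask sits in the first two bytes.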
      uint8x16_t tmp = vpaddq_u8(minput, minput);
      tmp = vpaddq_u8(tmp, tmp);
      tmp = vpaddq_u8(tmp, tmp);
      return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
    }
    simdjson_inline bool any() const { return vmaxvq_u8(*this) != 0; }
  };
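
  // Typical usage, as a sketch (buf is a hypothetical pointer to at least 16
  // readable bytes):
  //   simd8<uint8_t> in = simd8<uint8_t>::load(buf);
  //   uint32_t quote_bits = (in == '"').to_bitmask();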

  // Unsigned bytes
  template<>
  struct simd8<uint8_t>: base_u8<uint8_t> {
    static simdjson_inline uint8x16_t splat(uint8_t _value) { return vmovq_n_u8(_value); }
    static simdjson_inline uint8x16_t zero() { return vdupq_n_u8(0); }
    static simdjson_inline uint8x16_t load(const uint8_t* values) { return vld1q_u8(values); }

    simdjson_inline simd8(const uint8x16_t _value) : base_u8<uint8_t>(_value) {}
    // Zero constructor
    simdjson_inline simd8() : simd8(zero()) {}
    // Array constructor
    simdjson_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
    // Splat constructor
    simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
    // Member-by-member initialization
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
    simdjson_inline simd8(
      uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
    ) : simd8(make_uint8x16_t(
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    )) {}
#else
    simdjson_inline simd8(
      uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
    ) : simd8(uint8x16_t{
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    }) {}
#endif

    // Repeat 16 values as many times as necessary (usually for lookup tables)
    simdjson_inline static simd8<uint8_t> repeat_16(
      uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
    ) {
      return simd8<uint8_t>(
        v0, v1, v2, v3, v4, v5, v6, v7,
        v8, v9, v10, v11, v12, v13, v14, v15
      );
    }

    // Store to array
    simdjson_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }

    // Saturated math
    simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return vqaddq_u8(*this, other); }
    simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return vqsubq_u8(*this, other); }

    // Addition/subtraction are the same for signed and unsigned
    simdjson_inline simd8<uint8_t> operator+(const simd8<uint8_t> other) const { return vaddq_u8(*this, other); }
    simdjson_inline simd8<uint8_t> operator-(const simd8<uint8_t> other) const { return vsubq_u8(*this, other); }
    simdjson_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other) { *this = *this + other; return *this; }
    simdjson_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other) { *this = *this - other; return *this; }

    // Order-specific operations
    simdjson_inline uint8_t max_val() const { return vmaxvq_u8(*this); }
    simdjson_inline uint8_t min_val() const { return vminvq_u8(*this); }
    simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return vmaxq_u8(*this, other); }
    simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return vminq_u8(*this, other); }
    simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return vcleq_u8(*this, other); }
    simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return vcgeq_u8(*this, other); }
    simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return vcltq_u8(*this, other); }
    simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return vcgtq_u8(*this, other); }
    // Same as >, but only guarantees that false = 0 and true = nonzero (not necessarily all 1's). On ARM, it does return all 1's.
    simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this > other); }
    // Same as <, but only guarantees that false = 0 and true = nonzero (not necessarily all 1's). On ARM, it does return all 1's.
    simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this < other); }

    // Bit-specific operations
    simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return vtstq_u8(*this, bits); }
    simdjson_inline bool any_bits_set_anywhere() const { return this->max_val() != 0; }
    simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set_anywhere(); }
    template<int N>
    simdjson_inline simd8<uint8_t> shr() const { return vshrq_n_u8(*this, N); }
    template<int N>
    simdjson_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); }
    // Perform a lookup assuming each value is between 0 and 15 (undefined behavior for out-of-range values)
    template<typename L>
    simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
      return lookup_table.apply_lookup_16_to(*this);
    }

    // Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a bitset).
    // Passing a 0 value for the mask would be equivalent to writing out every byte to output.
    // Only the first 16 - count_ones(mask) bytes of the result are significant, but 16 bytes
    // get written.
    // Design consideration: it seems like a function with the
    // signature simd8<L> compress(uint16_t mask) would be
    // sensible, but the AVX ISA makes this kind of approach difficult.
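    // Example: with mask == 0x0005, bits 0 and 2 are set, so input bytes 0 and 2
    // are dropped: bytes 1, 3, 4, ..., 15 are written out contiguously
    // (14 significant bytes, followed by filler).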
    template<typename L>
    simdjson_inline void compress(uint16_t mask, L * output) const {
      using internal::thintable_epi8;
      using internal::BitsSetTable256mul2;
      using internal::pshufb_combine_table;
      // this particular implementation was inspired by work done by @animetosho
      // we do it in two steps, first 8 bytes and then second 8 bytes
      uint8_t mask1 = uint8_t(mask); // least significant 8 bits
      uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
      // the next line just loads the 64-bit values thintable_epi8[mask1] and
      // thintable_epi8[mask2] into a 128-bit register, using only
      // two instructions on most compilers.
      uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
      uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
      // we increment by 0x08 the second half of the mask
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
      uint8x16_t inc = make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
#else
      uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
#endif
      shufmask = vaddq_u8(shufmask, inc);
      // this is the "nearly pruned" version
      uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
      // we still need to put the two halves together:
      // we compute the popcount of the first half,
      int pop1 = BitsSetTable256mul2[mask1];
      // then load the corresponding mask, which writes only the first pop1
      // bytes from the first 8 bytes and then fills in with the bytes from
      // the second 8 bytes, plus some filling at the end.
      uint8x16_t compactmask = vld1q_u8(reinterpret_cast<const uint8_t *>(pshufb_combine_table + pop1 * 8));
      uint8x16_t answer = vqtbl1q_u8(pruned, compactmask);
      vst1q_u8(reinterpret_cast<uint8_t*>(output), answer);
    }

    // Copies all bytes corresponding to a 0 in the low half of the mask (interpreted as a
    // bitset) to output1, then those corresponding to a 0 in the high half to output2.
    template<typename L>
    simdjson_inline void compress_halves(uint16_t mask, L *output1, L *output2) const {
      using internal::thintable_epi8;
      uint8_t mask1 = uint8_t(mask); // least significant 8 bits
      uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
      uint8x8_t compactmask1 = vcreate_u8(thintable_epi8[mask1]);
      uint8x8_t compactmask2 = vcreate_u8(thintable_epi8[mask2]);
      // we increment by 0x08 the second half of the mask
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
      uint8x8_t inc = make_uint8x8_t(0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
#else
      uint8x8_t inc = {0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
#endif
      compactmask2 = vadd_u8(compactmask2, inc);
      // store each result (with the second store possibly overlapping the first)
      vst1_u8((uint8_t*)output1, vqtbl1_u8(*this, compactmask1));
      vst1_u8((uint8_t*)output2, vqtbl1_u8(*this, compactmask2));
    }

    template<typename L>
    simdjson_inline simd8<L> lookup_16(
        L replace0, L replace1, L replace2, L replace3,
        L replace4, L replace5, L replace6, L replace7,
        L replace8, L replace9, L replace10, L replace11,
        L replace12, L replace13, L replace14, L replace15) const {
      return lookup_16(simd8<L>::repeat_16(
        replace0, replace1, replace2, replace3,
        replace4, replace5, replace6, replace7,
        replace8, replace9, replace10, replace11,
        replace12, replace13, replace14, replace15
      ));
    }

    template<typename T>
    simdjson_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) {
      return vqtbl1q_u8(*this, simd8<uint8_t>(original));
    }
  };

  // Signed bytes
  template<>
  struct simd8<int8_t> {
    int8x16_t value;

    static simdjson_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); }
    static simdjson_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
    static simdjson_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); }

    // Conversion from/to SIMD register
    simdjson_inline simd8(const int8x16_t _value) : value{_value} {}
    simdjson_inline operator const int8x16_t&() const { return this->value; }
    simdjson_inline operator int8x16_t&() { return this->value; }

    // Zero constructor
    simdjson_inline simd8() : simd8(zero()) {}
    // Splat constructor
    simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
    // Array constructor
    simdjson_inline simd8(const int8_t* values) : simd8(load(values)) {}
    // Member-by-member initialization
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
    simdjson_inline simd8(
      int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
    ) : simd8(make_int8x16_t(
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    )) {}
#else
    simdjson_inline simd8(
      int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
    ) : simd8(int8x16_t{
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    }) {}
#endif
    // Repeat 16 values as many times as necessary (usually for lookup tables)
    simdjson_inline static simd8<int8_t> repeat_16(
      int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
    ) {
      return simd8<int8_t>(
        v0, v1, v2, v3, v4, v5, v6, v7,
        v8, v9, v10, v11, v12, v13, v14, v15
      );
    }

    // Store to array
    simdjson_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); }

    // Explicit conversion to/from unsigned
    //
    // Under Visual Studio/ARM64, uint8x16_t and int8x16_t are apparently the same type.
    // In theory, we could detect this case with std::is_same and std::enable_if, but the
    // result is relatively ugly and hard to read.
#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO
    simdjson_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
#endif
    simdjson_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }

    // Math
    simdjson_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); }
    simdjson_inline simd8<int8_t> operator-(const simd8<int8_t> other) const { return vsubq_s8(*this, other); }
    simdjson_inline simd8<int8_t>& operator+=(const simd8<int8_t> other) { *this = *this + other; return *this; }
    simdjson_inline simd8<int8_t>& operator-=(const simd8<int8_t> other) { *this = *this - other; return *this; }

    // Order-sensitive comparisons
    simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return vmaxq_s8(*this, other); }
    simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return vminq_s8(*this, other); }
    simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return vcgtq_s8(*this, other); }
    simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return vcltq_s8(*this, other); }
    simdjson_inline simd8<bool> operator==(const simd8<int8_t> other) const { return vceqq_s8(*this, other); }

    template<int N=1>
    simdjson_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const {
      return vextq_s8(prev_chunk, *this, 16 - N);
    }

    // Perform a lookup assuming no value is larger than 15 (undefined behavior for out-of-range values)
    template<typename L>
    simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
      return lookup_table.apply_lookup_16_to(*this);
    }
    template<typename L>
    simdjson_inline simd8<L> lookup_16(
        L replace0, L replace1, L replace2, L replace3,
        L replace4, L replace5, L replace6, L replace7,
        L replace8, L replace9, L replace10, L replace11,
        L replace12, L replace13, L replace14, L replace15) const {
      return lookup_16(simd8<L>::repeat_16(
        replace0, replace1, replace2, replace3,
        replace4, replace5, replace6, replace7,
        replace8, replace9, replace10, replace11,
        replace12, replace13, replace14, replace15
      ));
    }

    template<typename T>
    simdjson_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) {
      return vqtbl1q_s8(*this, simd8<uint8_t>(original));
    }
  };

  template<typename T>
  struct simd8x64 {
    static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
    static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
    const simd8<T> chunks[NUM_CHUNKS];

    simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
    simd8x64<T>& operator=(const simd8x64<T>& other) = delete; // no assignment allowed
    simd8x64() = delete; // no default constructor allowed

    simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
    simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}

    simdjson_inline void store(T ptr[64]) const {
      this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
      this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
      this->chunks[2].store(ptr+sizeof(simd8<T>)*2);
      this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
    }

    simdjson_inline simd8<T> reduce_or() const {
      return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
    }

    simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
      uint64_t popcounts = vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0);
      // compute the prefix sum of the popcounts of each byte
      uint64_t offsets = popcounts * 0x0101010101010101;
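      // Byte k of 'popcounts' is the number of zero mask bits (kept bytes) in
      // 8-byte group k; the multiply above makes byte k of 'offsets' the sum of
      // the popcounts of groups 0..k, i.e. where the next group's output begins.
      // The top byte (offsets >> 56) is the total number of kept bytes.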
      this->chunks[0].compress_halves(uint16_t(mask), output, &output[popcounts & 0xFF]);
      this->chunks[1].compress_halves(uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF], &output[(offsets >> 16) & 0xFF]);
      this->chunks[2].compress_halves(uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF], &output[(offsets >> 32) & 0xFF]);
      this->chunks[3].compress_halves(uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF], &output[(offsets >> 48) & 0xFF]);
      return offsets >> 56;
    }

    simdjson_inline uint64_t to_bitmask() const {
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
      const uint8x16_t bit_mask = make_uint8x16_t(
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
      );
#else
      const uint8x16_t bit_mask = {
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
      };
#endif
      // Add adjacent pairs of bytes, successively, to pack the four 16-bit chunk masks into a single 64-bit mask.
      uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
      uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);
      sum0 = vpaddq_u8(sum0, sum1);
      sum0 = vpaddq_u8(sum0, sum0);
      return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
    }

    simdjson_inline uint64_t eq(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
      return simd8x64<bool>(
        this->chunks[0] == mask,
        this->chunks[1] == mask,
        this->chunks[2] == mask,
        this->chunks[3] == mask
      ).to_bitmask();
    }

    simdjson_inline uint64_t lteq(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
      return simd8x64<bool>(
        this->chunks[0] <= mask,
        this->chunks[1] <= mask,
        this->chunks[2] <= mask,
        this->chunks[3] <= mask
      ).to_bitmask();
    }
  }; // struct simd8x64<T>
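
  // Typical usage, as a sketch (buf is a hypothetical pointer to at least 64
  // readable bytes):
  //   simd8x64<uint8_t> in(buf);
  //   uint64_t backslash_bits = in.eq('\\');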

} // namespace simd
} // unnamed namespace
} // namespace SIMDJSON_IMPLEMENTATION
} // namespace simdjson

#endif // SIMDJSON_ARM64_SIMD_H