#ifndef SIMDJSON_ARM64_SIMD_H
#define SIMDJSON_ARM64_SIMD_H

#include "simdjson/base.h"
#include "simdjson/internal/simdprune_tables.h"
#include "simdjson/arm64/bitmanipulation.h"
#include <type_traits>


namespace simdjson {
namespace SIMDJSON_IMPLEMENTATION {
namespace {
namespace simd {

#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
namespace {
// Start of private section with Visual Studio workaround


/**
 * make_uint8x16_t initializes a SIMD register (uint8x16_t).
 * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...}
 * is not recognized under Visual Studio! This is a workaround.
 * Using a std::initializer_list<uint8_t> as a parameter resulted in
 * inefficient code. With the current approach, if the parameters are
 * compile-time constants, GNU GCC compiles it to ldr, the same as
 * uint8x16_t x = {1,2,3...}.
 * You should not use this function except for compile-time constants:
 * it is not efficient.
 */
simdjson_inline uint8x16_t make_uint8x16_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
                                           uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8,
                                           uint8_t x9, uint8_t x10, uint8_t x11, uint8_t x12,
                                           uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) {
  // Doing a load like so ends up generating worse code.
  // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
  //                      x9, x10,x11,x12,x13,x14,x15,x16};
  // return vld1q_u8(array);
  uint8x16_t x{};
  // incredibly, Visual Studio does not allow x[0] = x1
  x = vsetq_lane_u8(x1, x, 0);
  x = vsetq_lane_u8(x2, x, 1);
  x = vsetq_lane_u8(x3, x, 2);
  x = vsetq_lane_u8(x4, x, 3);
  x = vsetq_lane_u8(x5, x, 4);
  x = vsetq_lane_u8(x6, x, 5);
  x = vsetq_lane_u8(x7, x, 6);
  x = vsetq_lane_u8(x8, x, 7);
  x = vsetq_lane_u8(x9, x, 8);
  x = vsetq_lane_u8(x10, x, 9);
  x = vsetq_lane_u8(x11, x, 10);
  x = vsetq_lane_u8(x12, x, 11);
  x = vsetq_lane_u8(x13, x, 12);
  x = vsetq_lane_u8(x14, x, 13);
  x = vsetq_lane_u8(x15, x, 14);
  x = vsetq_lane_u8(x16, x, 15);
  return x;
}
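
// For example, to_bitmask() below builds its constant under Visual Studio as
//   make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
//                   0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80)
// where other compilers use the plain uint8x16_t aggregate initializer.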

simdjson_inline uint8x8_t make_uint8x8_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
                                         uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8) {
  uint8x8_t x{};
  x = vset_lane_u8(x1, x, 0);
  x = vset_lane_u8(x2, x, 1);
  x = vset_lane_u8(x3, x, 2);
  x = vset_lane_u8(x4, x, 3);
  x = vset_lane_u8(x5, x, 4);
  x = vset_lane_u8(x6, x, 5);
  x = vset_lane_u8(x7, x, 6);
  x = vset_lane_u8(x8, x, 7);
  return x;
}

// We have to do the same work for make_int8x16_t
simdjson_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_t x4,
                                         int8_t x5, int8_t x6, int8_t x7, int8_t x8,
                                         int8_t x9, int8_t x10, int8_t x11, int8_t x12,
                                         int8_t x13, int8_t x14, int8_t x15, int8_t x16) {
  // Doing a load like so ends up generating worse code.
  // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
  //                     x9, x10,x11,x12,x13,x14,x15,x16};
  // return vld1q_s8(array);
  int8x16_t x{};
  // incredibly, Visual Studio does not allow x[0] = x1
  x = vsetq_lane_s8(x1, x, 0);
  x = vsetq_lane_s8(x2, x, 1);
  x = vsetq_lane_s8(x3, x, 2);
  x = vsetq_lane_s8(x4, x, 3);
  x = vsetq_lane_s8(x5, x, 4);
  x = vsetq_lane_s8(x6, x, 5);
  x = vsetq_lane_s8(x7, x, 6);
  x = vsetq_lane_s8(x8, x, 7);
  x = vsetq_lane_s8(x9, x, 8);
  x = vsetq_lane_s8(x10, x, 9);
  x = vsetq_lane_s8(x11, x, 10);
  x = vsetq_lane_s8(x12, x, 11);
  x = vsetq_lane_s8(x13, x, 12);
  x = vsetq_lane_s8(x14, x, 13);
  x = vsetq_lane_s8(x15, x, 14);
  x = vsetq_lane_s8(x16, x, 15);
  return x;
}

// End of private section with Visual Studio workaround
} // namespace
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO


template<typename T>
struct simd8;

//
// Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
//
template<typename T, typename Mask=simd8<bool>>
struct base_u8 {
  uint8x16_t value;
  static const int SIZE = sizeof(value);

  // Conversion from/to SIMD register
  simdjson_inline base_u8(const uint8x16_t _value) : value(_value) {}
  simdjson_inline operator const uint8x16_t&() const { return this->value; }
  simdjson_inline operator uint8x16_t&() { return this->value; }

  // Bit operations
  simdjson_inline simd8<T> operator|(const simd8<T> other) const { return vorrq_u8(*this, other); }
  simdjson_inline simd8<T> operator&(const simd8<T> other) const { return vandq_u8(*this, other); }
  simdjson_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); }
  simdjson_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); }
  simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
  simdjson_inline simd8<T>& operator|=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
  simdjson_inline simd8<T>& operator&=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
  simdjson_inline simd8<T>& operator^=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }

  friend simdjson_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return vceqq_u8(lhs, rhs); }

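  // prev<N> shifts in the last N bytes of prev_chunk below the current bytes:
  // lane i of the result is lane (16 - N + i) of prev_chunk when i < N, and
  // lane (i - N) of *this otherwise, which lets callers look at the byte(s)
  // immediately preceding the current 16-byte block.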
  template<int N=1>
  simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
    return vextq_u8(prev_chunk, *this, 16 - N);
  }
};

// SIMD byte mask type (returned by things like eq and gt)
template<>
struct simd8<bool>: base_u8<bool> {
  typedef uint16_t bitmask_t;
  typedef uint32_t bitmask2_t;

  static simdjson_inline simd8<bool> splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); }

  simdjson_inline simd8(const uint8x16_t _value) : base_u8<bool>(_value) {}
  // False constructor
  simdjson_inline simd8() : simd8(vdupq_n_u8(0)) {}
  // Splat constructor
  simdjson_inline simd8(bool _value) : simd8(splat(_value)) {}

  // We return uint32_t instead of uint16_t because that seems to be more efficient for most
  // purposes (cutting it down to uint16_t costs performance in some compilers).
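  // The reduction below works as follows: each lane is first masked down to a
  // single bit (lane i keeps bit i mod 8), then three rounds of pairwise adds
  // (vpaddq_u8) fold lanes 0..7 into the low result byte and lanes 8..15 into
  // the high byte, yielding one bit per input lane in the low 16 bits.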
  simdjson_inline uint32_t to_bitmask() const {
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
    const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                                0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
#else
    const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
#endif
    auto minput = *this & bit_mask;
    uint8x16_t tmp = vpaddq_u8(minput, minput);
    tmp = vpaddq_u8(tmp, tmp);
    tmp = vpaddq_u8(tmp, tmp);
    return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
  }
  simdjson_inline bool any() const { return vmaxvq_u8(*this) != 0; }
};

// Unsigned bytes
template<>
struct simd8<uint8_t>: base_u8<uint8_t> {
  static simdjson_inline uint8x16_t splat(uint8_t _value) { return vmovq_n_u8(_value); }
  static simdjson_inline uint8x16_t zero() { return vdupq_n_u8(0); }
  static simdjson_inline uint8x16_t load(const uint8_t* values) { return vld1q_u8(values); }

  simdjson_inline simd8(const uint8x16_t _value) : base_u8<uint8_t>(_value) {}
  // Zero constructor
  simdjson_inline simd8() : simd8(zero()) {}
  // Array constructor
  simdjson_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
  // Splat constructor
  simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
  // Member-by-member initialization
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
  simdjson_inline simd8(
    uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
    uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
  ) : simd8(make_uint8x16_t(
    v0, v1, v2, v3, v4, v5, v6, v7,
    v8, v9, v10, v11, v12, v13, v14, v15
  )) {}
#else
  simdjson_inline simd8(
    uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
    uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
  ) : simd8(uint8x16_t{
    v0, v1, v2, v3, v4, v5, v6, v7,
    v8, v9, v10, v11, v12, v13, v14, v15
  }) {}
#endif

  // Repeat 16 values as many times as necessary (usually for lookup tables)
  simdjson_inline static simd8<uint8_t> repeat_16(
    uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
    uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
  ) {
    return simd8<uint8_t>(
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    );
  }

  // Store to array
  simdjson_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }

  // Saturated math
  simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return vqaddq_u8(*this, other); }
  simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return vqsubq_u8(*this, other); }

  // Addition/subtraction are the same for signed and unsigned
  simdjson_inline simd8<uint8_t> operator+(const simd8<uint8_t> other) const { return vaddq_u8(*this, other); }
  simdjson_inline simd8<uint8_t> operator-(const simd8<uint8_t> other) const { return vsubq_u8(*this, other); }
  simdjson_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other) { *this = *this + other; return *this; }
  simdjson_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other) { *this = *this - other; return *this; }

  // Order-specific operations
  simdjson_inline uint8_t max_val() const { return vmaxvq_u8(*this); }
  simdjson_inline uint8_t min_val() const { return vminvq_u8(*this); }
  simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return vmaxq_u8(*this, other); }
  simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return vminq_u8(*this, other); }
  simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return vcleq_u8(*this, other); }
  simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return vcgeq_u8(*this, other); }
  simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return vcltq_u8(*this, other); }
  simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return vcgtq_u8(*this, other); }
  // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's.
  simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this > other); }
  // Same as <, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's.
  simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this < other); }

  // Bit-specific operations
  simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return vtstq_u8(*this, bits); }
  simdjson_inline bool any_bits_set_anywhere() const { return this->max_val() != 0; }
  simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set_anywhere(); }
  template<int N>
  simdjson_inline simd8<uint8_t> shr() const { return vshrq_n_u8(*this, N); }
  template<int N>
  simdjson_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); }

  // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
  template<typename L>
  simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return lookup_table.apply_lookup_16_to(*this);
  }
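  // For example, a 16-entry nibble table can be applied to the low nibble of
  // every byte as (input & 0xF).lookup_16(table), where each value v in 0..15
  // is replaced by table[v]; input and table here stand for any simd8<uint8_t>
  // values.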

  // Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a bitset).
  // Passing a 0 value for mask would be equivalent to writing out every byte to output.
  // Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
  // get written.
  // Design consideration: it seems like a function with the signature
  // simd8<L> compress(uint16_t mask) would be sensible, but the AVX ISA
  // makes this kind of approach difficult.
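  // For example, with mask = 0x0005 (bits 0 and 2 set), bytes 0 and 2 are
  // dropped and bytes 1, 3, 4, ..., 15 are written contiguously to output,
  // so the first 14 output bytes are significant.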
  template<typename L>
  simdjson_inline void compress(uint16_t mask, L * output) const {
    using internal::thintable_epi8;
    using internal::BitsSetTable256mul2;
    using internal::pshufb_combine_table;
    // this particular implementation was inspired by work done by @animetosho
    // we do it in two steps, first 8 bytes and then second 8 bytes
    uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
    uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
    // next line just loads the 64-bit values thintable_epi8[mask1] and
    // thintable_epi8[mask2] into a 128-bit register, using only
    // two instructions on most compilers.
    uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
    uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
    // we increment by 0x08 the second half of the mask
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
    uint8x16_t inc = make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
#else
    uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
#endif
    shufmask = vaddq_u8(shufmask, inc);
    // this is the version "nearly pruned"
    uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
    // we still need to put the two halves together.
    // we compute the popcount of the first half:
    int pop1 = BitsSetTable256mul2[mask1];
    // then load the corresponding mask, what it does is to write
    // only the first pop1 bytes from the first 8 bytes, and then
    // it fills in with the bytes from the second 8 bytes + some filling
    // at the end.
    uint8x16_t compactmask = vld1q_u8(reinterpret_cast<const uint8_t *>(pshufb_combine_table + pop1 * 8));
    uint8x16_t answer = vqtbl1q_u8(pruned, compactmask);
    vst1q_u8(reinterpret_cast<uint8_t*>(output), answer);
  }

  // Copies all bytes corresponding to a 0 in the low half of the mask (interpreted as a
  // bitset) to output1, then those corresponding to a 0 in the high half to output2.
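  // Note that each of the two 8-byte stores below writes a full 8 bytes; only
  // the first 8 - count_ones(mask & 0xFF) bytes at output1 (and the first
  // 8 - count_ones(mask >> 8) bytes at output2) are significant.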
  template<typename L>
  simdjson_inline void compress_halves(uint16_t mask, L *output1, L *output2) const {
    using internal::thintable_epi8;
    uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
    uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
    uint8x8_t compactmask1 = vcreate_u8(thintable_epi8[mask1]);
    uint8x8_t compactmask2 = vcreate_u8(thintable_epi8[mask2]);
    // we increment by 0x08 the second half of the mask
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
    uint8x8_t inc = make_uint8x8_t(0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
#else
    uint8x8_t inc = {0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
#endif
    compactmask2 = vadd_u8(compactmask2, inc);
    // store each result (with the second store possibly overlapping the first)
    vst1_u8((uint8_t*)output1, vqtbl1_u8(*this, compactmask1));
    vst1_u8((uint8_t*)output2, vqtbl1_u8(*this, compactmask2));
  }

  template<typename L>
  simdjson_inline simd8<L> lookup_16(
      L replace0,  L replace1,  L replace2,  L replace3,
      L replace4,  L replace5,  L replace6,  L replace7,
      L replace8,  L replace9,  L replace10, L replace11,
      L replace12, L replace13, L replace14, L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
      replace0, replace1, replace2, replace3,
      replace4, replace5, replace6, replace7,
      replace8, replace9, replace10, replace11,
      replace12, replace13, replace14, replace15
    ));
  }

  template<typename T>
  simdjson_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) {
    return vqtbl1q_u8(*this, simd8<uint8_t>(original));
  }
};

// Signed bytes
template<>
struct simd8<int8_t> {
  int8x16_t value;

  static simdjson_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); }
  static simdjson_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
  static simdjson_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); }

  // Conversion from/to SIMD register
  simdjson_inline simd8(const int8x16_t _value) : value{_value} {}
  simdjson_inline operator const int8x16_t&() const { return this->value; }
  simdjson_inline operator int8x16_t&() { return this->value; }

  // Zero constructor
  simdjson_inline simd8() : simd8(zero()) {}
  // Splat constructor
  simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdjson_inline simd8(const int8_t* values) : simd8(load(values)) {}
  // Member-by-member initialization
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
  simdjson_inline simd8(
    int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
    int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
  ) : simd8(make_int8x16_t(
    v0, v1, v2, v3, v4, v5, v6, v7,
    v8, v9, v10, v11, v12, v13, v14, v15
  )) {}
#else
  simdjson_inline simd8(
    int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
    int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
  ) : simd8(int8x16_t{
    v0, v1, v2, v3, v4, v5, v6, v7,
    v8, v9, v10, v11, v12, v13, v14, v15
  }) {}
#endif
  // Repeat 16 values as many times as necessary (usually for lookup tables)
  simdjson_inline static simd8<int8_t> repeat_16(
    int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
    int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
  ) {
    return simd8<int8_t>(
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    );
  }

  // Store to array
  simdjson_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); }

  // Explicit conversion to/from unsigned
  //
  // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type.
  // In theory, we could check for this case with std::is_same and std::enable_if,
  // but it is relatively ugly and hard to read.
#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO
  simdjson_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
#endif
  simdjson_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }

  // Math
  simdjson_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); }
  simdjson_inline simd8<int8_t> operator-(const simd8<int8_t> other) const { return vsubq_s8(*this, other); }
  simdjson_inline simd8<int8_t>& operator+=(const simd8<int8_t> other) { *this = *this + other; return *this; }
  simdjson_inline simd8<int8_t>& operator-=(const simd8<int8_t> other) { *this = *this - other; return *this; }

  // Order-sensitive comparisons
  simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return vmaxq_s8(*this, other); }
  simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return vminq_s8(*this, other); }
  simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return vcgtq_s8(*this, other); }
  simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return vcltq_s8(*this, other); }
  simdjson_inline simd8<bool> operator==(const simd8<int8_t> other) const { return vceqq_s8(*this, other); }

  template<int N=1>
  simdjson_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const {
    return vextq_s8(prev_chunk, *this, 16 - N);
  }

  // Perform a lookup assuming no value is larger than 16
  template<typename L>
  simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return lookup_table.apply_lookup_16_to(*this);
  }
  template<typename L>
  simdjson_inline simd8<L> lookup_16(
      L replace0,  L replace1,  L replace2,  L replace3,
      L replace4,  L replace5,  L replace6,  L replace7,
      L replace8,  L replace9,  L replace10, L replace11,
      L replace12, L replace13, L replace14, L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
      replace0, replace1, replace2, replace3,
      replace4, replace5, replace6, replace7,
      replace8, replace9, replace10, replace11,
      replace12, replace13, replace14, replace15
    ));
  }

  template<typename T>
  simdjson_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) {
    return vqtbl1q_s8(*this, simd8<uint8_t>(original));
  }
};

template<typename T>
struct simd8x64 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
  static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
  const simd8<T> chunks[NUM_CHUNKS];

  simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
  simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
  simd8x64() = delete; // no default constructor allowed

  simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
  simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}

  simdjson_inline void store(T ptr[64]) const {
    this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
    this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
    this->chunks[2].store(ptr+sizeof(simd8<T>)*2);
    this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
  }

  simdjson_inline simd8<T> reduce_or() const {
    return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
  }

  simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
    uint64_t popcounts = vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0);
    // compute the prefix sum of the popcounts of each byte
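    // For example, if the eight popcount bytes are p0..p7 (p0 in the least
    // significant byte), byte k of offsets below is p0 + p1 + ... + pk, i.e.
    // the number of output bytes produced by all 8-byte halves up to and
    // including half k.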
    uint64_t offsets = popcounts * 0x0101010101010101;
    this->chunks[0].compress_halves(uint16_t(mask), output, &output[popcounts & 0xFF]);
    this->chunks[1].compress_halves(uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF], &output[(offsets >> 16) & 0xFF]);
    this->chunks[2].compress_halves(uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF], &output[(offsets >> 32) & 0xFF]);
    this->chunks[3].compress_halves(uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF], &output[(offsets >> 48) & 0xFF]);
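    // the top byte of offsets is the sum of all eight popcounts, i.e. the
    // number of significant bytes placed in output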
    return offsets >> 56;
  }

  simdjson_inline uint64_t to_bitmask() const {
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
    const uint8x16_t bit_mask = make_uint8x16_t(
      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
    );
#else
    const uint8x16_t bit_mask = {
      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
    };
#endif
    // Pairwise-add adjacent lanes, successively, so that each 16-byte comparison mask
    // collapses into two bytes, leaving the full 64-bit bitmask in the low eight bytes.
    uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
    uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);
    sum0 = vpaddq_u8(sum0, sum1);
    sum0 = vpaddq_u8(sum0, sum0);
    return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
  }

  simdjson_inline uint64_t eq(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(
      this->chunks[0] == mask,
      this->chunks[1] == mask,
      this->chunks[2] == mask,
      this->chunks[3] == mask
    ).to_bitmask();
  }

  simdjson_inline uint64_t lteq(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(
      this->chunks[0] <= mask,
      this->chunks[1] <= mask,
      this->chunks[2] <= mask,
      this->chunks[3] <= mask
    ).to_bitmask();
  }
}; // struct simd8x64<T>

} // namespace simd
} // unnamed namespace
} // namespace SIMDJSON_IMPLEMENTATION
} // namespace simdjson

#endif // SIMDJSON_ARM64_SIMD_H