1 | #ifndef SIMDJSON_HASWELL_SIMD_H |
2 | #define SIMDJSON_HASWELL_SIMD_H |
3 | |
4 | #include "simdjson/portability.h" |
5 | |
6 | #ifdef IS_X86_64 |
7 | |
8 | #include "simdjson/common_defs.h" |
9 | #include "haswell/intrinsics.h" |
10 | |
11 | TARGET_HASWELL |
12 | namespace simdjson::haswell::simd { |
13 | |
14 | // Forward-declared so they can be used by splat and friends. |
15 | template<typename Child> |
16 | struct base { |
17 | __m256i value; |
18 | |
19 | // Zero constructor |
20 | really_inline base() : value{__m256i()} {} |
21 | |
22 | // Conversion from SIMD register |
23 | really_inline base(const __m256i _value) : value(_value) {} |
24 | |
25 | // Conversion to SIMD register |
26 | really_inline operator const __m256i&() const { return this->value; } |
27 | really_inline operator __m256i&() { return this->value; } |
28 | |
29 | // Bit operations |
30 | really_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); } |
31 | really_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); } |
32 | really_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); } |
33 | really_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); } |
34 | really_inline Child operator~() const { return *this ^ 0xFFu; } |
35 | really_inline Child& operator|=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast | other; return *this_cast; } |
36 | really_inline Child& operator&=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast & other; return *this_cast; } |
37 | really_inline Child& operator^=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast ^ other; return *this_cast; } |
38 | }; |
39 | |
// Declared ahead of its specializations: base8 (below) names simd8<bool> as
// its default mask type and simd8<T> as its CRTP child before any simd8
// specialization has been defined.
template<typename T> struct simd8;
43 | |
44 | template<typename T, typename Mask=simd8<bool>> |
45 | struct base8: base<simd8<T>> { |
46 | typedef uint32_t bitmask_t; |
47 | typedef uint64_t bitmask2_t; |
48 | |
49 | really_inline base8() : base<simd8<T>>() {} |
50 | really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {} |
51 | |
52 | really_inline Mask operator==(const simd8<T> other) const { return _mm256_cmpeq_epi8(*this, other); } |
53 | |
54 | static const int SIZE = sizeof(base<T>::value); |
55 | |
56 | template<int N=1> |
57 | really_inline simd8<T> prev(const simd8<T> prev_chunk) const { |
58 | return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N); |
59 | } |
60 | }; |
61 | |
62 | // SIMD byte mask type (returned by things like eq and gt) |
63 | template<> |
64 | struct simd8<bool>: base8<bool> { |
65 | static really_inline simd8<bool> splat(bool _value) { return _mm256_set1_epi8(-(!!_value)); } |
66 | |
67 | really_inline simd8<bool>() : base8() {} |
68 | really_inline simd8<bool>(const __m256i _value) : base8<bool>(_value) {} |
69 | // Splat constructor |
70 | really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {} |
71 | |
72 | really_inline int to_bitmask() const { return _mm256_movemask_epi8(*this); } |
73 | really_inline bool any() const { return !_mm256_testz_si256(*this, *this); } |
74 | }; |
75 | |
76 | template<typename T> |
77 | struct base8_numeric: base8<T> { |
78 | static really_inline simd8<T> splat(T _value) { return _mm256_set1_epi8(_value); } |
79 | static really_inline simd8<T> zero() { return _mm256_setzero_si256(); } |
80 | static really_inline simd8<T> load(const T values[32]) { |
81 | return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values)); |
82 | } |
83 | // Repeat 16 values as many times as necessary (usually for lookup tables) |
84 | static really_inline simd8<T> repeat_16( |
85 | T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, |
86 | T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 |
87 | ) { |
88 | return simd8<T>( |
89 | v0, v1, v2, v3, v4, v5, v6, v7, |
90 | v8, v9, v10,v11,v12,v13,v14,v15, |
91 | v0, v1, v2, v3, v4, v5, v6, v7, |
92 | v8, v9, v10,v11,v12,v13,v14,v15 |
93 | ); |
94 | } |
95 | |
96 | really_inline base8_numeric() : base8<T>() {} |
97 | really_inline base8_numeric(const __m256i _value) : base8<T>(_value) {} |
98 | |
99 | // Store to array |
100 | really_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); } |
101 | |
102 | // Addition/subtraction are the same for signed and unsigned |
103 | really_inline simd8<T> operator+(const simd8<T> other) const { return _mm256_add_epi8(*this, other); } |
104 | really_inline simd8<T> operator-(const simd8<T> other) const { return _mm256_sub_epi8(*this, other); } |
105 | really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *(simd8<T>*)this; } |
106 | really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *(simd8<T>*)this; } |
107 | |
108 | // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) |
109 | template<typename L> |
110 | really_inline simd8<L> lookup_16(simd8<L> lookup_table) const { |
111 | return _mm256_shuffle_epi8(lookup_table, *this); |
112 | } |
113 | template<typename L> |
114 | really_inline simd8<L> lookup_16( |
115 | L replace0, L replace1, L replace2, L replace3, |
116 | L replace4, L replace5, L replace6, L replace7, |
117 | L replace8, L replace9, L replace10, L replace11, |
118 | L replace12, L replace13, L replace14, L replace15) const { |
119 | return lookup_16(simd8<L>::repeat_16( |
120 | replace0, replace1, replace2, replace3, |
121 | replace4, replace5, replace6, replace7, |
122 | replace8, replace9, replace10, replace11, |
123 | replace12, replace13, replace14, replace15 |
124 | )); |
125 | } |
126 | }; |
127 | |
128 | // Signed bytes |
129 | template<> |
130 | struct simd8<int8_t> : base8_numeric<int8_t> { |
131 | really_inline simd8() : base8_numeric<int8_t>() {} |
132 | really_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {} |
133 | // Splat constructor |
134 | really_inline simd8(int8_t _value) : simd8(splat(_value)) {} |
135 | // Array constructor |
136 | really_inline simd8(const int8_t values[32]) : simd8(load(values)) {} |
137 | // Member-by-member initialization |
138 | really_inline simd8( |
139 | int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, |
140 | int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15, |
141 | int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23, |
142 | int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31 |
143 | ) : simd8(_mm256_setr_epi8( |
144 | v0, v1, v2, v3, v4, v5, v6, v7, |
145 | v8, v9, v10,v11,v12,v13,v14,v15, |
146 | v16,v17,v18,v19,v20,v21,v22,v23, |
147 | v24,v25,v26,v27,v28,v29,v30,v31 |
148 | )) {} |
149 | // Repeat 16 values as many times as necessary (usually for lookup tables) |
150 | really_inline static simd8<int8_t> repeat_16( |
151 | int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, |
152 | int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 |
153 | ) { |
154 | return simd8<int8_t>( |
155 | v0, v1, v2, v3, v4, v5, v6, v7, |
156 | v8, v9, v10,v11,v12,v13,v14,v15, |
157 | v0, v1, v2, v3, v4, v5, v6, v7, |
158 | v8, v9, v10,v11,v12,v13,v14,v15 |
159 | ); |
160 | } |
161 | |
162 | // Order-sensitive comparisons |
163 | really_inline simd8<int8_t> max(const simd8<int8_t> other) const { return _mm256_max_epi8(*this, other); } |
164 | really_inline simd8<int8_t> min(const simd8<int8_t> other) const { return _mm256_min_epi8(*this, other); } |
165 | really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(*this, other); } |
166 | really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(other, *this); } |
167 | }; |
168 | |
169 | // Unsigned bytes |
170 | template<> |
171 | struct simd8<uint8_t>: base8_numeric<uint8_t> { |
172 | really_inline simd8() : base8_numeric<uint8_t>() {} |
173 | really_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {} |
174 | // Splat constructor |
175 | really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} |
176 | // Array constructor |
177 | really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {} |
178 | // Member-by-member initialization |
179 | really_inline simd8( |
180 | uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, |
181 | uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15, |
182 | uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23, |
183 | uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31 |
184 | ) : simd8(_mm256_setr_epi8( |
185 | v0, v1, v2, v3, v4, v5, v6, v7, |
186 | v8, v9, v10,v11,v12,v13,v14,v15, |
187 | v16,v17,v18,v19,v20,v21,v22,v23, |
188 | v24,v25,v26,v27,v28,v29,v30,v31 |
189 | )) {} |
190 | // Repeat 16 values as many times as necessary (usually for lookup tables) |
191 | really_inline static simd8<uint8_t> repeat_16( |
192 | uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, |
193 | uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 |
194 | ) { |
195 | return simd8<uint8_t>( |
196 | v0, v1, v2, v3, v4, v5, v6, v7, |
197 | v8, v9, v10,v11,v12,v13,v14,v15, |
198 | v0, v1, v2, v3, v4, v5, v6, v7, |
199 | v8, v9, v10,v11,v12,v13,v14,v15 |
200 | ); |
201 | } |
202 | |
203 | // Saturated math |
204 | really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm256_adds_epu8(*this, other); } |
205 | really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm256_subs_epu8(*this, other); } |
206 | |
207 | // Order-specific operations |
208 | really_inline simd8<uint8_t> max(const simd8<uint8_t> other) const { return _mm256_max_epu8(*this, other); } |
209 | really_inline simd8<uint8_t> min(const simd8<uint8_t> other) const { return _mm256_min_epu8(other, *this); } |
210 | // Same as >, but only guarantees true is nonzero (< guarantees true = -1) |
211 | really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); } |
212 | // Same as <, but only guarantees true is nonzero (< guarantees true = -1) |
213 | really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); } |
214 | really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max(*this) == other; } |
215 | really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min(*this) == other; } |
216 | really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); } |
217 | really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); } |
218 | |
219 | // Bit-specific operations |
220 | really_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); } |
221 | really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); } |
222 | really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); } |
223 | really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); } |
224 | really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); } |
225 | really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } |
226 | really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm256_testz_si256(*this, bits); } |
227 | really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); } |
228 | template<int N> |
229 | really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } |
230 | template<int N> |
231 | really_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } |
232 | // Get one of the bits and make a bitmask out of it. |
233 | // e.g. value.get_bit<7>() gets the high bit |
234 | template<int N> |
235 | really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N)); } |
236 | }; |
237 | |
238 | template<typename T> |
239 | struct simd8x64 { |
240 | static const int NUM_CHUNKS = 64 / sizeof(simd8<T>); |
241 | const simd8<T> chunks[NUM_CHUNKS]; |
242 | |
243 | really_inline simd8x64() : chunks{simd8<T>(), simd8<T>()} {} |
244 | really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {} |
245 | really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+32)} {} |
246 | |
247 | template <typename F> |
248 | static really_inline void each_index(F const& each) { |
249 | each(0); |
250 | each(1); |
251 | } |
252 | |
253 | really_inline void store(T ptr[64]) const { |
254 | this->chunks[0].store(ptr+sizeof(simd8<T>)*0); |
255 | this->chunks[1].store(ptr+sizeof(simd8<T>)*1); |
256 | } |
257 | |
258 | template <typename F> |
259 | really_inline void each(F const& each_chunk) const |
260 | { |
261 | each_chunk(this->chunks[0]); |
262 | each_chunk(this->chunks[1]); |
263 | } |
264 | |
265 | template <typename R=bool, typename F> |
266 | really_inline simd8x64<R> map(F const& map_chunk) const { |
267 | return simd8x64<R>( |
268 | map_chunk(this->chunks[0]), |
269 | map_chunk(this->chunks[1]) |
270 | ); |
271 | } |
272 | |
273 | template <typename R=bool, typename F> |
274 | really_inline simd8x64<R> map(const simd8x64<uint8_t> b, F const& map_chunk) const { |
275 | return simd8x64<R>( |
276 | map_chunk(this->chunks[0], b.chunks[0]), |
277 | map_chunk(this->chunks[1], b.chunks[1]) |
278 | ); |
279 | } |
280 | |
281 | template <typename F> |
282 | really_inline simd8<T> reduce(F const& reduce_pair) const { |
283 | return reduce_pair(this->chunks[0], this->chunks[1]); |
284 | } |
285 | |
286 | really_inline uint64_t to_bitmask() const { |
287 | uint64_t r_lo = static_cast<uint32_t>(this->chunks[0].to_bitmask()); |
288 | uint64_t r_hi = this->chunks[1].to_bitmask(); |
289 | return r_lo | (r_hi << 32); |
290 | } |
291 | |
292 | really_inline simd8x64<T> bit_or(const T m) const { |
293 | const simd8<T> mask = simd8<T>::splat(m); |
294 | return this->map( [&](auto a) { return a | mask; } ); |
295 | } |
296 | |
297 | really_inline uint64_t eq(const T m) const { |
298 | const simd8<T> mask = simd8<T>::splat(m); |
299 | return this->map( [&](auto a) { return a == mask; } ).to_bitmask(); |
300 | } |
301 | |
302 | really_inline uint64_t lteq(const T m) const { |
303 | const simd8<T> mask = simd8<T>::splat(m); |
304 | return this->map( [&](auto a) { return a <= mask; } ).to_bitmask(); |
305 | } |
306 | |
307 | }; // struct simd8x64<T> |
308 | |
309 | } // namespace simdjson::haswell::simd |
310 | UNTARGET_REGION |
311 | |
312 | #endif // IS_X86_64 |
313 | #endif // SIMDJSON_HASWELL_SIMD_H |
314 | |