1 | #ifndef SIMDJSON_WESTMERE_SIMD_H |
2 | #define SIMDJSON_WESTMERE_SIMD_H |
3 | |
4 | #include "simdjson/portability.h" |
5 | |
6 | #ifdef IS_X86_64 |
7 | |
8 | #include "simdjson/common_defs.h" |
9 | #include "simdjson/simdjson.h" |
10 | #include "westmere/intrinsics.h" |
11 | |
12 | TARGET_WESTMERE |
13 | namespace simdjson::westmere::simd { |
14 | |
15 | template<typename Child> |
16 | struct base { |
17 | __m128i value; |
18 | |
19 | // Zero constructor |
20 | really_inline base() : value{__m128i()} {} |
21 | |
22 | // Conversion from SIMD register |
23 | really_inline base(const __m128i _value) : value(_value) {} |
24 | |
25 | // Conversion to SIMD register |
26 | really_inline operator const __m128i&() const { return this->value; } |
27 | really_inline operator __m128i&() { return this->value; } |
28 | |
29 | // Bit operations |
30 | really_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); } |
31 | really_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); } |
32 | really_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); } |
33 | really_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); } |
34 | really_inline Child operator~() const { return *this ^ 0xFFu; } |
35 | really_inline Child& operator|=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast | other; return *this_cast; } |
36 | really_inline Child& operator&=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast & other; return *this_cast; } |
37 | really_inline Child& operator^=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast ^ other; return *this_cast; } |
38 | }; |
39 | |
40 | // Forward-declared so they can be used by splat and friends. |
41 | template<typename T> |
42 | struct simd8; |
43 | |
44 | template<typename T, typename Mask=simd8<bool>> |
45 | struct base8: base<simd8<T>> { |
46 | typedef uint16_t bitmask_t; |
47 | typedef uint32_t bitmask2_t; |
48 | |
49 | really_inline base8() : base<simd8<T>>() {} |
50 | really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {} |
51 | |
52 | really_inline Mask operator==(const simd8<T> other) const { return _mm_cmpeq_epi8(*this, other); } |
53 | |
54 | static const int SIZE = sizeof(base<simd8<T>>::value); |
55 | |
56 | template<int N=1> |
57 | really_inline simd8<T> prev(const simd8<T> prev_chunk) const { |
58 | return _mm_alignr_epi8(*this, prev_chunk, 16 - N); |
59 | } |
60 | }; |
61 | |
62 | // SIMD byte mask type (returned by things like eq and gt) |
63 | template<> |
64 | struct simd8<bool>: base8<bool> { |
65 | static really_inline simd8<bool> splat(bool _value) { return _mm_set1_epi8(-(!!_value)); } |
66 | |
67 | really_inline simd8<bool>() : base8() {} |
68 | really_inline simd8<bool>(const __m128i _value) : base8<bool>(_value) {} |
69 | // Splat constructor |
70 | really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {} |
71 | |
72 | really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); } |
73 | really_inline bool any() const { return !_mm_testz_si128(*this, *this); } |
74 | }; |
75 | |
76 | template<typename T> |
77 | struct base8_numeric: base8<T> { |
78 | static really_inline simd8<T> splat(T _value) { return _mm_set1_epi8(_value); } |
79 | static really_inline simd8<T> zero() { return _mm_setzero_si128(); } |
80 | static really_inline simd8<T> load(const T values[16]) { |
81 | return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values)); |
82 | } |
83 | // Repeat 16 values as many times as necessary (usually for lookup tables) |
84 | static really_inline simd8<T> repeat_16( |
85 | T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, |
86 | T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 |
87 | ) { |
88 | return simd8<T>( |
89 | v0, v1, v2, v3, v4, v5, v6, v7, |
90 | v8, v9, v10,v11,v12,v13,v14,v15 |
91 | ); |
92 | } |
93 | |
94 | really_inline base8_numeric() : base8<T>() {} |
95 | really_inline base8_numeric(const __m128i _value) : base8<T>(_value) {} |
96 | |
97 | // Store to array |
98 | really_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); } |
99 | |
100 | // Addition/subtraction are the same for signed and unsigned |
101 | really_inline simd8<T> operator+(const simd8<T> other) const { return _mm_add_epi8(*this, other); } |
102 | really_inline simd8<T> operator-(const simd8<T> other) const { return _mm_sub_epi8(*this, other); } |
103 | really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *(simd8<T>*)this; } |
104 | really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *(simd8<T>*)this; } |
105 | |
106 | // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) |
107 | template<typename L> |
108 | really_inline simd8<L> lookup_16(simd8<L> lookup_table) const { |
109 | return _mm_shuffle_epi8(lookup_table, *this); |
110 | } |
111 | template<typename L> |
112 | really_inline simd8<L> lookup_16( |
113 | L replace0, L replace1, L replace2, L replace3, |
114 | L replace4, L replace5, L replace6, L replace7, |
115 | L replace8, L replace9, L replace10, L replace11, |
116 | L replace12, L replace13, L replace14, L replace15) const { |
117 | return lookup_16(simd8<L>::repeat_16( |
118 | replace0, replace1, replace2, replace3, |
119 | replace4, replace5, replace6, replace7, |
120 | replace8, replace9, replace10, replace11, |
121 | replace12, replace13, replace14, replace15 |
122 | )); |
123 | } |
124 | }; |
125 | |
126 | // Signed bytes |
127 | template<> |
128 | struct simd8<int8_t> : base8_numeric<int8_t> { |
129 | really_inline simd8() : base8_numeric<int8_t>() {} |
130 | really_inline simd8(const __m128i _value) : base8_numeric<int8_t>(_value) {} |
131 | // Splat constructor |
132 | really_inline simd8(int8_t _value) : simd8(splat(_value)) {} |
133 | // Array constructor |
134 | really_inline simd8(const int8_t* values) : simd8(load(values)) {} |
135 | // Member-by-member initialization |
136 | really_inline simd8( |
137 | int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, |
138 | int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 |
139 | ) : simd8(_mm_setr_epi8( |
140 | v0, v1, v2, v3, v4, v5, v6, v7, |
141 | v8, v9, v10,v11,v12,v13,v14,v15 |
142 | )) {} |
143 | // Repeat 16 values as many times as necessary (usually for lookup tables) |
144 | really_inline static simd8<int8_t> repeat_16( |
145 | int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, |
146 | int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 |
147 | ) { |
148 | return simd8<int8_t>( |
149 | v0, v1, v2, v3, v4, v5, v6, v7, |
150 | v8, v9, v10,v11,v12,v13,v14,v15 |
151 | ); |
152 | } |
153 | |
154 | // Order-sensitive comparisons |
155 | really_inline simd8<int8_t> max(const simd8<int8_t> other) const { return _mm_max_epi8(*this, other); } |
156 | really_inline simd8<int8_t> min(const simd8<int8_t> other) const { return _mm_min_epi8(*this, other); } |
157 | really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(*this, other); } |
158 | really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(other, *this); } |
159 | }; |
160 | |
161 | // Unsigned bytes |
162 | template<> |
163 | struct simd8<uint8_t>: base8_numeric<uint8_t> { |
164 | really_inline simd8() : base8_numeric<uint8_t>() {} |
165 | really_inline simd8(const __m128i _value) : base8_numeric<uint8_t>(_value) {} |
166 | // Splat constructor |
167 | really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} |
168 | // Array constructor |
169 | really_inline simd8(const uint8_t* values) : simd8(load(values)) {} |
170 | // Member-by-member initialization |
171 | really_inline simd8( |
172 | uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, |
173 | uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 |
174 | ) : simd8(_mm_setr_epi8( |
175 | v0, v1, v2, v3, v4, v5, v6, v7, |
176 | v8, v9, v10,v11,v12,v13,v14,v15 |
177 | )) {} |
178 | // Repeat 16 values as many times as necessary (usually for lookup tables) |
179 | really_inline static simd8<uint8_t> repeat_16( |
180 | uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, |
181 | uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 |
182 | ) { |
183 | return simd8<uint8_t>( |
184 | v0, v1, v2, v3, v4, v5, v6, v7, |
185 | v8, v9, v10,v11,v12,v13,v14,v15 |
186 | ); |
187 | } |
188 | |
189 | // Saturated math |
190 | really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm_adds_epu8(*this, other); } |
191 | really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm_subs_epu8(*this, other); } |
192 | |
193 | // Order-specific operations |
194 | really_inline simd8<uint8_t> max(const simd8<uint8_t> other) const { return _mm_max_epu8(*this, other); } |
195 | really_inline simd8<uint8_t> min(const simd8<uint8_t> other) const { return _mm_min_epu8(*this, other); } |
196 | // Same as >, but only guarantees true is nonzero (< guarantees true = -1) |
197 | really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); } |
198 | // Same as <, but only guarantees true is nonzero (< guarantees true = -1) |
199 | really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); } |
200 | really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max(*this) == other; } |
201 | really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min(*this) == other; } |
202 | really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); } |
203 | really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); } |
204 | |
205 | // Bit-specific operations |
206 | really_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); } |
207 | really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); } |
208 | really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); } |
209 | really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); } |
210 | really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); } |
211 | really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } |
212 | really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm_testz_si128(*this, bits); } |
213 | really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); } |
214 | template<int N> |
215 | really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } |
216 | template<int N> |
217 | really_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } |
218 | // Get one of the bits and make a bitmask out of it. |
219 | // e.g. value.get_bit<7>() gets the high bit |
220 | template<int N> |
221 | really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); } |
222 | }; |
223 | |
224 | template<typename T> |
225 | struct simd8x64 { |
226 | static const int NUM_CHUNKS = 64 / sizeof(simd8<T>); |
227 | const simd8<T> chunks[NUM_CHUNKS]; |
228 | |
229 | really_inline simd8x64() : chunks{simd8<T>(), simd8<T>(), simd8<T>(), simd8<T>()} {} |
230 | really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} |
231 | really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {} |
232 | |
233 | really_inline void store(T ptr[64]) const { |
234 | this->chunks[0].store(ptr+sizeof(simd8<T>)*0); |
235 | this->chunks[1].store(ptr+sizeof(simd8<T>)*1); |
236 | this->chunks[2].store(ptr+sizeof(simd8<T>)*2); |
237 | this->chunks[3].store(ptr+sizeof(simd8<T>)*3); |
238 | } |
239 | |
240 | template <typename F> |
241 | static really_inline void each_index(F const& each) { |
242 | each(0); |
243 | each(1); |
244 | each(2); |
245 | each(3); |
246 | } |
247 | |
248 | template <typename F> |
249 | really_inline void each(F const& each_chunk) const |
250 | { |
251 | each_chunk(this->chunks[0]); |
252 | each_chunk(this->chunks[1]); |
253 | each_chunk(this->chunks[2]); |
254 | each_chunk(this->chunks[3]); |
255 | } |
256 | |
257 | template <typename F, typename R=bool> |
258 | really_inline simd8x64<R> map(F const& map_chunk) const { |
259 | return simd8x64<R>( |
260 | map_chunk(this->chunks[0]), |
261 | map_chunk(this->chunks[1]), |
262 | map_chunk(this->chunks[2]), |
263 | map_chunk(this->chunks[3]) |
264 | ); |
265 | } |
266 | |
267 | template <typename F, typename R=bool> |
268 | really_inline simd8x64<R> map(const simd8x64<uint8_t> b, F const& map_chunk) const { |
269 | return simd8x64<R>( |
270 | map_chunk(this->chunks[0], b.chunks[0]), |
271 | map_chunk(this->chunks[1], b.chunks[1]), |
272 | map_chunk(this->chunks[2], b.chunks[2]), |
273 | map_chunk(this->chunks[3], b.chunks[3]) |
274 | ); |
275 | } |
276 | |
277 | template <typename F> |
278 | really_inline simd8<T> reduce(F const& reduce_pair) const { |
279 | return reduce_pair( |
280 | reduce_pair(this->chunks[0], this->chunks[1]), |
281 | reduce_pair(this->chunks[2], this->chunks[3]) |
282 | ); |
283 | } |
284 | |
285 | really_inline uint64_t to_bitmask() const { |
286 | uint64_t r0 = static_cast<uint32_t>(this->chunks[0].to_bitmask()); |
287 | uint64_t r1 = this->chunks[1].to_bitmask(); |
288 | uint64_t r2 = this->chunks[2].to_bitmask(); |
289 | uint64_t r3 = this->chunks[3].to_bitmask(); |
290 | return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); |
291 | } |
292 | |
293 | really_inline simd8x64<T> bit_or(const T m) const { |
294 | const simd8<T> mask = simd8<T>::splat(m); |
295 | return this->map( [&](auto a) { return a | mask; } ); |
296 | } |
297 | |
298 | really_inline uint64_t eq(const T m) const { |
299 | const simd8<T> mask = simd8<T>::splat(m); |
300 | return this->map( [&](auto a) { return a == mask; } ).to_bitmask(); |
301 | } |
302 | |
303 | really_inline uint64_t lteq(const T m) const { |
304 | const simd8<T> mask = simd8<T>::splat(m); |
305 | return this->map( [&](auto a) { return a <= mask; } ).to_bitmask(); |
306 | } |
307 | |
308 | }; // struct simd8x64<T> |
309 | |
310 | } // namespace simdjson::westmere::simd |
311 | UNTARGET_REGION |
312 | |
313 | #endif // IS_X86_64 |
#endif // SIMDJSON_WESTMERE_SIMD_H
315 | |