| 1 | #ifndef SIMDJSON_HASWELL_SIMD_H |
| 2 | #define SIMDJSON_HASWELL_SIMD_H |
| 3 | |
| 4 | #include "simdjson/portability.h" |
| 5 | |
| 6 | #ifdef IS_X86_64 |
| 7 | |
| 8 | #include "simdjson/common_defs.h" |
| 9 | #include "haswell/intrinsics.h" |
| 10 | |
| 11 | TARGET_HASWELL |
| 12 | namespace simdjson::haswell::simd { |
| 13 | |
| 14 | // Forward-declared so they can be used by splat and friends. |
| 15 | template<typename Child> |
| 16 | struct base { |
| 17 | __m256i value; |
| 18 | |
| 19 | // Zero constructor |
| 20 | really_inline base() : value{__m256i()} {} |
| 21 | |
| 22 | // Conversion from SIMD register |
| 23 | really_inline base(const __m256i _value) : value(_value) {} |
| 24 | |
| 25 | // Conversion to SIMD register |
| 26 | really_inline operator const __m256i&() const { return this->value; } |
| 27 | really_inline operator __m256i&() { return this->value; } |
| 28 | |
| 29 | // Bit operations |
| 30 | really_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); } |
| 31 | really_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); } |
| 32 | really_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); } |
| 33 | really_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); } |
| 34 | really_inline Child operator~() const { return *this ^ 0xFFu; } |
| 35 | really_inline Child& operator|=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast | other; return *this_cast; } |
| 36 | really_inline Child& operator&=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast & other; return *this_cast; } |
| 37 | really_inline Child& operator^=(const Child other) { auto this_cast = (Child*)this; *this_cast = *this_cast ^ other; return *this_cast; } |
| 38 | }; |
| 39 | |
| 40 | // Forward-declared so they can be used by splat and friends. |
| 41 | template<typename T> |
| 42 | struct simd8; |
| 43 | |
| 44 | template<typename T, typename Mask=simd8<bool>> |
| 45 | struct base8: base<simd8<T>> { |
| 46 | typedef uint32_t bitmask_t; |
| 47 | typedef uint64_t bitmask2_t; |
| 48 | |
| 49 | really_inline base8() : base<simd8<T>>() {} |
| 50 | really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {} |
| 51 | |
| 52 | really_inline Mask operator==(const simd8<T> other) const { return _mm256_cmpeq_epi8(*this, other); } |
| 53 | |
| 54 | static const int SIZE = sizeof(base<T>::value); |
| 55 | |
| 56 | template<int N=1> |
| 57 | really_inline simd8<T> prev(const simd8<T> prev_chunk) const { |
| 58 | return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N); |
| 59 | } |
| 60 | }; |
| 61 | |
| 62 | // SIMD byte mask type (returned by things like eq and gt) |
| 63 | template<> |
| 64 | struct simd8<bool>: base8<bool> { |
| 65 | static really_inline simd8<bool> splat(bool _value) { return _mm256_set1_epi8(-(!!_value)); } |
| 66 | |
| 67 | really_inline simd8<bool>() : base8() {} |
| 68 | really_inline simd8<bool>(const __m256i _value) : base8<bool>(_value) {} |
| 69 | // Splat constructor |
| 70 | really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {} |
| 71 | |
| 72 | really_inline int to_bitmask() const { return _mm256_movemask_epi8(*this); } |
| 73 | really_inline bool any() const { return !_mm256_testz_si256(*this, *this); } |
| 74 | }; |
| 75 | |
| 76 | template<typename T> |
| 77 | struct base8_numeric: base8<T> { |
| 78 | static really_inline simd8<T> splat(T _value) { return _mm256_set1_epi8(_value); } |
| 79 | static really_inline simd8<T> zero() { return _mm256_setzero_si256(); } |
| 80 | static really_inline simd8<T> load(const T values[32]) { |
| 81 | return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values)); |
| 82 | } |
| 83 | // Repeat 16 values as many times as necessary (usually for lookup tables) |
| 84 | static really_inline simd8<T> repeat_16( |
| 85 | T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, |
| 86 | T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 |
| 87 | ) { |
| 88 | return simd8<T>( |
| 89 | v0, v1, v2, v3, v4, v5, v6, v7, |
| 90 | v8, v9, v10,v11,v12,v13,v14,v15, |
| 91 | v0, v1, v2, v3, v4, v5, v6, v7, |
| 92 | v8, v9, v10,v11,v12,v13,v14,v15 |
| 93 | ); |
| 94 | } |
| 95 | |
| 96 | really_inline base8_numeric() : base8<T>() {} |
| 97 | really_inline base8_numeric(const __m256i _value) : base8<T>(_value) {} |
| 98 | |
| 99 | // Store to array |
| 100 | really_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); } |
| 101 | |
| 102 | // Addition/subtraction are the same for signed and unsigned |
| 103 | really_inline simd8<T> operator+(const simd8<T> other) const { return _mm256_add_epi8(*this, other); } |
| 104 | really_inline simd8<T> operator-(const simd8<T> other) const { return _mm256_sub_epi8(*this, other); } |
| 105 | really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *(simd8<T>*)this; } |
| 106 | really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *(simd8<T>*)this; } |
| 107 | |
| 108 | // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) |
| 109 | template<typename L> |
| 110 | really_inline simd8<L> lookup_16(simd8<L> lookup_table) const { |
| 111 | return _mm256_shuffle_epi8(lookup_table, *this); |
| 112 | } |
| 113 | template<typename L> |
| 114 | really_inline simd8<L> lookup_16( |
| 115 | L replace0, L replace1, L replace2, L replace3, |
| 116 | L replace4, L replace5, L replace6, L replace7, |
| 117 | L replace8, L replace9, L replace10, L replace11, |
| 118 | L replace12, L replace13, L replace14, L replace15) const { |
| 119 | return lookup_16(simd8<L>::repeat_16( |
| 120 | replace0, replace1, replace2, replace3, |
| 121 | replace4, replace5, replace6, replace7, |
| 122 | replace8, replace9, replace10, replace11, |
| 123 | replace12, replace13, replace14, replace15 |
| 124 | )); |
| 125 | } |
| 126 | }; |
| 127 | |
| 128 | // Signed bytes |
| 129 | template<> |
| 130 | struct simd8<int8_t> : base8_numeric<int8_t> { |
| 131 | really_inline simd8() : base8_numeric<int8_t>() {} |
| 132 | really_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {} |
| 133 | // Splat constructor |
| 134 | really_inline simd8(int8_t _value) : simd8(splat(_value)) {} |
| 135 | // Array constructor |
| 136 | really_inline simd8(const int8_t values[32]) : simd8(load(values)) {} |
| 137 | // Member-by-member initialization |
| 138 | really_inline simd8( |
| 139 | int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, |
| 140 | int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15, |
| 141 | int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23, |
| 142 | int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31 |
| 143 | ) : simd8(_mm256_setr_epi8( |
| 144 | v0, v1, v2, v3, v4, v5, v6, v7, |
| 145 | v8, v9, v10,v11,v12,v13,v14,v15, |
| 146 | v16,v17,v18,v19,v20,v21,v22,v23, |
| 147 | v24,v25,v26,v27,v28,v29,v30,v31 |
| 148 | )) {} |
| 149 | // Repeat 16 values as many times as necessary (usually for lookup tables) |
| 150 | really_inline static simd8<int8_t> repeat_16( |
| 151 | int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, |
| 152 | int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 |
| 153 | ) { |
| 154 | return simd8<int8_t>( |
| 155 | v0, v1, v2, v3, v4, v5, v6, v7, |
| 156 | v8, v9, v10,v11,v12,v13,v14,v15, |
| 157 | v0, v1, v2, v3, v4, v5, v6, v7, |
| 158 | v8, v9, v10,v11,v12,v13,v14,v15 |
| 159 | ); |
| 160 | } |
| 161 | |
| 162 | // Order-sensitive comparisons |
| 163 | really_inline simd8<int8_t> max(const simd8<int8_t> other) const { return _mm256_max_epi8(*this, other); } |
| 164 | really_inline simd8<int8_t> min(const simd8<int8_t> other) const { return _mm256_min_epi8(*this, other); } |
| 165 | really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(*this, other); } |
| 166 | really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(other, *this); } |
| 167 | }; |
| 168 | |
| 169 | // Unsigned bytes |
| 170 | template<> |
| 171 | struct simd8<uint8_t>: base8_numeric<uint8_t> { |
| 172 | really_inline simd8() : base8_numeric<uint8_t>() {} |
| 173 | really_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {} |
| 174 | // Splat constructor |
| 175 | really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} |
| 176 | // Array constructor |
| 177 | really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {} |
| 178 | // Member-by-member initialization |
| 179 | really_inline simd8( |
| 180 | uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, |
| 181 | uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15, |
| 182 | uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23, |
| 183 | uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31 |
| 184 | ) : simd8(_mm256_setr_epi8( |
| 185 | v0, v1, v2, v3, v4, v5, v6, v7, |
| 186 | v8, v9, v10,v11,v12,v13,v14,v15, |
| 187 | v16,v17,v18,v19,v20,v21,v22,v23, |
| 188 | v24,v25,v26,v27,v28,v29,v30,v31 |
| 189 | )) {} |
| 190 | // Repeat 16 values as many times as necessary (usually for lookup tables) |
| 191 | really_inline static simd8<uint8_t> repeat_16( |
| 192 | uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, |
| 193 | uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 |
| 194 | ) { |
| 195 | return simd8<uint8_t>( |
| 196 | v0, v1, v2, v3, v4, v5, v6, v7, |
| 197 | v8, v9, v10,v11,v12,v13,v14,v15, |
| 198 | v0, v1, v2, v3, v4, v5, v6, v7, |
| 199 | v8, v9, v10,v11,v12,v13,v14,v15 |
| 200 | ); |
| 201 | } |
| 202 | |
| 203 | // Saturated math |
| 204 | really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm256_adds_epu8(*this, other); } |
| 205 | really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm256_subs_epu8(*this, other); } |
| 206 | |
| 207 | // Order-specific operations |
| 208 | really_inline simd8<uint8_t> max(const simd8<uint8_t> other) const { return _mm256_max_epu8(*this, other); } |
| 209 | really_inline simd8<uint8_t> min(const simd8<uint8_t> other) const { return _mm256_min_epu8(other, *this); } |
| 210 | // Same as >, but only guarantees true is nonzero (< guarantees true = -1) |
| 211 | really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); } |
| 212 | // Same as <, but only guarantees true is nonzero (< guarantees true = -1) |
| 213 | really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); } |
| 214 | really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max(*this) == other; } |
| 215 | really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min(*this) == other; } |
| 216 | really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); } |
| 217 | really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); } |
| 218 | |
| 219 | // Bit-specific operations |
| 220 | really_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); } |
| 221 | really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); } |
| 222 | really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); } |
| 223 | really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); } |
| 224 | really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); } |
| 225 | really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } |
| 226 | really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm256_testz_si256(*this, bits); } |
| 227 | really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); } |
| 228 | template<int N> |
| 229 | really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } |
| 230 | template<int N> |
| 231 | really_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } |
| 232 | // Get one of the bits and make a bitmask out of it. |
| 233 | // e.g. value.get_bit<7>() gets the high bit |
| 234 | template<int N> |
| 235 | really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N)); } |
| 236 | }; |
| 237 | |
| 238 | template<typename T> |
| 239 | struct simd8x64 { |
| 240 | static const int NUM_CHUNKS = 64 / sizeof(simd8<T>); |
| 241 | const simd8<T> chunks[NUM_CHUNKS]; |
| 242 | |
| 243 | really_inline simd8x64() : chunks{simd8<T>(), simd8<T>()} {} |
| 244 | really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {} |
| 245 | really_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+32)} {} |
| 246 | |
| 247 | template <typename F> |
| 248 | static really_inline void each_index(F const& each) { |
| 249 | each(0); |
| 250 | each(1); |
| 251 | } |
| 252 | |
| 253 | really_inline void store(T ptr[64]) const { |
| 254 | this->chunks[0].store(ptr+sizeof(simd8<T>)*0); |
| 255 | this->chunks[1].store(ptr+sizeof(simd8<T>)*1); |
| 256 | } |
| 257 | |
| 258 | template <typename F> |
| 259 | really_inline void each(F const& each_chunk) const |
| 260 | { |
| 261 | each_chunk(this->chunks[0]); |
| 262 | each_chunk(this->chunks[1]); |
| 263 | } |
| 264 | |
| 265 | template <typename R=bool, typename F> |
| 266 | really_inline simd8x64<R> map(F const& map_chunk) const { |
| 267 | return simd8x64<R>( |
| 268 | map_chunk(this->chunks[0]), |
| 269 | map_chunk(this->chunks[1]) |
| 270 | ); |
| 271 | } |
| 272 | |
| 273 | template <typename R=bool, typename F> |
| 274 | really_inline simd8x64<R> map(const simd8x64<uint8_t> b, F const& map_chunk) const { |
| 275 | return simd8x64<R>( |
| 276 | map_chunk(this->chunks[0], b.chunks[0]), |
| 277 | map_chunk(this->chunks[1], b.chunks[1]) |
| 278 | ); |
| 279 | } |
| 280 | |
| 281 | template <typename F> |
| 282 | really_inline simd8<T> reduce(F const& reduce_pair) const { |
| 283 | return reduce_pair(this->chunks[0], this->chunks[1]); |
| 284 | } |
| 285 | |
| 286 | really_inline uint64_t to_bitmask() const { |
| 287 | uint64_t r_lo = static_cast<uint32_t>(this->chunks[0].to_bitmask()); |
| 288 | uint64_t r_hi = this->chunks[1].to_bitmask(); |
| 289 | return r_lo | (r_hi << 32); |
| 290 | } |
| 291 | |
| 292 | really_inline simd8x64<T> bit_or(const T m) const { |
| 293 | const simd8<T> mask = simd8<T>::splat(m); |
| 294 | return this->map( [&](auto a) { return a | mask; } ); |
| 295 | } |
| 296 | |
| 297 | really_inline uint64_t eq(const T m) const { |
| 298 | const simd8<T> mask = simd8<T>::splat(m); |
| 299 | return this->map( [&](auto a) { return a == mask; } ).to_bitmask(); |
| 300 | } |
| 301 | |
| 302 | really_inline uint64_t lteq(const T m) const { |
| 303 | const simd8<T> mask = simd8<T>::splat(m); |
| 304 | return this->map( [&](auto a) { return a <= mask; } ).to_bitmask(); |
| 305 | } |
| 306 | |
| 307 | }; // struct simd8x64<T> |
| 308 | |
| 309 | } // namespace simdjson::haswell::simd |
| 310 | UNTARGET_REGION |
| 311 | |
| 312 | #endif // IS_X86_64 |
| 313 | #endif // SIMDJSON_HASWELL_SIMD_H |
| 314 | |