simd.h source code [ClickHouse/contrib/simdjson/src/haswell/simd.h]

1	#ifndef SIMDJSON_HASWELL_SIMD_H
2	#define SIMDJSON_HASWELL_SIMD_H
3
4	#include "simdjson/portability.h"
5
6	#ifdef IS_X86_64
7
8	#include "simdjson/common_defs.h"
9	#include "haswell/intrinsics.h"
10
11	TARGET_HASWELL
12	namespace simdjson::haswell::simd {
13
14	// Forward-declared so they can be used by splat and friends.
15	template<typename Child>
16	struct base {
17	__m256i value;
18
19	// Zero constructor
20	really_inline base() : value{__m256i()} {}
21
22	// Conversion from SIMD register
23	really_inline base(const __m256i _value) : value(_value) {}
24
25	// Conversion to SIMD register
26	really_inline operator const __m256i&() const { return this->value; }
27	really_inline operator __m256i&() { return this->value; }
28
29	// Bit operations
30	really_inline Child operator\|(const Child other) const { return _mm256_or_si256(*this, other); }
31	really_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); }
32	really_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); }
33	really_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); }
34	really_inline Child operator~() const { return *this ^ `0xFFu`; }
35	really_inline Child& operator\|=(const Child other) { auto this_cast = (Child)this; this_cast = this_cast \| other; return* *this_cast; }
36	really_inline Child& operator&=(const Child other) { auto this_cast = (Child)this; this_cast = this_cast & other; return* *this_cast; }
37	really_inline Child& operator^=(const Child other) { auto this_cast = (Child)this; this_cast = this_cast ^ other; return* *this_cast; }
38	};
39
40	// Forward-declared so they can be used by splat and friends.
41	template<typename T>
42	struct simd8;
43
44	template<typename T, typename Mask=simd8<bool>>
45	struct base8: base<simd8<T>> {
46	typedef uint32_t bitmask_t;
47	typedef uint64_t bitmask2_t;
48
49	really_inline base8() : base<simd8<T>>() {}
50	really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}
51
52	really_inline Mask operator==(const simd8<T> other) const { return _mm256_cmpeq_epi8(*this, other); }
53
54	static const int SIZE = sizeof(base<T>::value);
55
56	template<int N=`1`>
57	really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
58	return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, `0x21`), `16` - N);
59	}
60	};
61
62	// SIMD byte mask type (returned by things like eq and gt)
63	template<>
64	struct simd8<bool>: base8<bool> {
65	static really_inline simd8<bool> splat(bool _value) { return _mm256_set1_epi8(-(!!_value)); }
66
67	really_inline simd8<bool>() : base8 () {}
68	really_inline simd8<bool>(const __m256i _value) : base8<bool>(_value) {}
69	// Splat constructor
70	really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {}
71
72	really_inline int to_bitmask() const { return _mm256_movemask_epi8(*this); }
73	really_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
74	};
75
76	template<typename T>
77	struct base8_numeric: base8<T> {
78	static really_inline simd8<T> splat(T _value) { return _mm256_set1_epi8(_value); }
79	static really_inline simd8<T> zero() { return _mm256_setzero_si256(); }
80	static really_inline simd8<T> load(const T values[`32`]) {
81	return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
82	}
83	// Repeat 16 values as many times as necessary (usually for lookup tables)
84	static really_inline simd8<T> repeat_16(
85	T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
86	T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
87	) {
88	return simd8<T>(
89	v0, v1, v2, v3, v4, v5, v6, v7,
90	v8, v9, v10,v11,v12,v13,v14,v15,
91	v0, v1, v2, v3, v4, v5, v6, v7,
92	v8, v9, v10,v11,v12,v13,v14,v15
93	);
94	}
95
96	really_inline base8_numeric() : base8<T>() {}
97	really_inline base8_numeric(const __m256i _value) : base8<T>(_value) {}
98
99	// Store to array
100	really_inline void store(T dst[`32`]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i >(dst), this); }
101
102	// Addition/subtraction are the same for signed and unsigned
103	really_inline simd8<T> operator+(const simd8<T> other) const { return _mm256_add_epi8(*this, other); }
104	really_inline simd8<T> operator-(const simd8<T> other) const { return _mm256_sub_epi8(*this, other); }
105	really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return (simd8<T>)this; }
106	really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return (simd8<T>)this; }
107
108	// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
109	template<typename L>
110	really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
111	return _mm256_shuffle_epi8(lookup_table, *this);
112	}
113	template<typename L>
114	really_inline simd8<L> lookup_16(
115	L replace0, L replace1, L replace2, L replace3,
116	L replace4, L replace5, L replace6, L replace7,
117	L replace8, L replace9, L replace10, L replace11,
118	L replace12, L replace13, L replace14, L replace15) const {
119	return lookup_16(simd8<L>::repeat_16(
120	replace0, replace1, replace2, replace3,
121	replace4, replace5, replace6, replace7,
122	replace8, replace9, replace10, replace11,
123	replace12, replace13, replace14, replace15
124	));
125	}
126	};
127
128	// Signed bytes
129	template<>
130	struct simd8<int8_t> : base8_numeric<int8_t> {
131	really_inline simd8() : base8_numeric<int8_t>() {}
132	really_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {}
133	// Splat constructor
134	really_inline simd8(int8_t _value) : simd8 (splat(_value)) {}
135	// Array constructor
136	really_inline simd8(const int8_t values[`32`]) : simd8 (load(values)) {}
137	// Member-by-member initialization
138	really_inline simd8(
139	int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
140	int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
141	int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
142	int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31
143	) : simd8 (_mm256_setr_epi8(
144	v0, v1, v2, v3, v4, v5, v6, v7,
145	v8, v9, v10,v11,v12,v13,v14,v15,
146	v16,v17,v18,v19,v20,v21,v22,v23,
147	v24,v25,v26,v27,v28,v29,v30,v31
148	)) {}
149	// Repeat 16 values as many times as necessary (usually for lookup tables)
150	really_inline static simd8<int8_t> repeat_16(
151	int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
152	int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
153	) {
154	return simd8<int8_t>(
155	v0, v1, v2, v3, v4, v5, v6, v7,
156	v8, v9, v10,v11,v12,v13,v14,v15,
157	v0, v1, v2, v3, v4, v5, v6, v7,
158	v8, v9, v10,v11,v12,v13,v14,v15
159	);
160	}
161
162	// Order-sensitive comparisons
163	really_inline simd8<int8_t> max(const simd8<int8_t> other) const { return _mm256_max_epi8(*this, other); }
164	really_inline simd8<int8_t> min(const simd8<int8_t> other) const { return _mm256_min_epi8(*this, other); }
165	really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(*this, other); }
166	really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(other, *this); }
167	};
168
169	// Unsigned bytes
170	template<>
171	struct simd8<uint8_t>: base8_numeric<uint8_t> {
172	really_inline simd8() : base8_numeric<uint8_t>() {}
173	really_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {}
174	// Splat constructor
175	really_inline simd8(uint8_t _value) : simd8 (splat(_value)) {}
176	// Array constructor
177	really_inline simd8(const uint8_t values[`32`]) : simd8 (load(values)) {}
178	// Member-by-member initialization
179	really_inline simd8(
180	uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
181	uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
182	uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
183	uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31
184	) : simd8 (_mm256_setr_epi8(
185	v0, v1, v2, v3, v4, v5, v6, v7,
186	v8, v9, v10,v11,v12,v13,v14,v15,
187	v16,v17,v18,v19,v20,v21,v22,v23,
188	v24,v25,v26,v27,v28,v29,v30,v31
189	)) {}
190	// Repeat 16 values as many times as necessary (usually for lookup tables)
191	really_inline static simd8<uint8_t> repeat_16(
192	uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
193	uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
194	) {
195	return simd8<uint8_t>(
196	v0, v1, v2, v3, v4, v5, v6, v7,
197	v8, v9, v10,v11,v12,v13,v14,v15,
198	v0, v1, v2, v3, v4, v5, v6, v7,
199	v8, v9, v10,v11,v12,v13,v14,v15
200	);
201	}
202
203	// Saturated math
204	really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm256_adds_epu8(*this, other); }
205	really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm256_subs_epu8(*this, other); }
206
207	// Order-specific operations
208	really_inline simd8<uint8_t> max(const simd8<uint8_t> other) const { return _mm256_max_epu8(*this, other); }
209	really_inline simd8<uint8_t> min(const simd8<uint8_t> other) const { return _mm256_min_epu8(other, *this); }
210	// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
211	really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
212	// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
213	really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
214	really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max(*this) == other; }
215	really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min(*this) == other; }
216	really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
217	really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); }
218
219	// Bit-specific operations
220	really_inline simd8<bool> bits_not_set() const { return *this == uint8_t(`0`); }
221	really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
222	really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
223	really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
224	really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
225	really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
226	really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm256_testz_si256(*this, bits); }
227	really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
228	template<int N>
229	really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm256_srli_epi16(*this, N)) & uint8_t(`0xFFu` >> N); }
230	template<int N>
231	really_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm256_slli_epi16(*this, N)) & uint8_t(`0xFFu` << N); }
232	// Get one of the bits and make a bitmask out of it.
233	// e.g. value.get_bit<7>() gets the high bit
234	template<int N>
235	really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, `7`-N)); }
236	};
237
238	template<typename T>
239	struct simd8x64 {
240	static const int NUM_CHUNKS = `64` / sizeof(simd8<T>);
241	const simd8<T> chunks[NUM_CHUNKS];
242
243	really_inline simd8x64() : chunks{simd8<T>(), simd8<T>()} {}
244	really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
245	really_inline simd8x64(const T ptr[`64`]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+`32`)} {}
246
247	template <typename F>
248	static really_inline void each_index(F const& each) {
249	each(`0`);
250	each(`1`);
251	}
252
253	really_inline void store(T ptr[`64`]) const {
254	this->chunks[`0`].store(ptr+sizeof(simd8<T>)*`0`);
255	this->chunks[`1`].store(ptr+sizeof(simd8<T>)*`1`);
256	}
257
258	template <typename F>
259	really_inline void each(F const& each_chunk) const
260	{
261	each_chunk(this->chunks[`0`]);
262	each_chunk(this->chunks[`1`]);
263	}
264
265	template <typename R=bool, typename F>
266	really_inline simd8x64<R> map(F const& map_chunk) const {
267	return simd8x64<R>(
268	map_chunk(this->chunks[`0`]),
269	map_chunk(this->chunks[`1`])
270	);
271	}
272
273	template <typename R=bool, typename F>
274	really_inline simd8x64<R> map(const simd8x64<uint8_t> b, F const& map_chunk) const {
275	return simd8x64<R>(
276	map_chunk(this->chunks[`0`], b.chunks[`0`]),
277	map_chunk(this->chunks[`1`], b.chunks[`1`])
278	);
279	}
280
281	template <typename F>
282	really_inline simd8<T> reduce(F const& reduce_pair) const {
283	return reduce_pair(this->chunks[`0`], this->chunks[`1`]);
284	}
285
286	really_inline uint64_t to_bitmask() const {
287	uint64_t r_lo = static_cast<uint32_t>(this->chunks[`0`].to_bitmask());
288	uint64_t r_hi = this->chunks[`1`].to_bitmask();
289	return r_lo \| (r_hi << `32`);
290	}
291
292	really_inline simd8x64<T> bit_or(const T m) const {
293	const simd8<T> mask = simd8<T>::splat(m);
294	return this->map( [&](auto a) { return a \| mask; } );
295	}
296
297	really_inline uint64_t eq(const T m) const {
298	const simd8<T> mask = simd8<T>::splat(m);
299	return this->map( [&](auto a) { return a == mask; } ).to_bitmask();
300	}
301
302	really_inline uint64_t lteq(const T m) const {
303	const simd8<T> mask = simd8<T>::splat(m);
304	return this->map( [&](auto a) { return a <= mask; } ).to_bitmask();
305	}
306
307	}; // struct simd8x64<T>
308
309	} // namespace simdjson::haswell::simd
310	UNTARGET_REGION
311
312	#endif // IS_X86_64
313	#endif // SIMDJSON_HASWELL_SIMD_H
314

Browse the source code of ClickHouse/contrib/simdjson/src/haswell/simd.h