state_compress.c source code [ClickHouse/contrib/hyperscan/src/util/state_compress.c]

1	/*
2	* Copyright (c) 2015-2017, Intel Corporation
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions are met:
6	*
7	* * Redistributions of source code must retain the above copyright notice,
8	* this list of conditions and the following disclaimer.
9	* * Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	* * Neither the name of Intel Corporation nor the names of its contributors
13	* may be used to endorse or promote products derived from this software
14	* without specific prior written permission.
15	*
16	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20	* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26	* POSSIBILITY OF SUCH DAMAGE.
27	*/
28
29	/** \file
30	* \brief Mask-based state compression, used by the NFA.
31	*/
32	#include "config.h"
33	#include "ue2common.h"
34	#include "arch.h"
35	#include "bitutils.h"
36	#include "unaligned.h"
37	#include "pack_bits.h"
38	#include "partial_store.h"
39	#include "popcount.h"
40	#include "state_compress.h"
41
42	#include <string.h>
43
44	/*
45	* 32-bit store/load.
46	*/
47
48	void storecompressed32(void ptr, const* u32 x, const* u32 *m, u32 bytes) {
49	assert(popcount32(m) <= bytes `8`);
50
51	u32 v = compress32(x, m);
52	partial_store_u32(ptr, v, bytes);
53	}
54
55	void loadcompressed32(u32 x, const* void ptr, const* u32 *m, u32 bytes) {
56	assert(popcount32(m) <= bytes `8`);
57
58	u32 v = partial_load_u32(ptr, bytes);
59	x = expand32(v, m);
60	}
61
62	/*
63	* 64-bit store/load.
64	*/
65
66	void storecompressed64(void ptr, const* u64a x, const* u64a *m, u32 bytes) {
67	assert(popcount64(m) <= bytes `8`);
68
69	u64a v = compress64(x, m);
70	partial_store_u64a(ptr, v, bytes);
71	}
72
73	void loadcompressed64(u64a x, const* void ptr, const* u64a *m, u32 bytes) {
74	assert(popcount64(m) <= bytes `8`);
75
76	u64a v = partial_load_u64a(ptr, bytes);
77	x = expand64(v, m);
78	}
79
80	/*
81	* 128-bit store/load.
82	*/
83
#if defined(ARCH_32_BIT)
/**
 * \brief 128-bit compressed store built from four 32-bit chunks.
 *
 * Each 32-bit chunk of \a xvec is compressed under the matching chunk of
 * \a mvec, and the results are written to \a ptr with pack_bits_32().
 *
 * \param ptr  Destination handed to pack_bits_32().
 * \param xvec State vector.
 * \param mvec Mask vector.
 */
static really_inline
void storecompressed128_32bit(void *ptr, m128 xvec, m128 mvec) {
    // First, decompose our vectors into 32-bit chunks. memcpy is used so
    // that the vector type is never accessed through a u32 pointer.
    u32 x[4];
    memcpy(x, &xvec, sizeof(xvec));
    u32 m[4];
    memcpy(m, &mvec, sizeof(mvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[4] = { popcount32(m[0]), popcount32(m[1]),
                    popcount32(m[2]), popcount32(m[3]) };

    // Compress each 32-bit chunk individually.
    u32 v[4] = { compress32(x[0], m[0]), compress32(x[1], m[1]),
                 compress32(x[2], m[2]), compress32(x[3], m[3]) };

    // Write packed data out.
    pack_bits_32(ptr, v, bits, 4);
}
#endif
106
#if defined(ARCH_64_BIT)
/**
 * \brief 128-bit compressed store built from two 64-bit chunks.
 *
 * Each 64-bit chunk of \a xvec is compressed under the matching chunk of
 * \a mvec, and the results are written to \a ptr with pack_bits_64().
 *
 * \param ptr  Destination handed to pack_bits_64().
 * \param xvec State vector.
 * \param mvec Mask vector.
 */
static really_inline
void storecompressed128_64bit(void *ptr, m128 xvec, m128 mvec) {
    // First, decompose our vectors into 64-bit chunks. memcpy is used so
    // that the vector type is never accessed through a u64a pointer.
    u64a x[2];
    memcpy(x, &xvec, sizeof(xvec));
    u64a m[2];
    memcpy(m, &mvec, sizeof(mvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) };

    // Compress each 64-bit chunk individually.
    u64a v[2] = { compress64(x[0], m[0]), compress64(x[1], m[1]) };

    // Write packed data out.
    pack_bits_64(ptr, v, bits, 2);
}
#endif
127
128	void storecompressed128(void ptr, const* m128 x, const* m128 *m,
129	UNUSED u32 bytes) {
130	#if defined(ARCH_64_BIT)
131	storecompressed128_64bit(ptr, x, m);
132	#else
133	storecompressed128_32bit(ptr, x, m);
134	#endif
135	}
136
#if defined(ARCH_32_BIT)
/**
 * \brief 128-bit compressed load built from four 32-bit chunks.
 *
 * Unpacks four values from \a ptr with unpack_bits_32(), using the
 * population count of each mask chunk as the width, then expands each value
 * under its mask chunk.
 *
 * \param ptr  Source of the packed state.
 * \param mvec Mask vector.
 * \return The reassembled 128-bit vector.
 */
static really_inline
m128 loadcompressed128_32bit(const void *ptr, m128 mvec) {
    // First, decompose our vectors into 32-bit chunks. Only sizeof(m128)
    // bytes are copied and only four chunks are used, so four elements
    // are enough.
    u32 m[4];
    memcpy(m, &mvec, sizeof(mvec));

    // Number of packed bits to read back for each chunk.
    u32 bits[4] = { popcount32(m[0]), popcount32(m[1]),
                    popcount32(m[2]), popcount32(m[3]) };
    u32 v[4];

    unpack_bits_32(v, (const u8 *)ptr, bits, 4);

    // Expand each 32-bit chunk individually.
    u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
                 expand32(v[2], m[2]), expand32(v[3], m[3]) };

    // _mm_set_epi32 takes its arguments from the highest element down.
    return _mm_set_epi32(x[3], x[2], x[1], x[0]);
}
#endif
156
#if defined(ARCH_64_BIT)
/**
 * \brief 128-bit compressed load built from two 64-bit chunks.
 *
 * Unpacks two values from \a ptr with unpack_bits_64(), using the
 * population count of each mask chunk as the width, then expands each value
 * under its mask chunk.
 *
 * \param ptr  Source of the packed state.
 * \param mvec Mask vector.
 * \return The reassembled 128-bit vector.
 */
static really_inline
m128 loadcompressed128_64bit(const void *ptr, m128 mvec) {
    // First, decompose our vectors into 64-bit chunks. The second chunk is
    // read after shifting the vector right by 8 bytes.
    // NOTE(review): assumes movq() yields the low 64 bits -- confirm.
    u64a m[2] = { movq(mvec), movq(_mm_srli_si128(mvec, 8)) };

    // Number of packed bits to read back for each chunk.
    u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) };
    u64a v[2];

    unpack_bits_64(v, (const u8 *)ptr, bits, 2);

    // Expand each 64-bit chunk individually.
    u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) };

    // _mm_set_epi64x takes its arguments from the highest element down.
    return _mm_set_epi64x(x[1], x[0]);
}
#endif
173
174	void loadcompressed128(m128 x, const* void ptr, const* m128 *m,
175	UNUSED u32 bytes) {
176	#if defined(ARCH_64_BIT)
177	x = loadcompressed128_64bit(ptr, m);
178	#else
179	x = loadcompressed128_32bit(ptr, m);
180	#endif
181	}
182
183	/*
184	* 256-bit store/load.
185	*/
186
#if defined(ARCH_32_BIT)
/**
 * \brief 256-bit compressed store built from eight 32-bit chunks.
 *
 * Each 32-bit chunk of \a xvec is compressed under the matching chunk of
 * \a mvec, and the results are written to \a ptr with pack_bits_32().
 *
 * \param ptr  Destination handed to pack_bits_32().
 * \param xvec State vector.
 * \param mvec Mask vector.
 */
static really_inline
void storecompressed256_32bit(void *ptr, m256 xvec, m256 mvec) {
    // First, decompose our vectors into 32-bit chunks.
    u32 x[8];
    memcpy(x, &xvec, sizeof(xvec));
    u32 m[8];
    memcpy(m, &mvec, sizeof(mvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[8] = { popcount32(m[0]), popcount32(m[1]),
                    popcount32(m[2]), popcount32(m[3]),
                    popcount32(m[4]), popcount32(m[5]),
                    popcount32(m[6]), popcount32(m[7]) };

    // Compress each 32-bit chunk individually.
    u32 v[8] = { compress32(x[0], m[0]), compress32(x[1], m[1]),
                 compress32(x[2], m[2]), compress32(x[3], m[3]),
                 compress32(x[4], m[4]), compress32(x[5], m[5]),
                 compress32(x[6], m[6]), compress32(x[7], m[7]) };

    // Write packed data out.
    pack_bits_32(ptr, v, bits, 8);
}
#endif
213
#if defined(ARCH_64_BIT)
/**
 * \brief 256-bit compressed store built from four 64-bit chunks.
 *
 * Each 64-bit chunk of \a xvec is compressed under the matching chunk of
 * \a mvec, and the results are written to \a ptr with pack_bits_64().
 *
 * NOTE(review): the other helpers in this file are declared really_inline;
 * confirm that really_really_inline is intended for this one.
 *
 * \param ptr  Destination handed to pack_bits_64().
 * \param xvec State vector.
 * \param mvec Mask vector.
 */
static really_really_inline
void storecompressed256_64bit(void *ptr, m256 xvec, m256 mvec) {
    // First, decompose our vectors into 64-bit chunks.
    u64a x[4];
    memcpy(x, &xvec, sizeof(xvec));
    u64a m[4];
    memcpy(m, &mvec, sizeof(mvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[4] = { popcount64(m[0]), popcount64(m[1]),
                    popcount64(m[2]), popcount64(m[3]) };

    // Compress each 64-bit chunk individually.
    u64a v[4] = { compress64(x[0], m[0]), compress64(x[1], m[1]),
                  compress64(x[2], m[2]), compress64(x[3], m[3]) };

    // Write packed data out.
    pack_bits_64(ptr, v, bits, 4);
}
#endif
236
237	void storecompressed256(void ptr, const* m256 x, const* m256 *m,
238	UNUSED u32 bytes) {
239	#if defined(ARCH_64_BIT)
240	storecompressed256_64bit(ptr, x, m);
241	#else
242	storecompressed256_32bit(ptr, x, m);
243	#endif
244	}
245
#if defined(ARCH_32_BIT)
/**
 * \brief 256-bit compressed load built from eight 32-bit chunks.
 *
 * Unpacks eight values from \a ptr with unpack_bits_32(), using the
 * population count of each mask chunk as the width, then expands each value
 * under its mask chunk.
 *
 * \param ptr  Source of the packed state.
 * \param mvec Mask vector.
 * \return The reassembled 256-bit vector.
 */
static really_inline
m256 loadcompressed256_32bit(const void *ptr, m256 mvec) {
    // First, decompose our vectors into 32-bit chunks.
    u32 m[8];
    memcpy(m, &mvec, sizeof(mvec));

    // Number of packed bits to read back for each chunk.
    u32 bits[8] = { popcount32(m[0]), popcount32(m[1]),
                    popcount32(m[2]), popcount32(m[3]),
                    popcount32(m[4]), popcount32(m[5]),
                    popcount32(m[6]), popcount32(m[7]) };
    u32 v[8];

    unpack_bits_32(v, (const u8 *)ptr, bits, 8);

    // Expand each 32-bit chunk individually.
    u32 x[8] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
                 expand32(v[2], m[2]), expand32(v[3], m[3]),
                 expand32(v[4], m[4]), expand32(v[5], m[5]),
                 expand32(v[6], m[6]), expand32(v[7], m[7]) };

    // Without AVX2, m256 is assembled from two 128-bit halves (lo, hi).
#if !defined(HAVE_AVX2)
    m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]),
                  .hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) };
#else
    m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4],
                                 x[3], x[2], x[1], x[0]);
#endif
    return xvec;
}
#endif
276
#if defined(ARCH_64_BIT)
/**
 * \brief 256-bit compressed load built from four 64-bit chunks.
 *
 * Unpacks four values from \a ptr with unpack_bits_64(), using the
 * population count of each mask chunk as the width, then expands each value
 * under its mask chunk.
 *
 * \param ptr  Source of the packed state.
 * \param mvec Mask vector.
 * \return The reassembled 256-bit vector.
 */
static really_inline
m256 loadcompressed256_64bit(const void *ptr, m256 mvec) {
    // First, decompose our vectors into 64-bit chunks.
    u64a m[4];
    memcpy(m, &mvec, sizeof(mvec));

    // Number of packed bits to read back for each chunk.
    u32 bits[4] = { popcount64(m[0]), popcount64(m[1]),
                    popcount64(m[2]), popcount64(m[3]) };
    u64a v[4];

    unpack_bits_64(v, (const u8 *)ptr, bits, 4);

    // Expand each 64-bit chunk individually.
    u64a x[4] = { expand64(v[0], m[0]), expand64(v[1], m[1]),
                  expand64(v[2], m[2]), expand64(v[3], m[3]) };

    // Without AVX2, m256 is assembled from two 128-bit halves (lo, hi).
#if !defined(HAVE_AVX2)
    m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]),
                  .hi = _mm_set_epi64x(x[3], x[2]) };
#else
    m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]);
#endif
    return xvec;
}
#endif
302
303	void loadcompressed256(m256 x, const* void ptr, const* m256 *m,
304	UNUSED u32 bytes) {
305	#if defined(ARCH_64_BIT)
306	x = loadcompressed256_64bit(ptr, m);
307	#else
308	x = loadcompressed256_32bit(ptr, m);
309	#endif
310	}
311
312	/*
313	* 384-bit store/load.
314	*/
315
#if defined(ARCH_32_BIT)
/**
 * \brief 384-bit compressed store built from twelve 32-bit chunks.
 *
 * Each 32-bit chunk of \a xvec is compressed under the matching chunk of
 * \a mvec, and the results are written to \a ptr with pack_bits_32().
 *
 * \param ptr  Destination handed to pack_bits_32().
 * \param xvec State vector.
 * \param mvec Mask vector.
 */
static really_inline
void storecompressed384_32bit(void *ptr, m384 xvec, m384 mvec) {
    // First, decompose our vectors into 32-bit chunks.
    u32 x[12];
    memcpy(x, &xvec, sizeof(xvec));
    u32 m[12];
    memcpy(m, &mvec, sizeof(mvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[12] = { popcount32(m[0]), popcount32(m[1]),
                     popcount32(m[2]), popcount32(m[3]),
                     popcount32(m[4]), popcount32(m[5]),
                     popcount32(m[6]), popcount32(m[7]),
                     popcount32(m[8]), popcount32(m[9]),
                     popcount32(m[10]), popcount32(m[11]) };

    // Compress each 32-bit chunk individually.
    u32 v[12] = { compress32(x[0], m[0]), compress32(x[1], m[1]),
                  compress32(x[2], m[2]), compress32(x[3], m[3]),
                  compress32(x[4], m[4]), compress32(x[5], m[5]),
                  compress32(x[6], m[6]), compress32(x[7], m[7]),
                  compress32(x[8], m[8]), compress32(x[9], m[9]),
                  compress32(x[10], m[10]), compress32(x[11], m[11]) };

    // Write packed data out.
    pack_bits_32(ptr, v, bits, 12);
}
#endif
346
#if defined(ARCH_64_BIT)
/**
 * \brief 384-bit compressed store built from six 64-bit chunks.
 *
 * Each 64-bit chunk of \a xvec is compressed under the matching chunk of
 * \a mvec, and the results are written to \a ptr with pack_bits_64().
 *
 * \param ptr  Destination handed to pack_bits_64().
 * \param xvec State vector.
 * \param mvec Mask vector.
 */
static really_inline
void storecompressed384_64bit(void *ptr, m384 xvec, m384 mvec) {
    // First, decompose our vectors into 64-bit chunks.
    u64a x[6];
    memcpy(x, &xvec, sizeof(xvec));
    u64a m[6];
    memcpy(m, &mvec, sizeof(mvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[6] = { popcount64(m[0]), popcount64(m[1]),
                    popcount64(m[2]), popcount64(m[3]),
                    popcount64(m[4]), popcount64(m[5]) };

    // Compress each 64-bit chunk individually.
    u64a v[6] = { compress64(x[0], m[0]), compress64(x[1], m[1]),
                  compress64(x[2], m[2]), compress64(x[3], m[3]),
                  compress64(x[4], m[4]), compress64(x[5], m[5]) };

    // Write packed data out.
    pack_bits_64(ptr, v, bits, 6);
}
#endif
371
372	void storecompressed384(void ptr, const* m384 x, const* m384 *m,
373	UNUSED u32 bytes) {
374	#if defined(ARCH_64_BIT)
375	storecompressed384_64bit(ptr, x, m);
376	#else
377	storecompressed384_32bit(ptr, x, m);
378	#endif
379	}
380
#if defined(ARCH_32_BIT)
/**
 * \brief 384-bit compressed load built from twelve 32-bit chunks.
 *
 * Unpacks twelve values from \a ptr with unpack_bits_32(), using the
 * population count of each mask chunk as the width, then expands each value
 * under its mask chunk.
 *
 * \param ptr  Source of the packed state.
 * \param mvec Mask vector.
 * \return The reassembled 384-bit vector.
 */
static really_inline
m384 loadcompressed384_32bit(const void *ptr, m384 mvec) {
    // First, decompose our vectors into 32-bit chunks.
    u32 m[12];
    memcpy(m, &mvec, sizeof(mvec));

    // Number of packed bits to read back for each chunk.
    u32 bits[12] = { popcount32(m[0]), popcount32(m[1]),
                     popcount32(m[2]), popcount32(m[3]),
                     popcount32(m[4]), popcount32(m[5]),
                     popcount32(m[6]), popcount32(m[7]),
                     popcount32(m[8]), popcount32(m[9]),
                     popcount32(m[10]), popcount32(m[11]) };
    u32 v[12];

    unpack_bits_32(v, (const u8 *)ptr, bits, 12);

    // Expand each 32-bit chunk individually.
    u32 x[12] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
                  expand32(v[2], m[2]), expand32(v[3], m[3]),
                  expand32(v[4], m[4]), expand32(v[5], m[5]),
                  expand32(v[6], m[6]), expand32(v[7], m[7]),
                  expand32(v[8], m[8]), expand32(v[9], m[9]),
                  expand32(v[10], m[10]), expand32(v[11], m[11]) };

    // m384 is assembled from three 128-bit parts (lo, mid, hi).
    m384 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]),
                  .mid = _mm_set_epi32(x[7], x[6], x[5], x[4]),
                  .hi = _mm_set_epi32(x[11], x[10], x[9], x[8]) };
    return xvec;
}
#endif
411
#if defined(ARCH_64_BIT)
/**
 * \brief 384-bit compressed load built from six 64-bit chunks.
 *
 * Unpacks six values from \a ptr with unpack_bits_64(), using the
 * population count of each mask chunk as the width, then expands each value
 * under its mask chunk.
 *
 * \param ptr  Source of the packed state.
 * \param mvec Mask vector.
 * \return The reassembled 384-bit vector.
 */
static really_inline
m384 loadcompressed384_64bit(const void *ptr, m384 mvec) {
    // First, decompose our vectors into 64-bit chunks.
    u64a m[6];
    memcpy(m, &mvec, sizeof(mvec));

    // Number of packed bits to read back for each chunk.
    u32 bits[6] = { popcount64(m[0]), popcount64(m[1]),
                    popcount64(m[2]), popcount64(m[3]),
                    popcount64(m[4]), popcount64(m[5]) };
    u64a v[6];

    unpack_bits_64(v, (const u8 *)ptr, bits, 6);

    // Expand each 64-bit chunk individually.
    u64a x[6] = { expand64(v[0], m[0]), expand64(v[1], m[1]),
                  expand64(v[2], m[2]), expand64(v[3], m[3]),
                  expand64(v[4], m[4]), expand64(v[5], m[5]) };

    // m384 is assembled from three 128-bit parts (lo, mid, hi).
    m384 xvec = { .lo = _mm_set_epi64x(x[1], x[0]),
                  .mid = _mm_set_epi64x(x[3], x[2]),
                  .hi = _mm_set_epi64x(x[5], x[4]) };
    return xvec;
}
#endif
436
437	void loadcompressed384(m384 x, const* void ptr, const* m384 *m,
438	UNUSED u32 bytes) {
439	#if defined(ARCH_64_BIT)
440	x = loadcompressed384_64bit(ptr, m);
441	#else
442	x = loadcompressed384_32bit(ptr, m);
443	#endif
444	}
445
446	/*
447	* 512-bit store/load.
448	*/
449
#if defined(ARCH_32_BIT)
/**
 * \brief 512-bit compressed store built from sixteen 32-bit chunks.
 *
 * Each 32-bit chunk of \a xvec is compressed under the matching chunk of
 * \a mvec, and the results are written to \a ptr with pack_bits_32().
 *
 * \param ptr  Destination handed to pack_bits_32().
 * \param xvec State vector.
 * \param mvec Mask vector.
 */
static really_inline
void storecompressed512_32bit(void *ptr, m512 xvec, m512 mvec) {
    // First, decompose our vectors into 32-bit chunks.
    u32 x[16];
    memcpy(x, &xvec, sizeof(xvec));
    u32 m[16];
    memcpy(m, &mvec, sizeof(mvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[16] = { popcount32(m[0]), popcount32(m[1]),
                     popcount32(m[2]), popcount32(m[3]),
                     popcount32(m[4]), popcount32(m[5]),
                     popcount32(m[6]), popcount32(m[7]),
                     popcount32(m[8]), popcount32(m[9]),
                     popcount32(m[10]), popcount32(m[11]),
                     popcount32(m[12]), popcount32(m[13]),
                     popcount32(m[14]), popcount32(m[15]) };

    // Compress each 32-bit chunk individually.
    u32 v[16] = { compress32(x[0], m[0]), compress32(x[1], m[1]),
                  compress32(x[2], m[2]), compress32(x[3], m[3]),
                  compress32(x[4], m[4]), compress32(x[5], m[5]),
                  compress32(x[6], m[6]), compress32(x[7], m[7]),
                  compress32(x[8], m[8]), compress32(x[9], m[9]),
                  compress32(x[10], m[10]), compress32(x[11], m[11]),
                  compress32(x[12], m[12]), compress32(x[13], m[13]),
                  compress32(x[14], m[14]), compress32(x[15], m[15]) };

    // Write packed data out.
    pack_bits_32(ptr, v, bits, 16);
}
#endif
484
#if defined(ARCH_64_BIT)
/**
 * \brief 512-bit compressed store built from eight 64-bit chunks.
 *
 * Each 64-bit chunk of \a xvec is compressed under the matching chunk of
 * \a mvec, and the results are written to \a ptr with pack_bits_64().
 *
 * \param ptr  Destination handed to pack_bits_64().
 * \param xvec State vector.
 * \param mvec Mask vector.
 */
static really_inline
void storecompressed512_64bit(void *ptr, m512 xvec, m512 mvec) {
    // First, decompose our vectors into 64-bit chunks.
    u64a m[8];
    memcpy(m, &mvec, sizeof(mvec));
    u64a x[8];
    memcpy(x, &xvec, sizeof(xvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[8] = { popcount64(m[0]), popcount64(m[1]),
                    popcount64(m[2]), popcount64(m[3]),
                    popcount64(m[4]), popcount64(m[5]),
                    popcount64(m[6]), popcount64(m[7]) };

    // Compress each 64-bit chunk individually.
    u64a v[8] = { compress64(x[0], m[0]), compress64(x[1], m[1]),
                  compress64(x[2], m[2]), compress64(x[3], m[3]),
                  compress64(x[4], m[4]), compress64(x[5], m[5]),
                  compress64(x[6], m[6]), compress64(x[7], m[7]) };

    // Write packed data out.
    pack_bits_64(ptr, v, bits, 8);
}
#endif
511
512	void storecompressed512(void ptr, const* m512 x, const* m512 *m,
513	UNUSED u32 bytes) {
514	#if defined(ARCH_64_BIT)
515	storecompressed512_64bit(ptr, x, m);
516	#else
517	storecompressed512_32bit(ptr, x, m);
518	#endif
519	}
520
#if defined(ARCH_32_BIT)
/**
 * \brief 512-bit compressed load built from sixteen 32-bit chunks.
 *
 * Unpacks sixteen values from \a ptr with unpack_bits_32(), using the
 * population count of each mask chunk as the width, then expands each value
 * under its mask chunk.
 *
 * \param ptr  Source of the packed state.
 * \param mvec Mask vector.
 * \return The reassembled 512-bit vector.
 */
static really_inline
m512 loadcompressed512_32bit(const void *ptr, m512 mvec) {
    // First, decompose our vectors into 32-bit chunks.
    u32 m[16];
    memcpy(m, &mvec, sizeof(mvec));

    // Number of packed bits to read back for each chunk.
    u32 bits[16] = { popcount32(m[0]), popcount32(m[1]),
                     popcount32(m[2]), popcount32(m[3]),
                     popcount32(m[4]), popcount32(m[5]),
                     popcount32(m[6]), popcount32(m[7]),
                     popcount32(m[8]), popcount32(m[9]),
                     popcount32(m[10]), popcount32(m[11]),
                     popcount32(m[12]), popcount32(m[13]),
                     popcount32(m[14]), popcount32(m[15]) };
    u32 v[16];

    unpack_bits_32(v, (const u8 *)ptr, bits, 16);

    // Expand each 32-bit chunk individually.
    u32 x[16] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
                  expand32(v[2], m[2]), expand32(v[3], m[3]),
                  expand32(v[4], m[4]), expand32(v[5], m[5]),
                  expand32(v[6], m[6]), expand32(v[7], m[7]),
                  expand32(v[8], m[8]), expand32(v[9], m[9]),
                  expand32(v[10], m[10]), expand32(v[11], m[11]),
                  expand32(v[12], m[12]), expand32(v[13], m[13]),
                  expand32(v[14], m[14]), expand32(v[15], m[15]) };

    // How m512 is assembled depends on the widest vector support built in:
    // one 512-bit value, two 256-bit halves, or four 128-bit quarters.
    m512 xvec;
#if defined(HAVE_AVX512)
    xvec = _mm512_set_epi32(x[15], x[14], x[13], x[12],
                            x[11], x[10], x[9], x[8],
                            x[7], x[6], x[5], x[4],
                            x[3], x[2], x[1], x[0]);
#elif defined(HAVE_AVX2)
    xvec.lo = _mm256_set_epi32(x[7], x[6], x[5], x[4],
                               x[3], x[2], x[1], x[0]);
    xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12],
                               x[11], x[10], x[9], x[8]);
#else
    xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]);
    xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]);
    xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]);
    xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]);
#endif
    return xvec;
}
#endif
569
#if defined(ARCH_64_BIT)
/**
 * \brief 512-bit compressed load built from eight 64-bit chunks.
 *
 * Unpacks eight values from \a ptr with unpack_bits_64(), using the
 * population count of each mask chunk as the width, then expands each value
 * under its mask chunk.
 *
 * \param ptr  Source of the packed state.
 * \param mvec Mask vector.
 * \return The reassembled 512-bit vector.
 */
static really_inline
m512 loadcompressed512_64bit(const void *ptr, m512 mvec) {
    // First, decompose our vectors into 64-bit chunks.
    u64a m[8];
    memcpy(m, &mvec, sizeof(mvec));

    // Number of packed bits to read back for each chunk.
    u32 bits[8] = { popcount64(m[0]), popcount64(m[1]),
                    popcount64(m[2]), popcount64(m[3]),
                    popcount64(m[4]), popcount64(m[5]),
                    popcount64(m[6]), popcount64(m[7]) };
    u64a v[8];

    unpack_bits_64(v, (const u8 *)ptr, bits, 8);

    // Expand each 64-bit chunk individually.
    u64a x[8] = { expand64(v[0], m[0]), expand64(v[1], m[1]),
                  expand64(v[2], m[2]), expand64(v[3], m[3]),
                  expand64(v[4], m[4]), expand64(v[5], m[5]),
                  expand64(v[6], m[6]), expand64(v[7], m[7]) };

    // How m512 is assembled depends on the widest vector support built in:
    // one 512-bit value, two 256-bit halves, or four 128-bit quarters.
#if defined(HAVE_AVX512)
    m512 xvec = _mm512_set_epi64(x[7], x[6], x[5], x[4],
                                 x[3], x[2], x[1], x[0]);
#elif defined(HAVE_AVX2)
    m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]),
                  .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4]) };
#else
    m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]),
                          _mm_set_epi64x(x[3], x[2]) },
                  .hi = { _mm_set_epi64x(x[5], x[4]),
                          _mm_set_epi64x(x[7], x[6]) } };
#endif
    return xvec;
}
#endif
605
606	void loadcompressed512(m512 x, const* void ptr, const* m512 *m,
607	UNUSED u32 bytes) {
608	#if defined(ARCH_64_BIT)
609	x = loadcompressed512_64bit(ptr, m);
610	#else
611	x = loadcompressed512_32bit(ptr, m);
612	#endif
613	}
614

Browse the source code of ClickHouse/contrib/hyperscan/src/util/state_compress.c