1/*
2 * Copyright (c) 2015-2017, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Mask-based state compression, used by the NFA.
31 */
32#include "config.h"
33#include "ue2common.h"
34#include "arch.h"
35#include "bitutils.h"
36#include "unaligned.h"
37#include "pack_bits.h"
38#include "partial_store.h"
39#include "popcount.h"
40#include "state_compress.h"
41
42#include <string.h>
43
44/*
45 * 32-bit store/load.
46 */
47
48void storecompressed32(void *ptr, const u32 *x, const u32 *m, u32 bytes) {
49 assert(popcount32(*m) <= bytes * 8);
50
51 u32 v = compress32(*x, *m);
52 partial_store_u32(ptr, v, bytes);
53}
54
55void loadcompressed32(u32 *x, const void *ptr, const u32 *m, u32 bytes) {
56 assert(popcount32(*m) <= bytes * 8);
57
58 u32 v = partial_load_u32(ptr, bytes);
59 *x = expand32(v, *m);
60}
61
62/*
63 * 64-bit store/load.
64 */
65
66void storecompressed64(void *ptr, const u64a *x, const u64a *m, u32 bytes) {
67 assert(popcount64(*m) <= bytes * 8);
68
69 u64a v = compress64(*x, *m);
70 partial_store_u64a(ptr, v, bytes);
71}
72
73void loadcompressed64(u64a *x, const void *ptr, const u64a *m, u32 bytes) {
74 assert(popcount64(*m) <= bytes * 8);
75
76 u64a v = partial_load_u64a(ptr, bytes);
77 *x = expand64(v, *m);
78}
79
80/*
81 * 128-bit store/load.
82 */
83
84#if defined(ARCH_32_BIT)
85static really_inline
86void storecompressed128_32bit(void *ptr, m128 xvec, m128 mvec) {
87 // First, decompose our vectors into 32-bit chunks.
88 u32 x[4];
89 memcpy(x, &xvec, sizeof(xvec));
90 u32 m[4];
91 memcpy(m, &mvec, sizeof(mvec));
92
93 // Count the number of bits of compressed state we're writing out per
94 // chunk.
95 u32 bits[4] = { popcount32(m[0]), popcount32(m[1]),
96 popcount32(m[2]), popcount32(m[3]) };
97
98 // Compress each 32-bit chunk individually.
99 u32 v[4] = { compress32(x[0], m[0]), compress32(x[1], m[1]),
100 compress32(x[2], m[2]), compress32(x[3], m[3]) };
101
102 // Write packed data out.
103 pack_bits_32(ptr, v, bits, 4);
104}
105#endif
106
107#if defined(ARCH_64_BIT)
108static really_inline
109void storecompressed128_64bit(void *ptr, m128 xvec, m128 mvec) {
110 // First, decompose our vectors into 64-bit chunks.
111 u64a x[2];
112 memcpy(x, &xvec, sizeof(xvec));
113 u64a m[2];
114 memcpy(m, &mvec, sizeof(mvec));
115
116 // Count the number of bits of compressed state we're writing out per
117 // chunk.
118 u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) };
119
120 // Compress each 64-bit chunk individually.
121 u64a v[2] = { compress64(x[0], m[0]), compress64(x[1], m[1]) };
122
123 // Write packed data out.
124 pack_bits_64(ptr, v, bits, 2);
125}
126#endif
127
/** \brief Compress a 128-bit state vector \p x under mask \p m and store it
 * at \p ptr.
 *
 * Dispatches to the 64-bit-chunk or 32-bit-chunk implementation depending on
 * the target architecture. \p bytes is unused: the amount written is
 * determined by the popcount of the mask chunks.
 */
void storecompressed128(void *ptr, const m128 *x, const m128 *m,
                        UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    storecompressed128_64bit(ptr, *x, *m);
#else
    storecompressed128_32bit(ptr, *x, *m);
#endif
}
136
137#if defined(ARCH_32_BIT)
138static really_inline
139m128 loadcompressed128_32bit(const void *ptr, m128 mvec) {
140 // First, decompose our vectors into 32-bit chunks.
141 u32 m[8];
142 memcpy(m, &mvec, sizeof(mvec));
143
144 u32 bits[4] = { popcount32(m[0]), popcount32(m[1]),
145 popcount32(m[2]), popcount32(m[3]) };
146 u32 v[4];
147
148 unpack_bits_32(v, (const u8 *)ptr, bits, 4);
149
150 u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
151 expand32(v[2], m[2]), expand32(v[3], m[3]) };
152
153 return _mm_set_epi32(x[3], x[2], x[1], x[0]);
154}
155#endif
156
157#if defined(ARCH_64_BIT)
158static really_inline
159m128 loadcompressed128_64bit(const void *ptr, m128 mvec) {
160 // First, decompose our vectors into 64-bit chunks.
161 u64a m[2] = { movq(mvec), movq(_mm_srli_si128(mvec, 8)) };
162
163 u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) };
164 u64a v[2];
165
166 unpack_bits_64(v, (const u8 *)ptr, bits, 2);
167
168 u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) };
169
170 return _mm_set_epi64x(x[1], x[0]);
171}
172#endif
173
174void loadcompressed128(m128 *x, const void *ptr, const m128 *m,
175 UNUSED u32 bytes) {
176#if defined(ARCH_64_BIT)
177 *x = loadcompressed128_64bit(ptr, *m);
178#else
179 *x = loadcompressed128_32bit(ptr, *m);
180#endif
181}
182
183/*
184 * 256-bit store/load.
185 */
186
187#if defined(ARCH_32_BIT)
188static really_inline
189void storecompressed256_32bit(void *ptr, m256 xvec, m256 mvec) {
190 // First, decompose our vectors into 32-bit chunks.
191 u32 x[8];
192 memcpy(x, &xvec, sizeof(xvec));
193 u32 m[8];
194 memcpy(m, &mvec, sizeof(mvec));
195
196 // Count the number of bits of compressed state we're writing out per
197 // chunk.
198 u32 bits[8] = { popcount32(m[0]), popcount32(m[1]),
199 popcount32(m[2]), popcount32(m[3]),
200 popcount32(m[4]), popcount32(m[5]),
201 popcount32(m[6]), popcount32(m[7])};
202
203 // Compress each 32-bit chunk individually.
204 u32 v[8] = { compress32(x[0], m[0]), compress32(x[1], m[1]),
205 compress32(x[2], m[2]), compress32(x[3], m[3]),
206 compress32(x[4], m[4]), compress32(x[5], m[5]),
207 compress32(x[6], m[6]), compress32(x[7], m[7]) };
208
209 // Write packed data out.
210 pack_bits_32(ptr, v, bits, 8);
211}
212#endif
213
214#if defined(ARCH_64_BIT)
215static really_really_inline
216void storecompressed256_64bit(void *ptr, m256 xvec, m256 mvec) {
217 // First, decompose our vectors into 64-bit chunks.
218 u64a x[4];
219 memcpy(x, &xvec, sizeof(xvec));
220 u64a m[4];
221 memcpy(m, &mvec, sizeof(mvec));
222
223 // Count the number of bits of compressed state we're writing out per
224 // chunk.
225 u32 bits[4] = { popcount64(m[0]), popcount64(m[1]),
226 popcount64(m[2]), popcount64(m[3]) };
227
228 // Compress each 64-bit chunk individually.
229 u64a v[4] = { compress64(x[0], m[0]), compress64(x[1], m[1]),
230 compress64(x[2], m[2]), compress64(x[3], m[3]) };
231
232 // Write packed data out.
233 pack_bits_64(ptr, v, bits, 4);
234}
235#endif
236
/** \brief Compress a 256-bit state vector \p x under mask \p m and store it
 * at \p ptr.
 *
 * Dispatches to the 64-bit-chunk or 32-bit-chunk implementation depending on
 * the target architecture. \p bytes is unused: the amount written is
 * determined by the popcount of the mask chunks.
 */
void storecompressed256(void *ptr, const m256 *x, const m256 *m,
                        UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    storecompressed256_64bit(ptr, *x, *m);
#else
    storecompressed256_32bit(ptr, *x, *m);
#endif
}
245
246#if defined(ARCH_32_BIT)
247static really_inline
248m256 loadcompressed256_32bit(const void *ptr, m256 mvec) {
249 // First, decompose our vectors into 32-bit chunks.
250 u32 m[8];
251 memcpy(m, &mvec, sizeof(mvec));
252
253 u32 bits[8] = { popcount32(m[0]), popcount32(m[1]),
254 popcount32(m[2]), popcount32(m[3]),
255 popcount32(m[4]), popcount32(m[5]),
256 popcount32(m[6]), popcount32(m[7])};
257 u32 v[8];
258
259 unpack_bits_32(v, (const u8 *)ptr, bits, 8);
260
261 u32 x[8] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
262 expand32(v[2], m[2]), expand32(v[3], m[3]),
263 expand32(v[4], m[4]), expand32(v[5], m[5]),
264 expand32(v[6], m[6]), expand32(v[7], m[7]) };
265
266#if !defined(HAVE_AVX2)
267 m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]),
268 .hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) };
269#else
270 m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4],
271 x[3], x[2], x[1], x[0]);
272#endif
273 return xvec;
274}
275#endif
276
277#if defined(ARCH_64_BIT)
278static really_inline
279m256 loadcompressed256_64bit(const void *ptr, m256 mvec) {
280 // First, decompose our vectors into 64-bit chunks.
281 u64a m[4];
282 memcpy(m, &mvec, sizeof(mvec));
283
284 u32 bits[4] = { popcount64(m[0]), popcount64(m[1]),
285 popcount64(m[2]), popcount64(m[3]) };
286 u64a v[4];
287
288 unpack_bits_64(v, (const u8 *)ptr, bits, 4);
289
290 u64a x[4] = { expand64(v[0], m[0]), expand64(v[1], m[1]),
291 expand64(v[2], m[2]), expand64(v[3], m[3]) };
292
293#if !defined(HAVE_AVX2)
294 m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]),
295 .hi = _mm_set_epi64x(x[3], x[2]) };
296#else
297 m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]);
298#endif
299 return xvec;
300}
301#endif
302
/** \brief Load a compressed 256-bit state from \p ptr and expand it under
 * mask \p m into \p x.
 *
 * Dispatches to the 64-bit-chunk or 32-bit-chunk implementation depending on
 * the target architecture. \p bytes is unused: the amount read is determined
 * by the popcount of the mask chunks.
 */
void loadcompressed256(m256 *x, const void *ptr, const m256 *m,
                       UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    *x = loadcompressed256_64bit(ptr, *m);
#else
    *x = loadcompressed256_32bit(ptr, *m);
#endif
}
311
312/*
313 * 384-bit store/load.
314 */
315
316#if defined(ARCH_32_BIT)
317static really_inline
318void storecompressed384_32bit(void *ptr, m384 xvec, m384 mvec) {
319 // First, decompose our vectors into 32-bit chunks.
320 u32 x[12];
321 memcpy(x, &xvec, sizeof(xvec));
322 u32 m[12];
323 memcpy(m, &mvec, sizeof(mvec));
324
325 // Count the number of bits of compressed state we're writing out per
326 // chunk.
327 u32 bits[12] = { popcount32(m[0]), popcount32(m[1]),
328 popcount32(m[2]), popcount32(m[3]),
329 popcount32(m[4]), popcount32(m[5]),
330 popcount32(m[6]), popcount32(m[7]),
331 popcount32(m[8]), popcount32(m[9]),
332 popcount32(m[10]), popcount32(m[11]) };
333
334 // Compress each 32-bit chunk individually.
335 u32 v[12] = { compress32(x[0], m[0]), compress32(x[1], m[1]),
336 compress32(x[2], m[2]), compress32(x[3], m[3]),
337 compress32(x[4], m[4]), compress32(x[5], m[5]),
338 compress32(x[6], m[6]), compress32(x[7], m[7]),
339 compress32(x[8], m[8]), compress32(x[9], m[9]),
340 compress32(x[10], m[10]), compress32(x[11], m[11])};
341
342 // Write packed data out.
343 pack_bits_32(ptr, v, bits, 12);
344}
345#endif
346
347#if defined(ARCH_64_BIT)
348static really_inline
349void storecompressed384_64bit(void *ptr, m384 xvec, m384 mvec) {
350 // First, decompose our vectors into 64-bit chunks.
351 u64a x[6];
352 memcpy(x, &xvec, sizeof(xvec));
353 u64a m[6];
354 memcpy(m, &mvec, sizeof(mvec));
355
356 // Count the number of bits of compressed state we're writing out per
357 // chunk.
358 u32 bits[6] = { popcount64(m[0]), popcount64(m[1]),
359 popcount64(m[2]), popcount64(m[3]),
360 popcount64(m[4]), popcount64(m[5]) };
361
362 // Compress each 64-bit chunk individually.
363 u64a v[6] = { compress64(x[0], m[0]), compress64(x[1], m[1]),
364 compress64(x[2], m[2]), compress64(x[3], m[3]),
365 compress64(x[4], m[4]), compress64(x[5], m[5]) };
366
367 // Write packed data out.
368 pack_bits_64(ptr, v, bits, 6);
369}
370#endif
371
/** \brief Compress a 384-bit state vector \p x under mask \p m and store it
 * at \p ptr.
 *
 * Dispatches to the 64-bit-chunk or 32-bit-chunk implementation depending on
 * the target architecture. \p bytes is unused: the amount written is
 * determined by the popcount of the mask chunks.
 */
void storecompressed384(void *ptr, const m384 *x, const m384 *m,
                        UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    storecompressed384_64bit(ptr, *x, *m);
#else
    storecompressed384_32bit(ptr, *x, *m);
#endif
}
380
381#if defined(ARCH_32_BIT)
382static really_inline
383m384 loadcompressed384_32bit(const void *ptr, m384 mvec) {
384 // First, decompose our vectors into 32-bit chunks.
385 u32 m[12];
386 memcpy(m, &mvec, sizeof(mvec));
387
388 u32 bits[12] = { popcount32(m[0]), popcount32(m[1]),
389 popcount32(m[2]), popcount32(m[3]),
390 popcount32(m[4]), popcount32(m[5]),
391 popcount32(m[6]), popcount32(m[7]),
392 popcount32(m[8]), popcount32(m[9]),
393 popcount32(m[10]), popcount32(m[11]) };
394 u32 v[12];
395
396 unpack_bits_32(v, (const u8 *)ptr, bits, 12);
397
398 u32 x[12] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
399 expand32(v[2], m[2]), expand32(v[3], m[3]),
400 expand32(v[4], m[4]), expand32(v[5], m[5]),
401 expand32(v[6], m[6]), expand32(v[7], m[7]),
402 expand32(v[8], m[8]), expand32(v[9], m[9]),
403 expand32(v[10], m[10]), expand32(v[11], m[11]) };
404
405 m384 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]),
406 .mid = _mm_set_epi32(x[7], x[6], x[5], x[4]),
407 .hi = _mm_set_epi32(x[11], x[10], x[9], x[8]) };
408 return xvec;
409}
410#endif
411
412#if defined(ARCH_64_BIT)
413static really_inline
414m384 loadcompressed384_64bit(const void *ptr, m384 mvec) {
415 // First, decompose our vectors into 64-bit chunks.
416 u64a m[6];
417 memcpy(m, &mvec, sizeof(mvec));
418
419 u32 bits[6] = { popcount64(m[0]), popcount64(m[1]),
420 popcount64(m[2]), popcount64(m[3]),
421 popcount64(m[4]), popcount64(m[5]) };
422 u64a v[6];
423
424 unpack_bits_64(v, (const u8 *)ptr, bits, 6);
425
426 u64a x[6] = { expand64(v[0], m[0]), expand64(v[1], m[1]),
427 expand64(v[2], m[2]), expand64(v[3], m[3]),
428 expand64(v[4], m[4]), expand64(v[5], m[5]) };
429
430 m384 xvec = { .lo = _mm_set_epi64x(x[1], x[0]),
431 .mid = _mm_set_epi64x(x[3], x[2]),
432 .hi = _mm_set_epi64x(x[5], x[4]) };
433 return xvec;
434}
435#endif
436
/** \brief Load a compressed 384-bit state from \p ptr and expand it under
 * mask \p m into \p x.
 *
 * Dispatches to the 64-bit-chunk or 32-bit-chunk implementation depending on
 * the target architecture. \p bytes is unused: the amount read is determined
 * by the popcount of the mask chunks.
 */
void loadcompressed384(m384 *x, const void *ptr, const m384 *m,
                       UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    *x = loadcompressed384_64bit(ptr, *m);
#else
    *x = loadcompressed384_32bit(ptr, *m);
#endif
}
445
446/*
447 * 512-bit store/load.
448 */
449
450#if defined(ARCH_32_BIT)
451static really_inline
452void storecompressed512_32bit(void *ptr, m512 xvec, m512 mvec) {
453 // First, decompose our vectors into 32-bit chunks.
454 u32 x[16];
455 memcpy(x, &xvec, sizeof(xvec));
456 u32 m[16];
457 memcpy(m, &mvec, sizeof(mvec));
458
459 // Count the number of bits of compressed state we're writing out per
460 // chunk.
461 u32 bits[16] = { popcount32(m[0]), popcount32(m[1]),
462 popcount32(m[2]), popcount32(m[3]),
463 popcount32(m[4]), popcount32(m[5]),
464 popcount32(m[6]), popcount32(m[7]),
465 popcount32(m[8]), popcount32(m[9]),
466 popcount32(m[10]), popcount32(m[11]),
467 popcount32(m[12]), popcount32(m[13]),
468 popcount32(m[14]), popcount32(m[15])};
469
470 // Compress each 32-bit chunk individually.
471 u32 v[16] = { compress32(x[0], m[0]), compress32(x[1], m[1]),
472 compress32(x[2], m[2]), compress32(x[3], m[3]),
473 compress32(x[4], m[4]), compress32(x[5], m[5]),
474 compress32(x[6], m[6]), compress32(x[7], m[7]),
475 compress32(x[8], m[8]), compress32(x[9], m[9]),
476 compress32(x[10], m[10]), compress32(x[11], m[11]),
477 compress32(x[12], m[12]), compress32(x[13], m[13]),
478 compress32(x[14], m[14]), compress32(x[15], m[15]) };
479
480 // Write packed data out.
481 pack_bits_32(ptr, v, bits, 16);
482}
483#endif
484
485#if defined(ARCH_64_BIT)
486static really_inline
487void storecompressed512_64bit(void *ptr, m512 xvec, m512 mvec) {
488 // First, decompose our vectors into 64-bit chunks.
489 u64a m[8];
490 memcpy(m, &mvec, sizeof(mvec));
491 u64a x[8];
492 memcpy(x, &xvec, sizeof(xvec));
493
494 // Count the number of bits of compressed state we're writing out per
495 // chunk.
496 u32 bits[8] = { popcount64(m[0]), popcount64(m[1]),
497 popcount64(m[2]), popcount64(m[3]),
498 popcount64(m[4]), popcount64(m[5]),
499 popcount64(m[6]), popcount64(m[7]) };
500
501 // Compress each 64-bit chunk individually.
502 u64a v[8] = { compress64(x[0], m[0]), compress64(x[1], m[1]),
503 compress64(x[2], m[2]), compress64(x[3], m[3]),
504 compress64(x[4], m[4]), compress64(x[5], m[5]),
505 compress64(x[6], m[6]), compress64(x[7], m[7]) };
506
507 // Write packed data out.
508 pack_bits_64(ptr, v, bits, 8);
509}
510#endif
511
/** \brief Compress a 512-bit state vector \p x under mask \p m and store it
 * at \p ptr.
 *
 * Dispatches to the 64-bit-chunk or 32-bit-chunk implementation depending on
 * the target architecture. \p bytes is unused: the amount written is
 * determined by the popcount of the mask chunks.
 */
void storecompressed512(void *ptr, const m512 *x, const m512 *m,
                        UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    storecompressed512_64bit(ptr, *x, *m);
#else
    storecompressed512_32bit(ptr, *x, *m);
#endif
}
520
521#if defined(ARCH_32_BIT)
522static really_inline
523m512 loadcompressed512_32bit(const void *ptr, m512 mvec) {
524 // First, decompose our vectors into 32-bit chunks.
525 u32 m[16];
526 memcpy(m, &mvec, sizeof(mvec));
527
528 u32 bits[16] = { popcount32(m[0]), popcount32(m[1]),
529 popcount32(m[2]), popcount32(m[3]),
530 popcount32(m[4]), popcount32(m[5]),
531 popcount32(m[6]), popcount32(m[7]),
532 popcount32(m[8]), popcount32(m[9]),
533 popcount32(m[10]), popcount32(m[11]),
534 popcount32(m[12]), popcount32(m[13]),
535 popcount32(m[14]), popcount32(m[15]) };
536 u32 v[16];
537
538 unpack_bits_32(v, (const u8 *)ptr, bits, 16);
539
540 u32 x[16] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
541 expand32(v[2], m[2]), expand32(v[3], m[3]),
542 expand32(v[4], m[4]), expand32(v[5], m[5]),
543 expand32(v[6], m[6]), expand32(v[7], m[7]),
544 expand32(v[8], m[8]), expand32(v[9], m[9]),
545 expand32(v[10], m[10]), expand32(v[11], m[11]),
546 expand32(v[12], m[12]), expand32(v[13], m[13]),
547 expand32(v[14], m[14]), expand32(v[15], m[15]) };
548
549 m512 xvec;
550#if defined(HAVE_AVX512)
551 xvec = _mm512_set_epi32(x[15], x[14], x[13], x[12],
552 x[11], x[10], x[9], x[8],
553 x[7], x[6], x[5], x[4],
554 x[3], x[2], x[1], x[0]);
555#elif defined(HAVE_AVX2)
556 xvec.lo = _mm256_set_epi32(x[7], x[6], x[5], x[4],
557 x[3], x[2], x[1], x[0]);
558 xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12],
559 x[11], x[10], x[9], x[8]);
560#else
561 xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]);
562 xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]);
563 xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]);
564 xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]);
565#endif
566 return xvec;
567}
568#endif
569
570#if defined(ARCH_64_BIT)
571static really_inline
572m512 loadcompressed512_64bit(const void *ptr, m512 mvec) {
573 // First, decompose our vectors into 64-bit chunks.
574 u64a m[8];
575 memcpy(m, &mvec, sizeof(mvec));
576
577 u32 bits[8] = { popcount64(m[0]), popcount64(m[1]),
578 popcount64(m[2]), popcount64(m[3]),
579 popcount64(m[4]), popcount64(m[5]),
580 popcount64(m[6]), popcount64(m[7]) };
581 u64a v[8];
582
583 unpack_bits_64(v, (const u8 *)ptr, bits, 8);
584
585 u64a x[8] = { expand64(v[0], m[0]), expand64(v[1], m[1]),
586 expand64(v[2], m[2]), expand64(v[3], m[3]),
587 expand64(v[4], m[4]), expand64(v[5], m[5]),
588 expand64(v[6], m[6]), expand64(v[7], m[7]) };
589
590#if defined(HAVE_AVX512)
591 m512 xvec = _mm512_set_epi64(x[7], x[6], x[5], x[4],
592 x[3], x[2], x[1], x[0]);
593#elif defined(HAVE_AVX2)
594 m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]),
595 .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])};
596#else
597 m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]),
598 _mm_set_epi64x(x[3], x[2]) },
599 .hi = { _mm_set_epi64x(x[5], x[4]),
600 _mm_set_epi64x(x[7], x[6]) } };
601#endif
602 return xvec;
603}
604#endif
605
/** \brief Load a compressed 512-bit state from \p ptr and expand it under
 * mask \p m into \p x.
 *
 * Dispatches to the 64-bit-chunk or 32-bit-chunk implementation depending on
 * the target architecture. \p bytes is unused: the amount read is determined
 * by the popcount of the mask chunks.
 */
void loadcompressed512(m512 *x, const void *ptr, const m512 *m,
                       UNUSED u32 bytes) {
#if defined(ARCH_64_BIT)
    *x = loadcompressed512_64bit(ptr, *m);
#else
    *x = loadcompressed512_32bit(ptr, *m);
#endif
}
614