1 | /* |
2 | * Copyright (c) 2015-2017, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /** \file |
30 | * \brief Mask-based state compression, used by the NFA. |
31 | */ |
32 | #include "config.h" |
33 | #include "ue2common.h" |
34 | #include "arch.h" |
35 | #include "bitutils.h" |
36 | #include "unaligned.h" |
37 | #include "pack_bits.h" |
38 | #include "partial_store.h" |
39 | #include "popcount.h" |
40 | #include "state_compress.h" |
41 | |
42 | #include <string.h> |
43 | |
44 | /* |
45 | * 32-bit store/load. |
46 | */ |
47 | |
48 | void storecompressed32(void *ptr, const u32 *x, const u32 *m, u32 bytes) { |
49 | assert(popcount32(*m) <= bytes * 8); |
50 | |
51 | u32 v = compress32(*x, *m); |
52 | partial_store_u32(ptr, v, bytes); |
53 | } |
54 | |
55 | void loadcompressed32(u32 *x, const void *ptr, const u32 *m, u32 bytes) { |
56 | assert(popcount32(*m) <= bytes * 8); |
57 | |
58 | u32 v = partial_load_u32(ptr, bytes); |
59 | *x = expand32(v, *m); |
60 | } |
61 | |
62 | /* |
63 | * 64-bit store/load. |
64 | */ |
65 | |
66 | void storecompressed64(void *ptr, const u64a *x, const u64a *m, u32 bytes) { |
67 | assert(popcount64(*m) <= bytes * 8); |
68 | |
69 | u64a v = compress64(*x, *m); |
70 | partial_store_u64a(ptr, v, bytes); |
71 | } |
72 | |
73 | void loadcompressed64(u64a *x, const void *ptr, const u64a *m, u32 bytes) { |
74 | assert(popcount64(*m) <= bytes * 8); |
75 | |
76 | u64a v = partial_load_u64a(ptr, bytes); |
77 | *x = expand64(v, *m); |
78 | } |
79 | |
80 | /* |
81 | * 128-bit store/load. |
82 | */ |
83 | |
#if defined(ARCH_32_BIT)
/** \brief Compress a 128-bit state vector as four 32-bit chunks and write
 *  the packed result to \a ptr (32-bit hosts). */
static really_inline
void storecompressed128_32bit(void *ptr, m128 xvec, m128 mvec) {
    // Decompose the state and mask vectors into 32-bit chunks.
    u32 x[4];
    u32 m[4];
    memcpy(x, &xvec, sizeof(xvec));
    memcpy(m, &mvec, sizeof(mvec));

    // Compress each chunk against its mask, noting how many bits of
    // compressed state each chunk contributes.
    u32 bits[4];
    u32 v[4];
    for (u32 i = 0; i < 4; i++) {
        bits[i] = popcount32(m[i]);
        v[i] = compress32(x[i], m[i]);
    }

    // Pack the compressed chunks out to memory.
    pack_bits_32(ptr, v, bits, 4);
}
#endif
106 | |
#if defined(ARCH_64_BIT)
/** \brief Compress a 128-bit state vector as two 64-bit chunks and write
 *  the packed result to \a ptr (64-bit hosts). */
static really_inline
void storecompressed128_64bit(void *ptr, m128 xvec, m128 mvec) {
    // Decompose the state and mask vectors into 64-bit chunks.
    u64a x[2];
    u64a m[2];
    memcpy(x, &xvec, sizeof(xvec));
    memcpy(m, &mvec, sizeof(mvec));

    // Compress each chunk against its mask, noting how many bits of
    // compressed state each chunk contributes.
    u32 bits[2];
    u64a v[2];
    for (u32 i = 0; i < 2; i++) {
        bits[i] = popcount64(m[i]);
        v[i] = compress64(x[i], m[i]);
    }

    // Pack the compressed chunks out to memory.
    pack_bits_64(ptr, v, bits, 2);
}
#endif
127 | |
128 | void storecompressed128(void *ptr, const m128 *x, const m128 *m, |
129 | UNUSED u32 bytes) { |
130 | #if defined(ARCH_64_BIT) |
131 | storecompressed128_64bit(ptr, *x, *m); |
132 | #else |
133 | storecompressed128_32bit(ptr, *x, *m); |
134 | #endif |
135 | } |
136 | |
#if defined(ARCH_32_BIT)
/** \brief Load packed 128-bit state from \a ptr and expand it under mask
 *  \a mvec (32-bit hosts).
 *
 *  Fix: the mask scratch array was declared as m[8], but sizeof(mvec) is
 *  16 bytes, so memcpy only ever initialised m[0..3]. Size it to match the
 *  vector (and the sibling storecompressed128_32bit).
 */
static really_inline
m128 loadcompressed128_32bit(const void *ptr, m128 mvec) {
    // First, decompose our mask vector into 32-bit chunks.
    u32 m[4];
    memcpy(m, &mvec, sizeof(mvec));

    // Number of packed bits contributed by each chunk.
    u32 bits[4] = { popcount32(m[0]), popcount32(m[1]),
                    popcount32(m[2]), popcount32(m[3]) };
    u32 v[4];

    unpack_bits_32(v, (const u8 *)ptr, bits, 4);

    // Expand each chunk back out under its mask.
    u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
                 expand32(v[2], m[2]), expand32(v[3], m[3]) };

    return _mm_set_epi32(x[3], x[2], x[1], x[0]);
}
#endif
156 | |
#if defined(ARCH_64_BIT)
/** \brief Load packed 128-bit state from \a ptr and expand it under mask
 *  \a mvec (64-bit hosts). */
static really_inline
m128 loadcompressed128_64bit(const void *ptr, m128 mvec) {
    // Decompose the mask vector into 64-bit chunks, as the store side does.
    // (These SSE targets are little-endian, so m[0] is the low lane.)
    u64a m[2];
    memcpy(m, &mvec, sizeof(mvec));

    // Number of packed bits contributed by each chunk.
    u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) };
    u64a v[2];

    unpack_bits_64(v, (const u8 *)ptr, bits, 2);

    // Expand each chunk back out under its mask.
    u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) };

    return _mm_set_epi64x(x[1], x[0]);
}
#endif
173 | |
174 | void loadcompressed128(m128 *x, const void *ptr, const m128 *m, |
175 | UNUSED u32 bytes) { |
176 | #if defined(ARCH_64_BIT) |
177 | *x = loadcompressed128_64bit(ptr, *m); |
178 | #else |
179 | *x = loadcompressed128_32bit(ptr, *m); |
180 | #endif |
181 | } |
182 | |
183 | /* |
184 | * 256-bit store/load. |
185 | */ |
186 | |
#if defined(ARCH_32_BIT)
/** \brief Compress a 256-bit state vector as eight 32-bit chunks and write
 *  the packed result to \a ptr (32-bit hosts). */
static really_inline
void storecompressed256_32bit(void *ptr, m256 xvec, m256 mvec) {
    // Decompose the state and mask vectors into 32-bit chunks.
    u32 x[8];
    u32 m[8];
    memcpy(x, &xvec, sizeof(xvec));
    memcpy(m, &mvec, sizeof(mvec));

    // Compress each chunk against its mask, noting how many bits of
    // compressed state each chunk contributes.
    u32 bits[8];
    u32 v[8];
    for (u32 i = 0; i < 8; i++) {
        bits[i] = popcount32(m[i]);
        v[i] = compress32(x[i], m[i]);
    }

    // Pack the compressed chunks out to memory.
    pack_bits_32(ptr, v, bits, 8);
}
#endif
213 | |
#if defined(ARCH_64_BIT)
/** \brief Compress a 256-bit state vector as four 64-bit chunks and write
 *  the packed result to \a ptr (64-bit hosts).
 *
 *  Consistency fix: use really_inline like every other helper in this file
 *  (was really_really_inline; this is an inlining hint only).
 */
static really_inline
void storecompressed256_64bit(void *ptr, m256 xvec, m256 mvec) {
    // First, decompose our vectors into 64-bit chunks.
    u64a x[4];
    memcpy(x, &xvec, sizeof(xvec));
    u64a m[4];
    memcpy(m, &mvec, sizeof(mvec));

    // Count the number of bits of compressed state we're writing out per
    // chunk.
    u32 bits[4] = { popcount64(m[0]), popcount64(m[1]),
                    popcount64(m[2]), popcount64(m[3]) };

    // Compress each 64-bit chunk individually.
    u64a v[4] = { compress64(x[0], m[0]), compress64(x[1], m[1]),
                  compress64(x[2], m[2]), compress64(x[3], m[3]) };

    // Write packed data out.
    pack_bits_64(ptr, v, bits, 4);
}
#endif
236 | |
237 | void storecompressed256(void *ptr, const m256 *x, const m256 *m, |
238 | UNUSED u32 bytes) { |
239 | #if defined(ARCH_64_BIT) |
240 | storecompressed256_64bit(ptr, *x, *m); |
241 | #else |
242 | storecompressed256_32bit(ptr, *x, *m); |
243 | #endif |
244 | } |
245 | |
#if defined(ARCH_32_BIT)
/** \brief Load packed 256-bit state from \a ptr and expand it under mask
 *  \a mvec (32-bit hosts). */
static really_inline
m256 loadcompressed256_32bit(const void *ptr, m256 mvec) {
    // Decompose the mask vector into 32-bit chunks.
    u32 m[8];
    memcpy(m, &mvec, sizeof(mvec));

    // Number of packed bits contributed by each chunk.
    u32 bits[8];
    for (u32 i = 0; i < 8; i++) {
        bits[i] = popcount32(m[i]);
    }
    u32 v[8];

    unpack_bits_32(v, (const u8 *)ptr, bits, 8);

    // Expand each chunk back out under its mask.
    u32 x[8];
    for (u32 i = 0; i < 8; i++) {
        x[i] = expand32(v[i], m[i]);
    }

#if defined(HAVE_AVX2)
    m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4],
                                 x[3], x[2], x[1], x[0]);
#else
    m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]),
                  .hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) };
#endif
    return xvec;
}
#endif
276 | |
#if defined(ARCH_64_BIT)
/** \brief Load packed 256-bit state from \a ptr and expand it under mask
 *  \a mvec (64-bit hosts). */
static really_inline
m256 loadcompressed256_64bit(const void *ptr, m256 mvec) {
    // Decompose the mask vector into 64-bit chunks.
    u64a m[4];
    memcpy(m, &mvec, sizeof(mvec));

    // Number of packed bits contributed by each chunk.
    u32 bits[4];
    for (u32 i = 0; i < 4; i++) {
        bits[i] = popcount64(m[i]);
    }
    u64a v[4];

    unpack_bits_64(v, (const u8 *)ptr, bits, 4);

    // Expand each chunk back out under its mask.
    u64a x[4];
    for (u32 i = 0; i < 4; i++) {
        x[i] = expand64(v[i], m[i]);
    }

#if defined(HAVE_AVX2)
    m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]);
#else
    m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]),
                  .hi = _mm_set_epi64x(x[3], x[2]) };
#endif
    return xvec;
}
#endif
302 | |
303 | void loadcompressed256(m256 *x, const void *ptr, const m256 *m, |
304 | UNUSED u32 bytes) { |
305 | #if defined(ARCH_64_BIT) |
306 | *x = loadcompressed256_64bit(ptr, *m); |
307 | #else |
308 | *x = loadcompressed256_32bit(ptr, *m); |
309 | #endif |
310 | } |
311 | |
312 | /* |
313 | * 384-bit store/load. |
314 | */ |
315 | |
#if defined(ARCH_32_BIT)
/** \brief Compress a 384-bit state vector as twelve 32-bit chunks and write
 *  the packed result to \a ptr (32-bit hosts). */
static really_inline
void storecompressed384_32bit(void *ptr, m384 xvec, m384 mvec) {
    // Decompose the state and mask vectors into 32-bit chunks.
    u32 x[12];
    u32 m[12];
    memcpy(x, &xvec, sizeof(xvec));
    memcpy(m, &mvec, sizeof(mvec));

    // Compress each chunk against its mask, noting how many bits of
    // compressed state each chunk contributes.
    u32 bits[12];
    u32 v[12];
    for (u32 i = 0; i < 12; i++) {
        bits[i] = popcount32(m[i]);
        v[i] = compress32(x[i], m[i]);
    }

    // Pack the compressed chunks out to memory.
    pack_bits_32(ptr, v, bits, 12);
}
#endif
346 | |
#if defined(ARCH_64_BIT)
/** \brief Compress a 384-bit state vector as six 64-bit chunks and write
 *  the packed result to \a ptr (64-bit hosts). */
static really_inline
void storecompressed384_64bit(void *ptr, m384 xvec, m384 mvec) {
    // Decompose the state and mask vectors into 64-bit chunks.
    u64a x[6];
    u64a m[6];
    memcpy(x, &xvec, sizeof(xvec));
    memcpy(m, &mvec, sizeof(mvec));

    // Compress each chunk against its mask, noting how many bits of
    // compressed state each chunk contributes.
    u32 bits[6];
    u64a v[6];
    for (u32 i = 0; i < 6; i++) {
        bits[i] = popcount64(m[i]);
        v[i] = compress64(x[i], m[i]);
    }

    // Pack the compressed chunks out to memory.
    pack_bits_64(ptr, v, bits, 6);
}
#endif
371 | |
372 | void storecompressed384(void *ptr, const m384 *x, const m384 *m, |
373 | UNUSED u32 bytes) { |
374 | #if defined(ARCH_64_BIT) |
375 | storecompressed384_64bit(ptr, *x, *m); |
376 | #else |
377 | storecompressed384_32bit(ptr, *x, *m); |
378 | #endif |
379 | } |
380 | |
#if defined(ARCH_32_BIT)
/** \brief Load packed 384-bit state from \a ptr and expand it under mask
 *  \a mvec (32-bit hosts). */
static really_inline
m384 loadcompressed384_32bit(const void *ptr, m384 mvec) {
    // Decompose the mask vector into 32-bit chunks.
    u32 m[12];
    memcpy(m, &mvec, sizeof(mvec));

    // Number of packed bits contributed by each chunk.
    u32 bits[12];
    for (u32 i = 0; i < 12; i++) {
        bits[i] = popcount32(m[i]);
    }
    u32 v[12];

    unpack_bits_32(v, (const u8 *)ptr, bits, 12);

    // Expand each chunk back out under its mask.
    u32 x[12];
    for (u32 i = 0; i < 12; i++) {
        x[i] = expand32(v[i], m[i]);
    }

    m384 xvec;
    xvec.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]);
    xvec.mid = _mm_set_epi32(x[7], x[6], x[5], x[4]);
    xvec.hi = _mm_set_epi32(x[11], x[10], x[9], x[8]);
    return xvec;
}
#endif
411 | |
#if defined(ARCH_64_BIT)
/** \brief Load packed 384-bit state from \a ptr and expand it under mask
 *  \a mvec (64-bit hosts). */
static really_inline
m384 loadcompressed384_64bit(const void *ptr, m384 mvec) {
    // Decompose the mask vector into 64-bit chunks.
    u64a m[6];
    memcpy(m, &mvec, sizeof(mvec));

    // Number of packed bits contributed by each chunk.
    u32 bits[6];
    for (u32 i = 0; i < 6; i++) {
        bits[i] = popcount64(m[i]);
    }
    u64a v[6];

    unpack_bits_64(v, (const u8 *)ptr, bits, 6);

    // Expand each chunk back out under its mask.
    u64a x[6];
    for (u32 i = 0; i < 6; i++) {
        x[i] = expand64(v[i], m[i]);
    }

    m384 xvec;
    xvec.lo = _mm_set_epi64x(x[1], x[0]);
    xvec.mid = _mm_set_epi64x(x[3], x[2]);
    xvec.hi = _mm_set_epi64x(x[5], x[4]);
    return xvec;
}
#endif
436 | |
437 | void loadcompressed384(m384 *x, const void *ptr, const m384 *m, |
438 | UNUSED u32 bytes) { |
439 | #if defined(ARCH_64_BIT) |
440 | *x = loadcompressed384_64bit(ptr, *m); |
441 | #else |
442 | *x = loadcompressed384_32bit(ptr, *m); |
443 | #endif |
444 | } |
445 | |
446 | /* |
447 | * 512-bit store/load. |
448 | */ |
449 | |
#if defined(ARCH_32_BIT)
/** \brief Compress a 512-bit state vector as sixteen 32-bit chunks and
 *  write the packed result to \a ptr (32-bit hosts). */
static really_inline
void storecompressed512_32bit(void *ptr, m512 xvec, m512 mvec) {
    // Decompose the state and mask vectors into 32-bit chunks.
    u32 x[16];
    u32 m[16];
    memcpy(x, &xvec, sizeof(xvec));
    memcpy(m, &mvec, sizeof(mvec));

    // Compress each chunk against its mask, noting how many bits of
    // compressed state each chunk contributes.
    u32 bits[16];
    u32 v[16];
    for (u32 i = 0; i < 16; i++) {
        bits[i] = popcount32(m[i]);
        v[i] = compress32(x[i], m[i]);
    }

    // Pack the compressed chunks out to memory.
    pack_bits_32(ptr, v, bits, 16);
}
#endif
484 | |
#if defined(ARCH_64_BIT)
/** \brief Compress a 512-bit state vector as eight 64-bit chunks and write
 *  the packed result to \a ptr (64-bit hosts). */
static really_inline
void storecompressed512_64bit(void *ptr, m512 xvec, m512 mvec) {
    // Decompose the state and mask vectors into 64-bit chunks.
    u64a x[8];
    u64a m[8];
    memcpy(x, &xvec, sizeof(xvec));
    memcpy(m, &mvec, sizeof(mvec));

    // Compress each chunk against its mask, noting how many bits of
    // compressed state each chunk contributes.
    u32 bits[8];
    u64a v[8];
    for (u32 i = 0; i < 8; i++) {
        bits[i] = popcount64(m[i]);
        v[i] = compress64(x[i], m[i]);
    }

    // Pack the compressed chunks out to memory.
    pack_bits_64(ptr, v, bits, 8);
}
#endif
511 | |
512 | void storecompressed512(void *ptr, const m512 *x, const m512 *m, |
513 | UNUSED u32 bytes) { |
514 | #if defined(ARCH_64_BIT) |
515 | storecompressed512_64bit(ptr, *x, *m); |
516 | #else |
517 | storecompressed512_32bit(ptr, *x, *m); |
518 | #endif |
519 | } |
520 | |
#if defined(ARCH_32_BIT)
/** \brief Load packed 512-bit state from \a ptr and expand it under mask
 *  \a mvec (32-bit hosts). */
static really_inline
m512 loadcompressed512_32bit(const void *ptr, m512 mvec) {
    // Decompose the mask vector into 32-bit chunks.
    u32 m[16];
    memcpy(m, &mvec, sizeof(mvec));

    // Number of packed bits contributed by each chunk.
    u32 bits[16];
    for (u32 i = 0; i < 16; i++) {
        bits[i] = popcount32(m[i]);
    }
    u32 v[16];

    unpack_bits_32(v, (const u8 *)ptr, bits, 16);

    // Expand each chunk back out under its mask.
    u32 x[16];
    for (u32 i = 0; i < 16; i++) {
        x[i] = expand32(v[i], m[i]);
    }

    // Reassemble the vector using whatever lanes this target provides.
    m512 xvec;
#if defined(HAVE_AVX512)
    xvec = _mm512_set_epi32(x[15], x[14], x[13], x[12],
                            x[11], x[10], x[9], x[8],
                            x[7], x[6], x[5], x[4],
                            x[3], x[2], x[1], x[0]);
#elif defined(HAVE_AVX2)
    xvec.lo = _mm256_set_epi32(x[7], x[6], x[5], x[4],
                               x[3], x[2], x[1], x[0]);
    xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12],
                               x[11], x[10], x[9], x[8]);
#else
    xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]);
    xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]);
    xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]);
    xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]);
#endif
    return xvec;
}
#endif
569 | |
#if defined(ARCH_64_BIT)
/** \brief Load packed 512-bit state from \a ptr and expand it under mask
 *  \a mvec (64-bit hosts). */
static really_inline
m512 loadcompressed512_64bit(const void *ptr, m512 mvec) {
    // Decompose the mask vector into 64-bit chunks.
    u64a m[8];
    memcpy(m, &mvec, sizeof(mvec));

    // Number of packed bits contributed by each chunk.
    u32 bits[8];
    for (u32 i = 0; i < 8; i++) {
        bits[i] = popcount64(m[i]);
    }
    u64a v[8];

    unpack_bits_64(v, (const u8 *)ptr, bits, 8);

    // Expand each chunk back out under its mask.
    u64a x[8];
    for (u32 i = 0; i < 8; i++) {
        x[i] = expand64(v[i], m[i]);
    }

    // Reassemble the vector using whatever lanes this target provides.
#if defined(HAVE_AVX512)
    m512 xvec = _mm512_set_epi64(x[7], x[6], x[5], x[4],
                                 x[3], x[2], x[1], x[0]);
#elif defined(HAVE_AVX2)
    m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]),
                  .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])};
#else
    m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]),
                          _mm_set_epi64x(x[3], x[2]) },
                  .hi = { _mm_set_epi64x(x[5], x[4]),
                          _mm_set_epi64x(x[7], x[6]) } };
#endif
    return xvec;
}
#endif
605 | |
606 | void loadcompressed512(m512 *x, const void *ptr, const m512 *m, |
607 | UNUSED u32 bytes) { |
608 | #if defined(ARCH_64_BIT) |
609 | *x = loadcompressed512_64bit(ptr, *m); |
610 | #else |
611 | *x = loadcompressed512_32bit(ptr, *m); |
612 | #endif |
613 | } |
614 | |