#ifndef BITSET_UTIL_H
#define BITSET_UTIL_H

#include <stdbool.h>  // for bool (portability.h may also provide it)
#include <stdint.h>

#include <roaring/portability.h>
#include <roaring/utilasm.h>
/*
 * Set all bits in indexes [start,end) to true.
 */
static inline void bitset_set_range(uint64_t *bitmap, uint32_t start,
                                    uint32_t end) {
    if (start == end) return;
    uint32_t firstword = start / 64;
    uint32_t endword = (end - 1) / 64;
    if (firstword == endword) {
        bitmap[firstword] |= ((~UINT64_C(0)) << (start % 64)) &
                             ((~UINT64_C(0)) >> ((~end + 1) % 64));
        return;
    }
    bitmap[firstword] |= (~UINT64_C(0)) << (start % 64);
    for (uint32_t i = firstword + 1; i < endword; i++) bitmap[i] = ~UINT64_C(0);
    bitmap[endword] |= (~UINT64_C(0)) >> ((~end + 1) % 64);
}
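
/*
 * Usage sketch (illustrative, not part of the library): the caller owns the
 * word array and must make sure it covers the whole range.
 *
 *     uint64_t words[4] = {0};           // 256 bits, all clear
 *     bitset_set_range(words, 10, 130);  // set bits 10..129
 *     // words[0] now has bits 10..63 set, words[1] is all ones,
 *     // words[2] has bits 0..1 set, words[3] is untouched.
 */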

/*
 * Compute the cardinality of the bitset in [start,start+lenminusone].
 */
static inline int bitset_lenrange_cardinality(uint64_t *bitmap, uint32_t start,
                                              uint32_t lenminusone) {
    uint32_t firstword = start / 64;
    uint32_t endword = (start + lenminusone) / 64;
    if (firstword == endword) {
        return hamming(bitmap[firstword] &
                       ((~UINT64_C(0)) >> ((63 - lenminusone) % 64))
                           << (start % 64));
    }
    int answer = hamming(bitmap[firstword] & ((~UINT64_C(0)) << (start % 64)));
    for (uint32_t i = firstword + 1; i < endword; i++) {
        answer += hamming(bitmap[i]);
    }
    answer +=
        hamming(bitmap[endword] &
                (~UINT64_C(0)) >> (((~start + 1) - lenminusone - 1) % 64));
    return answer;
}
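
/*
 * Example (illustrative): continuing the sketch above,
 *
 *     int c = bitset_lenrange_cardinality(words, 10, 119);
 *     // counts bits 10..129, so c == 120
 *
 * Note that the third argument is the range length minus one, not an end
 * index.
 */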

/*
 * Check whether the bitset is empty in [start,start+lenminusone], i.e.,
 * whether its cardinality over that range is 0.
 */
static inline bool bitset_lenrange_empty(uint64_t *bitmap, uint32_t start,
                                         uint32_t lenminusone) {
    uint32_t firstword = start / 64;
    uint32_t endword = (start + lenminusone) / 64;
    if (firstword == endword) {
        return (bitmap[firstword] &
                ((~UINT64_C(0)) >> ((63 - lenminusone) % 64))
                    << (start % 64)) == 0;
    }
    if ((bitmap[firstword] & ((~UINT64_C(0)) << (start % 64))) != 0)
        return false;
    for (uint32_t i = firstword + 1; i < endword; i++) {
        if (bitmap[i] != 0) return false;
    }
    if ((bitmap[endword] &
         (~UINT64_C(0)) >> (((~start + 1) - lenminusone - 1) % 64)) != 0)
        return false;
    return true;
}
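
/*
 * Example (illustrative):
 *
 *     uint64_t w[2] = {0};
 *     bool e1 = bitset_lenrange_empty(w, 0, 127);  // true: nothing set
 *     w[1] = UINT64_C(1) << 63;                    // set bit 127
 *     bool e2 = bitset_lenrange_empty(w, 0, 127);  // false
 */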

/*
 * Set all bits in indexes [start,start+lenminusone] to true.
 */
static inline void bitset_set_lenrange(uint64_t *bitmap, uint32_t start,
                                       uint32_t lenminusone) {
    uint32_t firstword = start / 64;
    uint32_t endword = (start + lenminusone) / 64;
    if (firstword == endword) {
        bitmap[firstword] |= ((~UINT64_C(0)) >> ((63 - lenminusone) % 64))
                             << (start % 64);
        return;
    }
    // Save the last word: the unrolled loop below writes two words per
    // iteration and may run one word past the interior range, clobbering
    // bitmap[endword]; the saved value is merged back in afterwards.
    uint64_t temp = bitmap[endword];
    bitmap[firstword] |= (~UINT64_C(0)) << (start % 64);
    for (uint32_t i = firstword + 1; i < endword; i += 2)
        bitmap[i] = bitmap[i + 1] = ~UINT64_C(0);
    bitmap[endword] =
        temp | (~UINT64_C(0)) >> (((~start + 1) - lenminusone - 1) % 64);
}
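
/*
 * Example (illustrative): given a sufficiently large word array w, this call
 * is equivalent to bitset_set_range(w, 10, 130):
 *
 *     bitset_set_lenrange(w, 10, 119);  // set bits 10..129
 */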

/*
 * Flip all the bits in indexes [start,end).
 */
static inline void bitset_flip_range(uint64_t *bitmap, uint32_t start,
                                     uint32_t end) {
    if (start == end) return;
    uint32_t firstword = start / 64;
    uint32_t endword = (end - 1) / 64;
    // Bits below "start" are flipped here and flipped a second time by the
    // updates below, so the net effect only flips bits in [start,end).
    bitmap[firstword] ^= ~((~UINT64_C(0)) << (start % 64));
    for (uint32_t i = firstword; i < endword; i++) bitmap[i] = ~bitmap[i];
    bitmap[endword] ^= ((~UINT64_C(0)) >> ((~end + 1) % 64));
}

/*
 * Set all bits in indexes [start,end) to false.
 */
static inline void bitset_reset_range(uint64_t *bitmap, uint32_t start,
                                      uint32_t end) {
    if (start == end) return;
    uint32_t firstword = start / 64;
    uint32_t endword = (end - 1) / 64;
    if (firstword == endword) {
        bitmap[firstword] &= ~(((~UINT64_C(0)) << (start % 64)) &
                               ((~UINT64_C(0)) >> ((~end + 1) % 64)));
        return;
    }
    bitmap[firstword] &= ~((~UINT64_C(0)) << (start % 64));
    for (uint32_t i = firstword + 1; i < endword; i++) bitmap[i] = UINT64_C(0);
    bitmap[endword] &= ~((~UINT64_C(0)) >> ((~end + 1) % 64));
}
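
/*
 * Example (illustrative) combining the range updates above:
 *
 *     uint64_t w[2] = {0};
 *     bitset_set_range(w, 0, 128);    // set all 128 bits
 *     bitset_reset_range(w, 32, 96);  // clear bits 32..95
 *     bitset_flip_range(w, 0, 64);    // word 0: bits 0..31 -> 0, 32..63 -> 1
 */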

/*
 * Given a bitset containing "length" 64-bit words, write out the positions
 * of all the set bits to "out"; values start at "base". At most
 * "outcapacity" values are written.
 *
 * The "out" pointer should have enough capacity for the actual number of
 * bits set.
 *
 * Returns how many values were actually decoded.
 *
 * This function should only be expected to be faster than
 * bitset_extract_setbits when the density of the bitset is high.
 *
 * This function uses AVX2 decoding.
 */
size_t bitset_extract_setbits_avx2(uint64_t *bitset, size_t length, void *vout,
                                   size_t outcapacity, uint32_t base);

/*
 * Given a bitset containing "length" 64-bit words, write out the positions
 * of all the set bits to "out"; values start at "base".
 *
 * The "out" pointer should have enough capacity for the actual number of
 * bits set.
 *
 * Returns how many values were actually decoded.
 */
size_t bitset_extract_setbits(uint64_t *bitset, size_t length, void *vout,
                              uint32_t base);
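
/*
 * Example (illustrative): "vout" receives 32-bit values.
 *
 *     uint64_t words[2] = {UINT64_C(0x5), 0};  // bits 0 and 2 set
 *     uint32_t out[128];
 *     size_t n = bitset_extract_setbits(words, 2, out, 100);
 *     // n == 2, out[0] == 100, out[1] == 102
 */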

/*
 * Given a bitset containing "length" 64-bit words, write out the positions
 * of all the set bits to "out" as 16-bit integers; values start at "base"
 * (which can be set to zero). At most "outcapacity" values are written.
 *
 * The "out" pointer should have enough capacity for the actual number of
 * bits set.
 *
 * Returns how many values were actually decoded.
 *
 * This function should only be expected to be faster than
 * bitset_extract_setbits_uint16 when the density of the bitset is high.
 *
 * This function uses SSE decoding.
 */
size_t bitset_extract_setbits_sse_uint16(const uint64_t *bitset, size_t length,
                                         uint16_t *out, size_t outcapacity,
                                         uint16_t base);

/*
 * Given a bitset containing "length" 64-bit words, write out the positions
 * of all the set bits to "out" as 16-bit integers; values start at "base"
 * (which can be set to zero).
 *
 * The "out" pointer should have enough capacity for the actual number of
 * bits set.
 *
 * Returns how many values were actually decoded.
 */
size_t bitset_extract_setbits_uint16(const uint64_t *bitset, size_t length,
                                     uint16_t *out, uint16_t base);

/*
 * Given two bitsets containing "length" 64-bit words, write out the positions
 * of all the common set bits to "out"; values start at "base"
 * (which can be set to zero).
 *
 * The "out" pointer should have enough capacity for the actual number of
 * bits set.
 *
 * Returns how many values were actually decoded.
 */
size_t bitset_extract_intersection_setbits_uint16(
    const uint64_t *__restrict__ bitset1, const uint64_t *__restrict__ bitset2,
    size_t length, uint16_t *out, uint16_t base);
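
/*
 * Example (illustrative):
 *
 *     uint64_t a[1] = {UINT64_C(0xF0)};  // bits 4..7
 *     uint64_t b[1] = {UINT64_C(0x30)};  // bits 4..5
 *     uint16_t out[64];
 *     size_t n = bitset_extract_intersection_setbits_uint16(a, b, 1, out, 0);
 *     // n == 2, out[0] == 4, out[1] == 5
 */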

/*
 * Given a bitset of cardinality "card", set all the bit values given in the
 * list (there are "length" of them) and return the updated cardinality.
 * This assumes that the bitset is already allocated and contains valid data.
 */
uint64_t bitset_set_list_withcard(void *bitset, uint64_t card,
                                  const uint16_t *list, uint64_t length);

/*
 * Given a bitset, set all the bit values given in the list (there are
 * "length" of them).
 */
void bitset_set_list(void *bitset, const uint16_t *list, uint64_t length);
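
/*
 * Example (illustrative): a 2^16-bit container stored as 1024 words.
 *
 *     uint64_t w[1024] = {0};
 *     const uint16_t vals[] = {1, 2, 1000};
 *     uint64_t card = bitset_set_list_withcard(w, 0, vals, 3);
 *     // card == 3; setting an already-set bit would not increase it
 */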

/*
 * Given a bitset of cardinality "card", unset all the bit values given in the
 * list (there are "length" of them) and return the updated cardinality.
 * This assumes that the bitset is already allocated and contains valid data.
 */
uint64_t bitset_clear_list(void *bitset, uint64_t card, const uint16_t *list,
                           uint64_t length);

/*
 * Given a bitset of cardinality "card", toggle all the bit values given in the
 * list (there are "length" of them) and return the updated cardinality.
 * This assumes that the bitset is already allocated and contains valid data.
 */
uint64_t bitset_flip_list_withcard(void *bitset, uint64_t card,
                                   const uint16_t *list, uint64_t length);

/*
 * Given a bitset, toggle all the bit values given in the list (there are
 * "length" of them).
 */
void bitset_flip_list(void *bitset, const uint16_t *list, uint64_t length);

#ifdef USEAVX
/***
 * BEGIN Harley-Seal popcount functions.
 */

/**
 * Compute the population count of a 256-bit word.
 * This is not especially fast, but it is convenient as part of other
 * functions.
 */
static inline __m256i popcount256(__m256i v) {
    // lookuppos[i] = 4 + popcount(i) for each nibble value i.
    const __m256i lookuppos = _mm256_setr_epi8(
        /* 0 */ 4 + 0, /* 1 */ 4 + 1, /* 2 */ 4 + 1, /* 3 */ 4 + 2,
        /* 4 */ 4 + 1, /* 5 */ 4 + 2, /* 6 */ 4 + 2, /* 7 */ 4 + 3,
        /* 8 */ 4 + 1, /* 9 */ 4 + 2, /* a */ 4 + 2, /* b */ 4 + 3,
        /* c */ 4 + 2, /* d */ 4 + 3, /* e */ 4 + 3, /* f */ 4 + 4,

        /* 0 */ 4 + 0, /* 1 */ 4 + 1, /* 2 */ 4 + 1, /* 3 */ 4 + 2,
        /* 4 */ 4 + 1, /* 5 */ 4 + 2, /* 6 */ 4 + 2, /* 7 */ 4 + 3,
        /* 8 */ 4 + 1, /* 9 */ 4 + 2, /* a */ 4 + 2, /* b */ 4 + 3,
        /* c */ 4 + 2, /* d */ 4 + 3, /* e */ 4 + 3, /* f */ 4 + 4);
    // lookupneg[i] = 4 - popcount(i) for each nibble value i.
    const __m256i lookupneg = _mm256_setr_epi8(
        /* 0 */ 4 - 0, /* 1 */ 4 - 1, /* 2 */ 4 - 1, /* 3 */ 4 - 2,
        /* 4 */ 4 - 1, /* 5 */ 4 - 2, /* 6 */ 4 - 2, /* 7 */ 4 - 3,
        /* 8 */ 4 - 1, /* 9 */ 4 - 2, /* a */ 4 - 2, /* b */ 4 - 3,
        /* c */ 4 - 2, /* d */ 4 - 3, /* e */ 4 - 3, /* f */ 4 - 4,

        /* 0 */ 4 - 0, /* 1 */ 4 - 1, /* 2 */ 4 - 1, /* 3 */ 4 - 2,
        /* 4 */ 4 - 1, /* 5 */ 4 - 2, /* 6 */ 4 - 2, /* 7 */ 4 - 3,
        /* 8 */ 4 - 1, /* 9 */ 4 - 2, /* a */ 4 - 2, /* b */ 4 - 3,
        /* c */ 4 - 2, /* d */ 4 - 3, /* e */ 4 - 3, /* f */ 4 - 4);
    const __m256i low_mask = _mm256_set1_epi8(0x0f);

    const __m256i lo = _mm256_and_si256(v, low_mask);
    const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask);
    const __m256i popcnt1 = _mm256_shuffle_epi8(lookuppos, lo);
    const __m256i popcnt2 = _mm256_shuffle_epi8(lookupneg, hi);
    return _mm256_sad_epu8(popcnt1, popcnt2);
}
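
/*
 * Why two lookup tables: per byte, popcnt1 holds 4 + popcount(low nibble)
 * and popcnt2 holds 4 - popcount(high nibble), so _mm256_sad_epu8 computes
 * |popcnt1 - popcnt2| = popcount(low) + popcount(high) and sums it over each
 * 8-byte lane, leaving four 64-bit partial counts.
 */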

/**
 * Simple carry-save adder (CSA) over 256 bits.
 */
static inline void CSA(__m256i *h, __m256i *l, __m256i a, __m256i b,
                       __m256i c) {
    const __m256i u = _mm256_xor_si256(a, b);
    *h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c));
    *l = _mm256_xor_si256(u, c);
}
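
/*
 * Scalar sketch of the same idea (illustrative; csa64 is a hypothetical
 * helper, not part of the library):
 *
 *     static inline void csa64(uint64_t *h, uint64_t *l,
 *                              uint64_t a, uint64_t b, uint64_t c) {
 *         uint64_t u = a ^ b;
 *         *h = (a & b) | (u & c);  // carry: majority(a, b, c)
 *         *l = u ^ c;              // sum:   a XOR b XOR c
 *     }
 *
 * Per bit position a + b + c == 2 * h + l, so
 * popcount(a) + popcount(b) + popcount(c) == 2 * popcount(h) + popcount(l),
 * which is what lets Harley-Seal accumulate counts at weights
 * 1, 2, 4, 8 and 16 below.
 */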

/**
 * Fast Harley-Seal AVX population count function.
 */
inline static uint64_t avx2_harley_seal_popcount256(const __m256i *data,
                                                    const uint64_t size) {
    __m256i total = _mm256_setzero_si256();
    __m256i ones = _mm256_setzero_si256();
    __m256i twos = _mm256_setzero_si256();
    __m256i fours = _mm256_setzero_si256();
    __m256i eights = _mm256_setzero_si256();
    __m256i sixteens = _mm256_setzero_si256();
    __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;

    const uint64_t limit = size - size % 16;
    uint64_t i = 0;

    for (; i < limit; i += 16) {
        CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i),
            _mm256_lddqu_si256(data + i + 1));
        CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 2),
            _mm256_lddqu_si256(data + i + 3));
        CSA(&foursA, &twos, twos, twosA, twosB);
        CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 4),
            _mm256_lddqu_si256(data + i + 5));
        CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 6),
            _mm256_lddqu_si256(data + i + 7));
        CSA(&foursB, &twos, twos, twosA, twosB);
        CSA(&eightsA, &fours, fours, foursA, foursB);
        CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 8),
            _mm256_lddqu_si256(data + i + 9));
        CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 10),
            _mm256_lddqu_si256(data + i + 11));
        CSA(&foursA, &twos, twos, twosA, twosB);
        CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 12),
            _mm256_lddqu_si256(data + i + 13));
        CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 14),
            _mm256_lddqu_si256(data + i + 15));
        CSA(&foursB, &twos, twos, twosA, twosB);
        CSA(&eightsB, &fours, fours, foursA, foursB);
        CSA(&sixteens, &eights, eights, eightsA, eightsB);

        total = _mm256_add_epi64(total, popcount256(sixteens));
    }

    total = _mm256_slli_epi64(total, 4);  // * 16
    total = _mm256_add_epi64(
        total, _mm256_slli_epi64(popcount256(eights), 3));  // += 8 * ...
    total = _mm256_add_epi64(
        total, _mm256_slli_epi64(popcount256(fours), 2));  // += 4 * ...
    total = _mm256_add_epi64(
        total, _mm256_slli_epi64(popcount256(twos), 1));  // += 2 * ...
    total = _mm256_add_epi64(total, popcount256(ones));
    for (; i < size; i++)
        total =
            _mm256_add_epi64(total, popcount256(_mm256_lddqu_si256(data + i)));

    return (uint64_t)(_mm256_extract_epi64(total, 0)) +
           (uint64_t)(_mm256_extract_epi64(total, 1)) +
           (uint64_t)(_mm256_extract_epi64(total, 2)) +
           (uint64_t)(_mm256_extract_epi64(total, 3));
}
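
/*
 * Usage sketch (illustrative): count the set bits of 1024 64-bit words
 * (= 256 AVX2 words). Alignment is not required since the loads use
 * _mm256_lddqu_si256.
 *
 *     uint64_t buf[1024];
 *     // ... fill buf ...
 *     uint64_t count =
 *         avx2_harley_seal_popcount256((const __m256i *)buf, 1024 / 4);
 */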

#define AVXPOPCNTFNC(opname, avx_intrinsic) \
    static inline uint64_t avx2_harley_seal_popcount256_##opname( \
        const __m256i *data1, const __m256i *data2, const uint64_t size) { \
        __m256i total = _mm256_setzero_si256(); \
        __m256i ones = _mm256_setzero_si256(); \
        __m256i twos = _mm256_setzero_si256(); \
        __m256i fours = _mm256_setzero_si256(); \
        __m256i eights = _mm256_setzero_si256(); \
        __m256i sixteens = _mm256_setzero_si256(); \
        __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; \
        __m256i A1, A2; \
        const uint64_t limit = size - size % 16; \
        uint64_t i = 0; \
        for (; i < limit; i += 16) { \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i), \
                               _mm256_lddqu_si256(data2 + i)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 1), \
                               _mm256_lddqu_si256(data2 + i + 1)); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 2), \
                               _mm256_lddqu_si256(data2 + i + 2)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 3), \
                               _mm256_lddqu_si256(data2 + i + 3)); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursA, &twos, twos, twosA, twosB); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 4), \
                               _mm256_lddqu_si256(data2 + i + 4)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 5), \
                               _mm256_lddqu_si256(data2 + i + 5)); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 6), \
                               _mm256_lddqu_si256(data2 + i + 6)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 7), \
                               _mm256_lddqu_si256(data2 + i + 7)); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursB, &twos, twos, twosA, twosB); \
            CSA(&eightsA, &fours, fours, foursA, foursB); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 8), \
                               _mm256_lddqu_si256(data2 + i + 8)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 9), \
                               _mm256_lddqu_si256(data2 + i + 9)); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 10), \
                               _mm256_lddqu_si256(data2 + i + 10)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 11), \
                               _mm256_lddqu_si256(data2 + i + 11)); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursA, &twos, twos, twosA, twosB); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 12), \
                               _mm256_lddqu_si256(data2 + i + 12)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 13), \
                               _mm256_lddqu_si256(data2 + i + 13)); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 14), \
                               _mm256_lddqu_si256(data2 + i + 14)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 15), \
                               _mm256_lddqu_si256(data2 + i + 15)); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursB, &twos, twos, twosA, twosB); \
            CSA(&eightsB, &fours, fours, foursA, foursB); \
            CSA(&sixteens, &eights, eights, eightsA, eightsB); \
            total = _mm256_add_epi64(total, popcount256(sixteens)); \
        } \
        total = _mm256_slli_epi64(total, 4); \
        total = _mm256_add_epi64(total, \
                                 _mm256_slli_epi64(popcount256(eights), 3)); \
        total = \
            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(fours), 2)); \
        total = \
            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(twos), 1)); \
        total = _mm256_add_epi64(total, popcount256(ones)); \
        for (; i < size; i++) { \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i), \
                               _mm256_lddqu_si256(data2 + i)); \
            total = _mm256_add_epi64(total, popcount256(A1)); \
        } \
        return (uint64_t)(_mm256_extract_epi64(total, 0)) + \
               (uint64_t)(_mm256_extract_epi64(total, 1)) + \
               (uint64_t)(_mm256_extract_epi64(total, 2)) + \
               (uint64_t)(_mm256_extract_epi64(total, 3)); \
    } \
    static inline uint64_t avx2_harley_seal_popcount256andstore_##opname( \
        const __m256i *__restrict__ data1, const __m256i *__restrict__ data2, \
        __m256i *__restrict__ out, const uint64_t size) { \
        __m256i total = _mm256_setzero_si256(); \
        __m256i ones = _mm256_setzero_si256(); \
        __m256i twos = _mm256_setzero_si256(); \
        __m256i fours = _mm256_setzero_si256(); \
        __m256i eights = _mm256_setzero_si256(); \
        __m256i sixteens = _mm256_setzero_si256(); \
        __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; \
        __m256i A1, A2; \
        const uint64_t limit = size - size % 16; \
        uint64_t i = 0; \
        for (; i < limit; i += 16) { \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i), \
                               _mm256_lddqu_si256(data2 + i)); \
            _mm256_storeu_si256(out + i, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 1), \
                               _mm256_lddqu_si256(data2 + i + 1)); \
            _mm256_storeu_si256(out + i + 1, A2); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 2), \
                               _mm256_lddqu_si256(data2 + i + 2)); \
            _mm256_storeu_si256(out + i + 2, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 3), \
                               _mm256_lddqu_si256(data2 + i + 3)); \
            _mm256_storeu_si256(out + i + 3, A2); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursA, &twos, twos, twosA, twosB); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 4), \
                               _mm256_lddqu_si256(data2 + i + 4)); \
            _mm256_storeu_si256(out + i + 4, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 5), \
                               _mm256_lddqu_si256(data2 + i + 5)); \
            _mm256_storeu_si256(out + i + 5, A2); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 6), \
                               _mm256_lddqu_si256(data2 + i + 6)); \
            _mm256_storeu_si256(out + i + 6, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 7), \
                               _mm256_lddqu_si256(data2 + i + 7)); \
            _mm256_storeu_si256(out + i + 7, A2); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursB, &twos, twos, twosA, twosB); \
            CSA(&eightsA, &fours, fours, foursA, foursB); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 8), \
                               _mm256_lddqu_si256(data2 + i + 8)); \
            _mm256_storeu_si256(out + i + 8, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 9), \
                               _mm256_lddqu_si256(data2 + i + 9)); \
            _mm256_storeu_si256(out + i + 9, A2); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 10), \
                               _mm256_lddqu_si256(data2 + i + 10)); \
            _mm256_storeu_si256(out + i + 10, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 11), \
                               _mm256_lddqu_si256(data2 + i + 11)); \
            _mm256_storeu_si256(out + i + 11, A2); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursA, &twos, twos, twosA, twosB); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 12), \
                               _mm256_lddqu_si256(data2 + i + 12)); \
            _mm256_storeu_si256(out + i + 12, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 13), \
                               _mm256_lddqu_si256(data2 + i + 13)); \
            _mm256_storeu_si256(out + i + 13, A2); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 14), \
                               _mm256_lddqu_si256(data2 + i + 14)); \
            _mm256_storeu_si256(out + i + 14, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 15), \
                               _mm256_lddqu_si256(data2 + i + 15)); \
            _mm256_storeu_si256(out + i + 15, A2); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursB, &twos, twos, twosA, twosB); \
            CSA(&eightsB, &fours, fours, foursA, foursB); \
            CSA(&sixteens, &eights, eights, eightsA, eightsB); \
            total = _mm256_add_epi64(total, popcount256(sixteens)); \
        } \
        total = _mm256_slli_epi64(total, 4); \
        total = _mm256_add_epi64(total, \
                                 _mm256_slli_epi64(popcount256(eights), 3)); \
        total = \
            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(fours), 2)); \
        total = \
            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(twos), 1)); \
        total = _mm256_add_epi64(total, popcount256(ones)); \
        for (; i < size; i++) { \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i), \
                               _mm256_lddqu_si256(data2 + i)); \
            _mm256_storeu_si256(out + i, A1); \
            total = _mm256_add_epi64(total, popcount256(A1)); \
        } \
        return (uint64_t)(_mm256_extract_epi64(total, 0)) + \
               (uint64_t)(_mm256_extract_epi64(total, 1)) + \
               (uint64_t)(_mm256_extract_epi64(total, 2)) + \
               (uint64_t)(_mm256_extract_epi64(total, 3)); \
    }

AVXPOPCNTFNC(or, _mm256_or_si256)
AVXPOPCNTFNC(union, _mm256_or_si256)
AVXPOPCNTFNC(and, _mm256_and_si256)
AVXPOPCNTFNC(intersection, _mm256_and_si256)
AVXPOPCNTFNC(xor, _mm256_xor_si256)
AVXPOPCNTFNC(andnot, _mm256_andnot_si256)
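
/*
 * The expansions above generate pairs such as
 * avx2_harley_seal_popcount256_and(data1, data2, size) and
 * avx2_harley_seal_popcount256andstore_and(data1, data2, out, size).
 * Usage sketch (illustrative):
 *
 *     uint64_t a[256], b[256], c[256];
 *     // ... fill a and b ...
 *     uint64_t inter_card = avx2_harley_seal_popcount256andstore_intersection(
 *         (const __m256i *)a, (const __m256i *)b, (__m256i *)c, 256 / 4);
 *     // c now holds the bitwise AND of a and b; inter_card is its popcount.
 */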

/***
 * END Harley-Seal popcount functions.
 */

#endif  // USEAVX

#endif  // BITSET_UTIL_H