1 | #ifndef BITSET_UTIL_H |
2 | #define BITSET_UTIL_H |
3 | |
4 | #include <stdint.h> |
5 | |
6 | #include <roaring/portability.h> |
7 | #include <roaring/utilasm.h> |
8 | |
9 | /* |
10 | * Set all bits in indexes [begin,end) to true. |
11 | */ |
12 | static inline void bitset_set_range(uint64_t *bitmap, uint32_t start, |
13 | uint32_t end) { |
14 | if (start == end) return; |
15 | uint32_t firstword = start / 64; |
16 | uint32_t endword = (end - 1) / 64; |
17 | if (firstword == endword) { |
18 | bitmap[firstword] |= ((~UINT64_C(0)) << (start % 64)) & |
19 | ((~UINT64_C(0)) >> ((~end + 1) % 64)); |
20 | return; |
21 | } |
22 | bitmap[firstword] |= (~UINT64_C(0)) << (start % 64); |
23 | for (uint32_t i = firstword + 1; i < endword; i++) bitmap[i] = ~UINT64_C(0); |
24 | bitmap[endword] |= (~UINT64_C(0)) >> ((~end + 1) % 64); |
25 | } |
26 | |
27 | |
28 | /* |
29 | * Find the cardinality of the bitset in [begin,begin+lenminusone] |
30 | */ |
31 | static inline int bitset_lenrange_cardinality(uint64_t *bitmap, uint32_t start, |
32 | uint32_t lenminusone) { |
33 | uint32_t firstword = start / 64; |
34 | uint32_t endword = (start + lenminusone) / 64; |
35 | if (firstword == endword) { |
36 | return hamming(bitmap[firstword] & |
37 | ((~UINT64_C(0)) >> ((63 - lenminusone) % 64)) |
38 | << (start % 64)); |
39 | } |
40 | int answer = hamming(bitmap[firstword] & ((~UINT64_C(0)) << (start % 64))); |
41 | for (uint32_t i = firstword + 1; i < endword; i++) { |
42 | answer += hamming(bitmap[i]); |
43 | } |
44 | answer += |
45 | hamming(bitmap[endword] & |
46 | (~UINT64_C(0)) >> (((~start + 1) - lenminusone - 1) % 64)); |
47 | return answer; |
48 | } |
49 | |
50 | /* |
51 | * Check whether the cardinality of the bitset in [begin,begin+lenminusone] is 0 |
52 | */ |
53 | static inline bool bitset_lenrange_empty(uint64_t *bitmap, uint32_t start, |
54 | uint32_t lenminusone) { |
55 | uint32_t firstword = start / 64; |
56 | uint32_t endword = (start + lenminusone) / 64; |
57 | if (firstword == endword) { |
58 | return (bitmap[firstword] & ((~UINT64_C(0)) >> ((63 - lenminusone) % 64)) |
59 | << (start % 64)) == 0; |
60 | } |
61 | if(((bitmap[firstword] & ((~UINT64_C(0)) << (start%64)))) != 0) return false; |
62 | for (uint32_t i = firstword + 1; i < endword; i++) { |
63 | if(bitmap[i] != 0) return false; |
64 | } |
65 | if((bitmap[endword] & (~UINT64_C(0)) >> (((~start + 1) - lenminusone - 1) % 64)) != 0) return false; |
66 | return true; |
67 | } |
68 | |
69 | |
70 | /* |
71 | * Set all bits in indexes [begin,begin+lenminusone] to true. |
72 | */ |
73 | static inline void bitset_set_lenrange(uint64_t *bitmap, uint32_t start, |
74 | uint32_t lenminusone) { |
75 | uint32_t firstword = start / 64; |
76 | uint32_t endword = (start + lenminusone) / 64; |
77 | if (firstword == endword) { |
78 | bitmap[firstword] |= ((~UINT64_C(0)) >> ((63 - lenminusone) % 64)) |
79 | << (start % 64); |
80 | return; |
81 | } |
82 | uint64_t temp = bitmap[endword]; |
83 | bitmap[firstword] |= (~UINT64_C(0)) << (start % 64); |
84 | for (uint32_t i = firstword + 1; i < endword; i += 2) |
85 | bitmap[i] = bitmap[i + 1] = ~UINT64_C(0); |
86 | bitmap[endword] = |
87 | temp | (~UINT64_C(0)) >> (((~start + 1) - lenminusone - 1) % 64); |
88 | } |
89 | |
90 | /* |
91 | * Flip all the bits in indexes [begin,end). |
92 | */ |
93 | static inline void bitset_flip_range(uint64_t *bitmap, uint32_t start, |
94 | uint32_t end) { |
95 | if (start == end) return; |
96 | uint32_t firstword = start / 64; |
97 | uint32_t endword = (end - 1) / 64; |
98 | bitmap[firstword] ^= ~((~UINT64_C(0)) << (start % 64)); |
99 | for (uint32_t i = firstword; i < endword; i++) bitmap[i] = ~bitmap[i]; |
100 | bitmap[endword] ^= ((~UINT64_C(0)) >> ((~end + 1) % 64)); |
101 | } |
102 | |
103 | /* |
104 | * Set all bits in indexes [begin,end) to false. |
105 | */ |
106 | static inline void bitset_reset_range(uint64_t *bitmap, uint32_t start, |
107 | uint32_t end) { |
108 | if (start == end) return; |
109 | uint32_t firstword = start / 64; |
110 | uint32_t endword = (end - 1) / 64; |
111 | if (firstword == endword) { |
112 | bitmap[firstword] &= ~(((~UINT64_C(0)) << (start % 64)) & |
113 | ((~UINT64_C(0)) >> ((~end + 1) % 64))); |
114 | return; |
115 | } |
116 | bitmap[firstword] &= ~((~UINT64_C(0)) << (start % 64)); |
117 | for (uint32_t i = firstword + 1; i < endword; i++) bitmap[i] = UINT64_C(0); |
118 | bitmap[endword] &= ~((~UINT64_C(0)) >> ((~end + 1) % 64)); |
119 | } |
120 | |
121 | /* |
122 | * Given a bitset containing "length" 64-bit words, write out the position |
123 | * of all the set bits to "out", values start at "base". |
124 | * |
125 | * The "out" pointer should be sufficient to store the actual number of bits |
126 | * set. |
127 | * |
128 | * Returns how many values were actually decoded. |
129 | * |
130 | * This function should only be expected to be faster than |
131 | * bitset_extract_setbits |
132 | * when the density of the bitset is high. |
133 | * |
134 | * This function uses AVX2 decoding. |
135 | */ |
136 | size_t (uint64_t *bitset, size_t length, void *vout, |
137 | size_t outcapacity, uint32_t base); |
138 | |
139 | /* |
140 | * Given a bitset containing "length" 64-bit words, write out the position |
141 | * of all the set bits to "out", values start at "base". |
142 | * |
143 | * The "out" pointer should be sufficient to store the actual number of bits |
144 | *set. |
145 | * |
146 | * Returns how many values were actually decoded. |
147 | */ |
148 | size_t (uint64_t *bitset, size_t length, void *vout, |
149 | uint32_t base); |
150 | |
151 | /* |
152 | * Given a bitset containing "length" 64-bit words, write out the position |
153 | * of all the set bits to "out" as 16-bit integers, values start at "base" (can |
154 | *be set to zero) |
155 | * |
156 | * The "out" pointer should be sufficient to store the actual number of bits |
157 | *set. |
158 | * |
159 | * Returns how many values were actually decoded. |
160 | * |
161 | * This function should only be expected to be faster than |
162 | *bitset_extract_setbits_uint16 |
163 | * when the density of the bitset is high. |
164 | * |
165 | * This function uses SSE decoding. |
166 | */ |
167 | size_t (const uint64_t *bitset, size_t length, |
168 | uint16_t *out, size_t outcapacity, |
169 | uint16_t base); |
170 | |
171 | /* |
172 | * Given a bitset containing "length" 64-bit words, write out the position |
173 | * of all the set bits to "out", values start at "base" |
174 | * (can be set to zero) |
175 | * |
176 | * The "out" pointer should be sufficient to store the actual number of bits |
177 | *set. |
178 | * |
179 | * Returns how many values were actually decoded. |
180 | */ |
181 | size_t (const uint64_t *bitset, size_t length, |
182 | uint16_t *out, uint16_t base); |
183 | |
184 | /* |
185 | * Given two bitsets containing "length" 64-bit words, write out the position |
186 | * of all the common set bits to "out", values start at "base" |
187 | * (can be set to zero) |
188 | * |
189 | * The "out" pointer should be sufficient to store the actual number of bits |
190 | * set. |
191 | * |
192 | * Returns how many values were actually decoded. |
193 | */ |
194 | size_t (const uint64_t * __restrict__ bitset1, |
195 | const uint64_t * __restrict__ bitset2, |
196 | size_t length, uint16_t *out, |
197 | uint16_t base); |
198 | |
199 | /* |
200 | * Given a bitset having cardinality card, set all bit values in the list (there |
201 | * are length of them) |
202 | * and return the updated cardinality. This evidently assumes that the bitset |
203 | * already contained data. |
204 | */ |
205 | uint64_t bitset_set_list_withcard(void *bitset, uint64_t card, |
206 | const uint16_t *list, uint64_t length); |
207 | /* |
208 | * Given a bitset, set all bit values in the list (there |
209 | * are length of them). |
210 | */ |
211 | void bitset_set_list(void *bitset, const uint16_t *list, uint64_t length); |
212 | |
213 | /* |
214 | * Given a bitset having cardinality card, unset all bit values in the list |
215 | * (there are length of them) |
216 | * and return the updated cardinality. This evidently assumes that the bitset |
217 | * already contained data. |
218 | */ |
219 | uint64_t bitset_clear_list(void *bitset, uint64_t card, const uint16_t *list, |
220 | uint64_t length); |
221 | |
222 | /* |
223 | * Given a bitset having cardinality card, toggle all bit values in the list |
224 | * (there are length of them) |
225 | * and return the updated cardinality. This evidently assumes that the bitset |
226 | * already contained data. |
227 | */ |
228 | |
229 | uint64_t bitset_flip_list_withcard(void *bitset, uint64_t card, |
230 | const uint16_t *list, uint64_t length); |
231 | |
232 | void bitset_flip_list(void *bitset, const uint16_t *list, uint64_t length); |
233 | |
234 | #ifdef USEAVX |
235 | /*** |
236 | * BEGIN Harley-Seal popcount functions. |
237 | */ |
238 | |
239 | /** |
240 | * Compute the population count of a 256-bit word |
241 | * This is not especially fast, but it is convenient as part of other functions. |
242 | */ |
243 | static inline __m256i popcount256(__m256i v) { |
244 | const __m256i lookuppos = _mm256_setr_epi8( |
245 | /* 0 */ 4 + 0, /* 1 */ 4 + 1, /* 2 */ 4 + 1, /* 3 */ 4 + 2, |
246 | /* 4 */ 4 + 1, /* 5 */ 4 + 2, /* 6 */ 4 + 2, /* 7 */ 4 + 3, |
247 | /* 8 */ 4 + 1, /* 9 */ 4 + 2, /* a */ 4 + 2, /* b */ 4 + 3, |
248 | /* c */ 4 + 2, /* d */ 4 + 3, /* e */ 4 + 3, /* f */ 4 + 4, |
249 | |
250 | /* 0 */ 4 + 0, /* 1 */ 4 + 1, /* 2 */ 4 + 1, /* 3 */ 4 + 2, |
251 | /* 4 */ 4 + 1, /* 5 */ 4 + 2, /* 6 */ 4 + 2, /* 7 */ 4 + 3, |
252 | /* 8 */ 4 + 1, /* 9 */ 4 + 2, /* a */ 4 + 2, /* b */ 4 + 3, |
253 | /* c */ 4 + 2, /* d */ 4 + 3, /* e */ 4 + 3, /* f */ 4 + 4); |
254 | const __m256i lookupneg = _mm256_setr_epi8( |
255 | /* 0 */ 4 - 0, /* 1 */ 4 - 1, /* 2 */ 4 - 1, /* 3 */ 4 - 2, |
256 | /* 4 */ 4 - 1, /* 5 */ 4 - 2, /* 6 */ 4 - 2, /* 7 */ 4 - 3, |
257 | /* 8 */ 4 - 1, /* 9 */ 4 - 2, /* a */ 4 - 2, /* b */ 4 - 3, |
258 | /* c */ 4 - 2, /* d */ 4 - 3, /* e */ 4 - 3, /* f */ 4 - 4, |
259 | |
260 | /* 0 */ 4 - 0, /* 1 */ 4 - 1, /* 2 */ 4 - 1, /* 3 */ 4 - 2, |
261 | /* 4 */ 4 - 1, /* 5 */ 4 - 2, /* 6 */ 4 - 2, /* 7 */ 4 - 3, |
262 | /* 8 */ 4 - 1, /* 9 */ 4 - 2, /* a */ 4 - 2, /* b */ 4 - 3, |
263 | /* c */ 4 - 2, /* d */ 4 - 3, /* e */ 4 - 3, /* f */ 4 - 4); |
264 | const __m256i low_mask = _mm256_set1_epi8(0x0f); |
265 | |
266 | const __m256i lo = _mm256_and_si256(v, low_mask); |
267 | const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); |
268 | const __m256i popcnt1 = _mm256_shuffle_epi8(lookuppos, lo); |
269 | const __m256i popcnt2 = _mm256_shuffle_epi8(lookupneg, hi); |
270 | return _mm256_sad_epu8(popcnt1, popcnt2); |
271 | } |
272 | |
273 | /** |
274 | * Simple CSA over 256 bits |
275 | */ |
276 | static inline void CSA(__m256i *h, __m256i *l, __m256i a, __m256i b, |
277 | __m256i c) { |
278 | const __m256i u = _mm256_xor_si256(a, b); |
279 | *h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c)); |
280 | *l = _mm256_xor_si256(u, c); |
281 | } |
282 | |
283 | /** |
284 | * Fast Harley-Seal AVX population count function |
285 | */ |
286 | inline static uint64_t avx2_harley_seal_popcount256(const __m256i *data, |
287 | const uint64_t size) { |
288 | __m256i total = _mm256_setzero_si256(); |
289 | __m256i ones = _mm256_setzero_si256(); |
290 | __m256i twos = _mm256_setzero_si256(); |
291 | __m256i fours = _mm256_setzero_si256(); |
292 | __m256i eights = _mm256_setzero_si256(); |
293 | __m256i sixteens = _mm256_setzero_si256(); |
294 | __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; |
295 | |
296 | const uint64_t limit = size - size % 16; |
297 | uint64_t i = 0; |
298 | |
299 | for (; i < limit; i += 16) { |
300 | CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i), |
301 | _mm256_lddqu_si256(data + i + 1)); |
302 | CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 2), |
303 | _mm256_lddqu_si256(data + i + 3)); |
304 | CSA(&foursA, &twos, twos, twosA, twosB); |
305 | CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 4), |
306 | _mm256_lddqu_si256(data + i + 5)); |
307 | CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 6), |
308 | _mm256_lddqu_si256(data + i + 7)); |
309 | CSA(&foursB, &twos, twos, twosA, twosB); |
310 | CSA(&eightsA, &fours, fours, foursA, foursB); |
311 | CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 8), |
312 | _mm256_lddqu_si256(data + i + 9)); |
313 | CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 10), |
314 | _mm256_lddqu_si256(data + i + 11)); |
315 | CSA(&foursA, &twos, twos, twosA, twosB); |
316 | CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 12), |
317 | _mm256_lddqu_si256(data + i + 13)); |
318 | CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 14), |
319 | _mm256_lddqu_si256(data + i + 15)); |
320 | CSA(&foursB, &twos, twos, twosA, twosB); |
321 | CSA(&eightsB, &fours, fours, foursA, foursB); |
322 | CSA(&sixteens, &eights, eights, eightsA, eightsB); |
323 | |
324 | total = _mm256_add_epi64(total, popcount256(sixteens)); |
325 | } |
326 | |
327 | total = _mm256_slli_epi64(total, 4); // * 16 |
328 | total = _mm256_add_epi64( |
329 | total, _mm256_slli_epi64(popcount256(eights), 3)); // += 8 * ... |
330 | total = _mm256_add_epi64( |
331 | total, _mm256_slli_epi64(popcount256(fours), 2)); // += 4 * ... |
332 | total = _mm256_add_epi64( |
333 | total, _mm256_slli_epi64(popcount256(twos), 1)); // += 2 * ... |
334 | total = _mm256_add_epi64(total, popcount256(ones)); |
335 | for (; i < size; i++) |
336 | total = |
337 | _mm256_add_epi64(total, popcount256(_mm256_lddqu_si256(data + i))); |
338 | |
339 | return (uint64_t)(_mm256_extract_epi64(total, 0)) + |
340 | (uint64_t)(_mm256_extract_epi64(total, 1)) + |
341 | (uint64_t)(_mm256_extract_epi64(total, 2)) + |
342 | (uint64_t)(_mm256_extract_epi64(total, 3)); |
343 | } |
344 | |
/*
 * AVXPOPCNTFNC(opname, avx_intrinsic) generates two Harley-Seal popcount
 * variants that combine two inputs with avx_intrinsic before counting:
 *  - avx2_harley_seal_popcount256_<opname>(data1, data2, size):
 *      returns popcount(data1 op data2) over "size" 256-bit words;
 *  - avx2_harley_seal_popcount256andstore_<opname>(data1, data2, out, size):
 *      same, but also stores each combined vector to "out".
 * Both use the same 16-wide CSA network as avx2_harley_seal_popcount256.
 */
#define AVXPOPCNTFNC(opname, avx_intrinsic)                               \
    static inline uint64_t avx2_harley_seal_popcount256_##opname(         \
        const __m256i *data1, const __m256i *data2, const uint64_t size) { \
        __m256i total = _mm256_setzero_si256();                           \
        __m256i ones = _mm256_setzero_si256();                            \
        __m256i twos = _mm256_setzero_si256();                            \
        __m256i fours = _mm256_setzero_si256();                           \
        __m256i eights = _mm256_setzero_si256();                          \
        __m256i sixteens = _mm256_setzero_si256();                        \
        __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;           \
        __m256i A1, A2;                                                   \
        const uint64_t limit = size - size % 16;                          \
        uint64_t i = 0;                                                   \
        for (; i < limit; i += 16) {                                      \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i),             \
                               _mm256_lddqu_si256(data2 + i));            \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 1),         \
                               _mm256_lddqu_si256(data2 + i + 1));        \
            CSA(&twosA, &ones, ones, A1, A2);                             \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 2),         \
                               _mm256_lddqu_si256(data2 + i + 2));        \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 3),         \
                               _mm256_lddqu_si256(data2 + i + 3));        \
            CSA(&twosB, &ones, ones, A1, A2);                             \
            CSA(&foursA, &twos, twos, twosA, twosB);                      \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 4),         \
                               _mm256_lddqu_si256(data2 + i + 4));        \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 5),         \
                               _mm256_lddqu_si256(data2 + i + 5));        \
            CSA(&twosA, &ones, ones, A1, A2);                             \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 6),         \
                               _mm256_lddqu_si256(data2 + i + 6));        \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 7),         \
                               _mm256_lddqu_si256(data2 + i + 7));        \
            CSA(&twosB, &ones, ones, A1, A2);                             \
            CSA(&foursB, &twos, twos, twosA, twosB);                      \
            CSA(&eightsA, &fours, fours, foursA, foursB);                 \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 8),         \
                               _mm256_lddqu_si256(data2 + i + 8));        \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 9),         \
                               _mm256_lddqu_si256(data2 + i + 9));        \
            CSA(&twosA, &ones, ones, A1, A2);                             \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 10),        \
                               _mm256_lddqu_si256(data2 + i + 10));       \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 11),        \
                               _mm256_lddqu_si256(data2 + i + 11));       \
            CSA(&twosB, &ones, ones, A1, A2);                             \
            CSA(&foursA, &twos, twos, twosA, twosB);                      \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 12),        \
                               _mm256_lddqu_si256(data2 + i + 12));       \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 13),        \
                               _mm256_lddqu_si256(data2 + i + 13));       \
            CSA(&twosA, &ones, ones, A1, A2);                             \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 14),        \
                               _mm256_lddqu_si256(data2 + i + 14));       \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 15),        \
                               _mm256_lddqu_si256(data2 + i + 15));       \
            CSA(&twosB, &ones, ones, A1, A2);                             \
            CSA(&foursB, &twos, twos, twosA, twosB);                      \
            CSA(&eightsB, &fours, fours, foursA, foursB);                 \
            CSA(&sixteens, &eights, eights, eightsA, eightsB);            \
            total = _mm256_add_epi64(total, popcount256(sixteens));       \
        }                                                                 \
        total = _mm256_slli_epi64(total, 4);                              \
        total = _mm256_add_epi64(total,                                   \
                                 _mm256_slli_epi64(popcount256(eights), 3)); \
        total = \
            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(fours), 2)); \
        total = \
            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(twos), 1)); \
        total = _mm256_add_epi64(total, popcount256(ones));               \
        for (; i < size; i++) {                                           \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i),             \
                               _mm256_lddqu_si256(data2 + i));            \
            total = _mm256_add_epi64(total, popcount256(A1));             \
        }                                                                 \
        return (uint64_t)(_mm256_extract_epi64(total, 0)) +               \
               (uint64_t)(_mm256_extract_epi64(total, 1)) +               \
               (uint64_t)(_mm256_extract_epi64(total, 2)) +               \
               (uint64_t)(_mm256_extract_epi64(total, 3));                \
    }                                                                     \
    static inline uint64_t avx2_harley_seal_popcount256andstore_##opname( \
        const __m256i *__restrict__ data1, const __m256i *__restrict__ data2, \
        __m256i *__restrict__ out, const uint64_t size) {                 \
        __m256i total = _mm256_setzero_si256();                           \
        __m256i ones = _mm256_setzero_si256();                            \
        __m256i twos = _mm256_setzero_si256();                            \
        __m256i fours = _mm256_setzero_si256();                           \
        __m256i eights = _mm256_setzero_si256();                          \
        __m256i sixteens = _mm256_setzero_si256();                        \
        __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;           \
        __m256i A1, A2;                                                   \
        const uint64_t limit = size - size % 16;                          \
        uint64_t i = 0;                                                   \
        for (; i < limit; i += 16) {                                      \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i),             \
                               _mm256_lddqu_si256(data2 + i));            \
            _mm256_storeu_si256(out + i, A1);                             \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 1),         \
                               _mm256_lddqu_si256(data2 + i + 1));        \
            _mm256_storeu_si256(out + i + 1, A2);                         \
            CSA(&twosA, &ones, ones, A1, A2);                             \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 2),         \
                               _mm256_lddqu_si256(data2 + i + 2));        \
            _mm256_storeu_si256(out + i + 2, A1);                         \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 3),         \
                               _mm256_lddqu_si256(data2 + i + 3));        \
            _mm256_storeu_si256(out + i + 3, A2);                         \
            CSA(&twosB, &ones, ones, A1, A2);                             \
            CSA(&foursA, &twos, twos, twosA, twosB);                      \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 4),         \
                               _mm256_lddqu_si256(data2 + i + 4));        \
            _mm256_storeu_si256(out + i + 4, A1);                         \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 5),         \
                               _mm256_lddqu_si256(data2 + i + 5));        \
            _mm256_storeu_si256(out + i + 5, A2);                         \
            CSA(&twosA, &ones, ones, A1, A2);                             \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 6),         \
                               _mm256_lddqu_si256(data2 + i + 6));        \
            _mm256_storeu_si256(out + i + 6, A1);                         \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 7),         \
                               _mm256_lddqu_si256(data2 + i + 7));        \
            _mm256_storeu_si256(out + i + 7, A2);                         \
            CSA(&twosB, &ones, ones, A1, A2);                             \
            CSA(&foursB, &twos, twos, twosA, twosB);                      \
            CSA(&eightsA, &fours, fours, foursA, foursB);                 \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 8),         \
                               _mm256_lddqu_si256(data2 + i + 8));        \
            _mm256_storeu_si256(out + i + 8, A1);                         \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 9),         \
                               _mm256_lddqu_si256(data2 + i + 9));        \
            _mm256_storeu_si256(out + i + 9, A2);                         \
            CSA(&twosA, &ones, ones, A1, A2);                             \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 10),        \
                               _mm256_lddqu_si256(data2 + i + 10));       \
            _mm256_storeu_si256(out + i + 10, A1);                        \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 11),        \
                               _mm256_lddqu_si256(data2 + i + 11));       \
            _mm256_storeu_si256(out + i + 11, A2);                        \
            CSA(&twosB, &ones, ones, A1, A2);                             \
            CSA(&foursA, &twos, twos, twosA, twosB);                      \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 12),        \
                               _mm256_lddqu_si256(data2 + i + 12));       \
            _mm256_storeu_si256(out + i + 12, A1);                        \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 13),        \
                               _mm256_lddqu_si256(data2 + i + 13));       \
            _mm256_storeu_si256(out + i + 13, A2);                        \
            CSA(&twosA, &ones, ones, A1, A2);                             \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 14),        \
                               _mm256_lddqu_si256(data2 + i + 14));       \
            _mm256_storeu_si256(out + i + 14, A1);                        \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 15),        \
                               _mm256_lddqu_si256(data2 + i + 15));       \
            _mm256_storeu_si256(out + i + 15, A2);                        \
            CSA(&twosB, &ones, ones, A1, A2);                             \
            CSA(&foursB, &twos, twos, twosA, twosB);                      \
            CSA(&eightsB, &fours, fours, foursA, foursB);                 \
            CSA(&sixteens, &eights, eights, eightsA, eightsB);            \
            total = _mm256_add_epi64(total, popcount256(sixteens));       \
        }                                                                 \
        total = _mm256_slli_epi64(total, 4);                              \
        total = _mm256_add_epi64(total,                                   \
                                 _mm256_slli_epi64(popcount256(eights), 3)); \
        total = \
            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(fours), 2)); \
        total = \
            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(twos), 1)); \
        total = _mm256_add_epi64(total, popcount256(ones));               \
        for (; i < size; i++) {                                           \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i),             \
                               _mm256_lddqu_si256(data2 + i));            \
            _mm256_storeu_si256(out + i, A1);                             \
            total = _mm256_add_epi64(total, popcount256(A1));             \
        }                                                                 \
        return (uint64_t)(_mm256_extract_epi64(total, 0)) +               \
               (uint64_t)(_mm256_extract_epi64(total, 1)) +               \
               (uint64_t)(_mm256_extract_epi64(total, 2)) +               \
               (uint64_t)(_mm256_extract_epi64(total, 3));                \
    }
524 | |
/* Instantiate the fused popcount variants for each set operation; "or"/
 * "union" and "and"/"intersection" are aliases using the same intrinsic. */
AVXPOPCNTFNC(or, _mm256_or_si256)
AVXPOPCNTFNC(union, _mm256_or_si256)
AVXPOPCNTFNC(and, _mm256_and_si256)
AVXPOPCNTFNC(intersection, _mm256_and_si256)
AVXPOPCNTFNC (xor, _mm256_xor_si256)
AVXPOPCNTFNC(andnot, _mm256_andnot_si256)
531 | |
532 | /*** |
533 | * END Harley-Seal popcount functions. |
534 | */ |
535 | |
536 | #endif // USEAVX |
537 | |
538 | #endif |
539 | |