1/*
2 * bitset.c
3 *
4 */
5#ifndef _POSIX_C_SOURCE
6#define _POSIX_C_SOURCE 200809L
7#endif
8#include <assert.h>
9#include <stdio.h>
10#include <stdlib.h>
11#include <string.h>
12
13#include <roaring/bitset_util.h>
14#include <roaring/containers/bitset.h>
15#include <roaring/portability.h>
16#include <roaring/utilasm.h>
17
18extern inline int bitset_container_cardinality(const bitset_container_t *bitset);
19extern inline bool bitset_container_nonzero_cardinality(bitset_container_t *bitset);
20extern inline void bitset_container_set(bitset_container_t *bitset, uint16_t pos);
21extern inline void bitset_container_unset(bitset_container_t *bitset, uint16_t pos);
22extern inline bool bitset_container_get(const bitset_container_t *bitset,
23 uint16_t pos);
24extern inline int32_t bitset_container_serialized_size_in_bytes(void);
25extern inline bool bitset_container_add(bitset_container_t *bitset, uint16_t pos);
26extern inline bool bitset_container_remove(bitset_container_t *bitset, uint16_t pos);
27extern inline bool bitset_container_contains(const bitset_container_t *bitset,
28 uint16_t pos);
29
30void bitset_container_clear(bitset_container_t *bitset) {
31 memset(bitset->array, 0, sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
32 bitset->cardinality = 0;
33}
34
35void bitset_container_set_all(bitset_container_t *bitset) {
36 memset(bitset->array, INT64_C(-1),
37 sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
38 bitset->cardinality = (1 << 16);
39}
40
41
42
43/* Create a new bitset. Return NULL in case of failure. */
44bitset_container_t *bitset_container_create(void) {
45 bitset_container_t *bitset =
46 (bitset_container_t *)malloc(sizeof(bitset_container_t));
47
48 if (!bitset) {
49 return NULL;
50 }
51 // sizeof(__m256i) == 32
52 bitset->array = (uint64_t *)roaring_bitmap_aligned_malloc(
53 32, sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
54 if (!bitset->array) {
55 free(bitset);
56 return NULL;
57 }
58 bitset_container_clear(bitset);
59 return bitset;
60}
61
62/* Copy one container into another. We assume that they are distinct. */
63void bitset_container_copy(const bitset_container_t *source,
64 bitset_container_t *dest) {
65 dest->cardinality = source->cardinality;
66 memcpy(dest->array, source->array,
67 sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
68}
69
70void bitset_container_add_from_range(bitset_container_t *bitset, uint32_t min,
71 uint32_t max, uint16_t step) {
72 if (step == 0) return; // refuse to crash
73 if ((64 % step) == 0) { // step divides 64
74 uint64_t mask = 0; // construct the repeated mask
75 for (uint32_t value = (min % step); value < 64; value += step) {
76 mask |= ((uint64_t)1 << value);
77 }
78 uint32_t firstword = min / 64;
79 uint32_t endword = (max - 1) / 64;
80 bitset->cardinality = (max - min + step - 1) / step;
81 if (firstword == endword) {
82 bitset->array[firstword] |=
83 mask & (((~UINT64_C(0)) << (min % 64)) &
84 ((~UINT64_C(0)) >> ((~max + 1) % 64)));
85 return;
86 }
87 bitset->array[firstword] = mask & ((~UINT64_C(0)) << (min % 64));
88 for (uint32_t i = firstword + 1; i < endword; i++)
89 bitset->array[i] = mask;
90 bitset->array[endword] = mask & ((~UINT64_C(0)) >> ((~max + 1) % 64));
91 } else {
92 for (uint32_t value = min; value < max; value += step) {
93 bitset_container_add(bitset, value);
94 }
95 }
96}
97
98/* Free memory. */
99void bitset_container_free(bitset_container_t *bitset) {
100 if(bitset->array != NULL) {// Jon Strabala reports that some tools complain otherwise
101 roaring_bitmap_aligned_free(bitset->array);
102 bitset->array = NULL; // pedantic
103 }
104 free(bitset);
105}
106
107/* duplicate container. */
108bitset_container_t *bitset_container_clone(const bitset_container_t *src) {
109 bitset_container_t *bitset =
110 (bitset_container_t *)malloc(sizeof(bitset_container_t));
111
112 if (!bitset) {
113 return NULL;
114 }
115 // sizeof(__m256i) == 32
116 bitset->array = (uint64_t *)roaring_bitmap_aligned_malloc(
117 32, sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
118 if (!bitset->array) {
119 free(bitset);
120 return NULL;
121 }
122 bitset->cardinality = src->cardinality;
123 memcpy(bitset->array, src->array,
124 sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
125 return bitset;
126}
127
128void bitset_container_set_range(bitset_container_t *bitset, uint32_t begin,
129 uint32_t end) {
130 bitset_set_range(bitset->array, begin, end);
131 bitset->cardinality =
132 bitset_container_compute_cardinality(bitset); // could be smarter
133}
134
135
136bool bitset_container_intersect(const bitset_container_t *src_1,
137 const bitset_container_t *src_2) {
138 // could vectorize, but this is probably already quite fast in practice
139 const uint64_t * __restrict__ array_1 = src_1->array;
140 const uint64_t * __restrict__ array_2 = src_2->array;
141 for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i ++) {
142 if((array_1[i] & array_2[i]) != 0) return true;
143 }
144 return false;
145}
146
147
148#ifdef USEAVX
149#ifndef WORDS_IN_AVX2_REG
150#define WORDS_IN_AVX2_REG sizeof(__m256i) / sizeof(uint64_t)
151#endif
152/* Get the number of bits set (force computation) */
153int bitset_container_compute_cardinality(const bitset_container_t *bitset) {
154 return (int) avx2_harley_seal_popcount256(
155 (const __m256i *)bitset->array,
156 BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG));
157}
158
159#elif defined(USENEON)
160int bitset_container_compute_cardinality(const bitset_container_t *bitset) {
161 uint16x8_t n0 = vdupq_n_u16(0);
162 uint16x8_t n1 = vdupq_n_u16(0);
163 uint16x8_t n2 = vdupq_n_u16(0);
164 uint16x8_t n3 = vdupq_n_u16(0);
165 for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 8) {
166 uint64x2_t c0 = vld1q_u64(&bitset->array[i + 0]);
167 n0 = vaddq_u16(n0, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c0))));
168 uint64x2_t c1 = vld1q_u64(&bitset->array[i + 2]);
169 n1 = vaddq_u16(n1, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c1))));
170 uint64x2_t c2 = vld1q_u64(&bitset->array[i + 4]);
171 n2 = vaddq_u16(n2, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c2))));
172 uint64x2_t c3 = vld1q_u64(&bitset->array[i + 6]);
173 n3 = vaddq_u16(n3, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c3))));
174 }
175 uint64x2_t n = vdupq_n_u64(0);
176 n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n0)));
177 n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n1)));
178 n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n2)));
179 n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n3)));
180 return vgetq_lane_u64(n, 0) + vgetq_lane_u64(n, 1);
181}
182
183#else
184
185/* Get the number of bits set (force computation) */
186int bitset_container_compute_cardinality(const bitset_container_t *bitset) {
187 const uint64_t *array = bitset->array;
188 int32_t sum = 0;
189 for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 4) {
190 sum += hamming(array[i]);
191 sum += hamming(array[i + 1]);
192 sum += hamming(array[i + 2]);
193 sum += hamming(array[i + 3]);
194 }
195 return sum;
196}
197
198#endif
199
200#ifdef USEAVX
201
202#define BITSET_CONTAINER_FN_REPEAT 8
203#ifndef WORDS_IN_AVX2_REG
204#define WORDS_IN_AVX2_REG sizeof(__m256i) / sizeof(uint64_t)
205#endif
206#define LOOP_SIZE \
207 BITSET_CONTAINER_SIZE_IN_WORDS / \
208 ((WORDS_IN_AVX2_REG)*BITSET_CONTAINER_FN_REPEAT)
209
210/* Computes a binary operation (eg union) on bitset1 and bitset2 and write the
211 result to bitsetout */
212// clang-format off
213#define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic) \
214int bitset_container_##opname##_nocard(const bitset_container_t *src_1, \
215 const bitset_container_t *src_2, \
216 bitset_container_t *dst) { \
217 const uint8_t * __restrict__ array_1 = (const uint8_t *)src_1->array; \
218 const uint8_t * __restrict__ array_2 = (const uint8_t *)src_2->array; \
219 /* not using the blocking optimization for some reason*/ \
220 uint8_t *out = (uint8_t*)dst->array; \
221 const int innerloop = 8; \
222 for (size_t i = 0; \
223 i < BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG); \
224 i+=innerloop) {\
225 __m256i A1, A2, AO; \
226 A1 = _mm256_lddqu_si256((const __m256i *)(array_1)); \
227 A2 = _mm256_lddqu_si256((const __m256i *)(array_2)); \
228 AO = avx_intrinsic(A2, A1); \
229 _mm256_storeu_si256((__m256i *)out, AO); \
230 A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 32)); \
231 A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 32)); \
232 AO = avx_intrinsic(A2, A1); \
233 _mm256_storeu_si256((__m256i *)(out+32), AO); \
234 A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 64)); \
235 A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 64)); \
236 AO = avx_intrinsic(A2, A1); \
237 _mm256_storeu_si256((__m256i *)(out+64), AO); \
238 A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 96)); \
239 A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 96)); \
240 AO = avx_intrinsic(A2, A1); \
241 _mm256_storeu_si256((__m256i *)(out+96), AO); \
242 A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 128)); \
243 A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 128)); \
244 AO = avx_intrinsic(A2, A1); \
245 _mm256_storeu_si256((__m256i *)(out+128), AO); \
246 A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 160)); \
247 A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 160)); \
248 AO = avx_intrinsic(A2, A1); \
249 _mm256_storeu_si256((__m256i *)(out+160), AO); \
250 A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 192)); \
251 A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 192)); \
252 AO = avx_intrinsic(A2, A1); \
253 _mm256_storeu_si256((__m256i *)(out+192), AO); \
254 A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 224)); \
255 A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 224)); \
256 AO = avx_intrinsic(A2, A1); \
257 _mm256_storeu_si256((__m256i *)(out+224), AO); \
258 out+=256; \
259 array_1 += 256; \
260 array_2 += 256; \
261 } \
262 dst->cardinality = BITSET_UNKNOWN_CARDINALITY; \
263 return dst->cardinality; \
264} \
265/* next, a version that updates cardinality*/ \
266int bitset_container_##opname(const bitset_container_t *src_1, \
267 const bitset_container_t *src_2, \
268 bitset_container_t *dst) { \
269 const __m256i * __restrict__ array_1 = (const __m256i *) src_1->array; \
270 const __m256i * __restrict__ array_2 = (const __m256i *) src_2->array; \
271 __m256i *out = (__m256i *) dst->array; \
272 dst->cardinality = (int32_t)avx2_harley_seal_popcount256andstore_##opname(array_2,\
273 array_1, out,BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG));\
274 return dst->cardinality; \
275} \
276/* next, a version that just computes the cardinality*/ \
277int bitset_container_##opname##_justcard(const bitset_container_t *src_1, \
278 const bitset_container_t *src_2) { \
279 const __m256i * __restrict__ data1 = (const __m256i *) src_1->array; \
280 const __m256i * __restrict__ data2 = (const __m256i *) src_2->array; \
281 return (int)avx2_harley_seal_popcount256_##opname(data2, \
282 data1, BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG));\
283}
284
285#elif defined(USENEON)
286
287#define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic) \
288int bitset_container_##opname(const bitset_container_t *src_1, \
289 const bitset_container_t *src_2, \
290 bitset_container_t *dst) { \
291 const uint64_t * __restrict__ array_1 = src_1->array; \
292 const uint64_t * __restrict__ array_2 = src_2->array; \
293 uint64_t *out = dst->array; \
294 uint16x8_t n0 = vdupq_n_u16(0); \
295 uint16x8_t n1 = vdupq_n_u16(0); \
296 uint16x8_t n2 = vdupq_n_u16(0); \
297 uint16x8_t n3 = vdupq_n_u16(0); \
298 for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 8) { \
299 uint64x2_t c0 = neon_intrinsic(vld1q_u64(&array_1[i + 0]), \
300 vld1q_u64(&array_2[i + 0])); \
301 n0 = vaddq_u16(n0, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c0)))); \
302 vst1q_u64(&out[i + 0], c0); \
303 uint64x2_t c1 = neon_intrinsic(vld1q_u64(&array_1[i + 2]), \
304 vld1q_u64(&array_2[i + 2])); \
305 n1 = vaddq_u16(n1, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c1)))); \
306 vst1q_u64(&out[i + 2], c1); \
307 uint64x2_t c2 = neon_intrinsic(vld1q_u64(&array_1[i + 4]), \
308 vld1q_u64(&array_2[i + 4])); \
309 n2 = vaddq_u16(n2, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c2)))); \
310 vst1q_u64(&out[i + 4], c2); \
311 uint64x2_t c3 = neon_intrinsic(vld1q_u64(&array_1[i + 6]), \
312 vld1q_u64(&array_2[i + 6])); \
313 n3 = vaddq_u16(n3, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c3)))); \
314 vst1q_u64(&out[i + 6], c3); \
315 } \
316 uint64x2_t n = vdupq_n_u64(0); \
317 n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n0))); \
318 n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n1))); \
319 n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n2))); \
320 n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n3))); \
321 dst->cardinality = vgetq_lane_u64(n, 0) + vgetq_lane_u64(n, 1); \
322 return dst->cardinality; \
323} \
324int bitset_container_##opname##_nocard(const bitset_container_t *src_1, \
325 const bitset_container_t *src_2, \
326 bitset_container_t *dst) { \
327 const uint64_t * __restrict__ array_1 = src_1->array; \
328 const uint64_t * __restrict__ array_2 = src_2->array; \
329 uint64_t *out = dst->array; \
330 for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 8) { \
331 vst1q_u64(&out[i + 0], neon_intrinsic(vld1q_u64(&array_1[i + 0]), \
332 vld1q_u64(&array_2[i + 0]))); \
333 vst1q_u64(&out[i + 2], neon_intrinsic(vld1q_u64(&array_1[i + 2]), \
334 vld1q_u64(&array_2[i + 2]))); \
335 vst1q_u64(&out[i + 4], neon_intrinsic(vld1q_u64(&array_1[i + 4]), \
336 vld1q_u64(&array_2[i + 4]))); \
337 vst1q_u64(&out[i + 6], neon_intrinsic(vld1q_u64(&array_1[i + 6]), \
338 vld1q_u64(&array_2[i + 6]))); \
339 } \
340 dst->cardinality = BITSET_UNKNOWN_CARDINALITY; \
341 return dst->cardinality; \
342} \
343int bitset_container_##opname##_justcard(const bitset_container_t *src_1, \
344 const bitset_container_t *src_2) { \
345 const uint64_t * __restrict__ array_1 = src_1->array; \
346 const uint64_t * __restrict__ array_2 = src_2->array; \
347 uint16x8_t n0 = vdupq_n_u16(0); \
348 uint16x8_t n1 = vdupq_n_u16(0); \
349 uint16x8_t n2 = vdupq_n_u16(0); \
350 uint16x8_t n3 = vdupq_n_u16(0); \
351 for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 8) { \
352 uint64x2_t c0 = neon_intrinsic(vld1q_u64(&array_1[i + 0]), \
353 vld1q_u64(&array_2[i + 0])); \
354 n0 = vaddq_u16(n0, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c0)))); \
355 uint64x2_t c1 = neon_intrinsic(vld1q_u64(&array_1[i + 2]), \
356 vld1q_u64(&array_2[i + 2])); \
357 n1 = vaddq_u16(n1, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c1)))); \
358 uint64x2_t c2 = neon_intrinsic(vld1q_u64(&array_1[i + 4]), \
359 vld1q_u64(&array_2[i + 4])); \
360 n2 = vaddq_u16(n2, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c2)))); \
361 uint64x2_t c3 = neon_intrinsic(vld1q_u64(&array_1[i + 6]), \
362 vld1q_u64(&array_2[i + 6])); \
363 n3 = vaddq_u16(n3, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c3)))); \
364 } \
365 uint64x2_t n = vdupq_n_u64(0); \
366 n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n0))); \
367 n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n1))); \
368 n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n2))); \
369 n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n3))); \
370 return vgetq_lane_u64(n, 0) + vgetq_lane_u64(n, 1); \
371}
372
373#else /* not USEAVX */
374
375#define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic) \
376int bitset_container_##opname(const bitset_container_t *src_1, \
377 const bitset_container_t *src_2, \
378 bitset_container_t *dst) { \
379 const uint64_t * __restrict__ array_1 = src_1->array; \
380 const uint64_t * __restrict__ array_2 = src_2->array; \
381 uint64_t *out = dst->array; \
382 int32_t sum = 0; \
383 for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 2) { \
384 const uint64_t word_1 = (array_1[i])opsymbol(array_2[i]), \
385 word_2 = (array_1[i + 1])opsymbol(array_2[i + 1]); \
386 out[i] = word_1; \
387 out[i + 1] = word_2; \
388 sum += hamming(word_1); \
389 sum += hamming(word_2); \
390 } \
391 dst->cardinality = sum; \
392 return dst->cardinality; \
393} \
394int bitset_container_##opname##_nocard(const bitset_container_t *src_1, \
395 const bitset_container_t *src_2, \
396 bitset_container_t *dst) { \
397 const uint64_t * __restrict__ array_1 = src_1->array; \
398 const uint64_t * __restrict__ array_2 = src_2->array; \
399 uint64_t *out = dst->array; \
400 for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i++) { \
401 out[i] = (array_1[i])opsymbol(array_2[i]); \
402 } \
403 dst->cardinality = BITSET_UNKNOWN_CARDINALITY; \
404 return dst->cardinality; \
405} \
406int bitset_container_##opname##_justcard(const bitset_container_t *src_1, \
407 const bitset_container_t *src_2) { \
408 const uint64_t * __restrict__ array_1 = src_1->array; \
409 const uint64_t * __restrict__ array_2 = src_2->array; \
410 int32_t sum = 0; \
411 for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 2) { \
412 const uint64_t word_1 = (array_1[i])opsymbol(array_2[i]), \
413 word_2 = (array_1[i + 1])opsymbol(array_2[i + 1]); \
414 sum += hamming(word_1); \
415 sum += hamming(word_2); \
416 } \
417 return sum; \
418}
419
420#endif
421
422// we duplicate the function because other containers use the "or" term, makes API more consistent
423BITSET_CONTAINER_FN(or, |, _mm256_or_si256, vorrq_u64)
424BITSET_CONTAINER_FN(union, |, _mm256_or_si256, vorrq_u64)
425
426// we duplicate the function because other containers use the "intersection" term, makes API more consistent
427BITSET_CONTAINER_FN(and, &, _mm256_and_si256, vandq_u64)
428BITSET_CONTAINER_FN(intersection, &, _mm256_and_si256, vandq_u64)
429
430BITSET_CONTAINER_FN(xor, ^, _mm256_xor_si256, veorq_u64)
431BITSET_CONTAINER_FN(andnot, &~, _mm256_andnot_si256, vbicq_u64)
432// clang-format On
433
434
435
436int bitset_container_to_uint32_array( void *vout, const bitset_container_t *cont, uint32_t base) {
437#ifdef USEAVX2FORDECODING
438 if(cont->cardinality >= 8192)// heuristic
439 return (int) bitset_extract_setbits_avx2(cont->array, BITSET_CONTAINER_SIZE_IN_WORDS, vout,cont->cardinality,base);
440 else
441 return (int) bitset_extract_setbits(cont->array, BITSET_CONTAINER_SIZE_IN_WORDS, vout,base);
442#else
443 return (int) bitset_extract_setbits(cont->array, BITSET_CONTAINER_SIZE_IN_WORDS, vout,base);
444#endif
445}
446
447/*
448 * Print this container using printf (useful for debugging).
449 */
450void bitset_container_printf(const bitset_container_t * v) {
451 printf("{");
452 uint32_t base = 0;
453 bool iamfirst = true;// TODO: rework so that this is not necessary yet still readable
454 for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i) {
455 uint64_t w = v->array[i];
456 while (w != 0) {
457 uint64_t t = w & (~w + 1);
458 int r = __builtin_ctzll(w);
459 if(iamfirst) {// predicted to be false
460 printf("%u",base + r);
461 iamfirst = false;
462 } else {
463 printf(",%u",base + r);
464 }
465 w ^= t;
466 }
467 base += 64;
468 }
469 printf("}");
470}
471
472
473/*
474 * Print this container using printf as a comma-separated list of 32-bit integers starting at base.
475 */
476void bitset_container_printf_as_uint32_array(const bitset_container_t * v, uint32_t base) {
477 bool iamfirst = true;// TODO: rework so that this is not necessary yet still readable
478 for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i) {
479 uint64_t w = v->array[i];
480 while (w != 0) {
481 uint64_t t = w & (~w + 1);
482 int r = __builtin_ctzll(w);
483 if(iamfirst) {// predicted to be false
484 printf("%u", r + base);
485 iamfirst = false;
486 } else {
487 printf(",%u",r + base);
488 }
489 w ^= t;
490 }
491 base += 64;
492 }
493}
494
495
496// TODO: use the fast lower bound, also
497int bitset_container_number_of_runs(bitset_container_t *b) {
498 int num_runs = 0;
499 uint64_t next_word = b->array[0];
500
501 for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS-1; ++i) {
502 uint64_t word = next_word;
503 next_word = b->array[i+1];
504 num_runs += hamming((~word) & (word << 1)) + ( (word >> 63) & ~next_word);
505 }
506
507 uint64_t word = next_word;
508 num_runs += hamming((~word) & (word << 1));
509 if((word & 0x8000000000000000ULL) != 0)
510 num_runs++;
511 return num_runs;
512}
513
514int32_t bitset_container_serialize(const bitset_container_t *container, char *buf) {
515 int32_t l = sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS;
516 memcpy(buf, container->array, l);
517 return(l);
518}
519
520
521
522int32_t bitset_container_write(const bitset_container_t *container,
523 char *buf) {
524 memcpy(buf, container->array, BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t));
525 return bitset_container_size_in_bytes(container);
526}
527
528
529int32_t bitset_container_read(int32_t cardinality, bitset_container_t *container,
530 const char *buf) {
531 container->cardinality = cardinality;
532 memcpy(container->array, buf, BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t));
533 return bitset_container_size_in_bytes(container);
534}
535
536uint32_t bitset_container_serialization_len() {
537 return(sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
538}
539
540void* bitset_container_deserialize(const char *buf, size_t buf_len) {
541 bitset_container_t *ptr;
542 size_t l = sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS;
543
544 if(l != buf_len)
545 return(NULL);
546
547 if((ptr = (bitset_container_t *)malloc(sizeof(bitset_container_t))) != NULL) {
548 memcpy(ptr, buf, sizeof(bitset_container_t));
549 // sizeof(__m256i) == 32
550 ptr->array = (uint64_t *) roaring_bitmap_aligned_malloc(32, l);
551 if (! ptr->array) {
552 free(ptr);
553 return NULL;
554 }
555 memcpy(ptr->array, buf, l);
556 ptr->cardinality = bitset_container_compute_cardinality(ptr);
557 }
558
559 return((void*)ptr);
560}
561
562bool bitset_container_iterate(const bitset_container_t *cont, uint32_t base, roaring_iterator iterator, void *ptr) {
563 for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i ) {
564 uint64_t w = cont->array[i];
565 while (w != 0) {
566 uint64_t t = w & (~w + 1);
567 int r = __builtin_ctzll(w);
568 if(!iterator(r + base, ptr)) return false;
569 w ^= t;
570 }
571 base += 64;
572 }
573 return true;
574}
575
576bool bitset_container_iterate64(const bitset_container_t *cont, uint32_t base, roaring_iterator64 iterator, uint64_t high_bits, void *ptr) {
577 for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i ) {
578 uint64_t w = cont->array[i];
579 while (w != 0) {
580 uint64_t t = w & (~w + 1);
581 int r = __builtin_ctzll(w);
582 if(!iterator(high_bits | (uint64_t)(r + base), ptr)) return false;
583 w ^= t;
584 }
585 base += 64;
586 }
587 return true;
588}
589
590
591bool bitset_container_equals(const bitset_container_t *container1, const bitset_container_t *container2) {
592 if((container1->cardinality != BITSET_UNKNOWN_CARDINALITY) && (container2->cardinality != BITSET_UNKNOWN_CARDINALITY)) {
593 if(container1->cardinality != container2->cardinality) {
594 return false;
595 }
596 if (container1->cardinality == INT32_C(0x10000)) {
597 return true;
598 }
599 }
600#ifdef USEAVX
601 const __m256i *ptr1 = (const __m256i*)container1->array;
602 const __m256i *ptr2 = (const __m256i*)container2->array;
603 for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS*sizeof(uint64_t)/32; i++) {
604 __m256i r1 = _mm256_load_si256(ptr1+i);
605 __m256i r2 = _mm256_load_si256(ptr2+i);
606 int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(r1, r2));
607 if ((uint32_t)mask != UINT32_MAX) {
608 return false;
609 }
610 }
611#else
612 return memcmp(container1->array,
613 container2->array,
614 BITSET_CONTAINER_SIZE_IN_WORDS*sizeof(uint64_t)) == 0;
615#endif
616 return true;
617}
618
619bool bitset_container_is_subset(const bitset_container_t *container1,
620 const bitset_container_t *container2) {
621 if((container1->cardinality != BITSET_UNKNOWN_CARDINALITY) && (container2->cardinality != BITSET_UNKNOWN_CARDINALITY)) {
622 if(container1->cardinality > container2->cardinality) {
623 return false;
624 }
625 }
626 for(int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i ) {
627 if((container1->array[i] & container2->array[i]) != container1->array[i]) {
628 return false;
629 }
630 }
631 return true;
632}
633
634bool bitset_container_select(const bitset_container_t *container, uint32_t *start_rank, uint32_t rank, uint32_t *element) {
635 int card = bitset_container_cardinality(container);
636 if(rank >= *start_rank + card) {
637 *start_rank += card;
638 return false;
639 }
640 const uint64_t *array = container->array;
641 int32_t size;
642 for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 1) {
643 size = hamming(array[i]);
644 if(rank <= *start_rank + size) {
645 uint64_t w = container->array[i];
646 uint16_t base = i*64;
647 while (w != 0) {
648 uint64_t t = w & (~w + 1);
649 int r = __builtin_ctzll(w);
650 if(*start_rank == rank) {
651 *element = r+base;
652 return true;
653 }
654 w ^= t;
655 *start_rank += 1;
656 }
657 }
658 else
659 *start_rank += size;
660 }
661 assert(false);
662 __builtin_unreachable();
663}
664
665
666/* Returns the smallest value (assumes not empty) */
667uint16_t bitset_container_minimum(const bitset_container_t *container) {
668 for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i ) {
669 uint64_t w = container->array[i];
670 if (w != 0) {
671 int r = __builtin_ctzll(w);
672 return r + i * 64;
673 }
674 }
675 return UINT16_MAX;
676}
677
678/* Returns the largest value (assumes not empty) */
679uint16_t bitset_container_maximum(const bitset_container_t *container) {
680 for (int32_t i = BITSET_CONTAINER_SIZE_IN_WORDS - 1; i > 0; --i ) {
681 uint64_t w = container->array[i];
682 if (w != 0) {
683 int r = __builtin_clzll(w);
684 return i * 64 + 63 - r;
685 }
686 }
687 return 0;
688}
689
690/* Returns the number of values equal or smaller than x */
691int bitset_container_rank(const bitset_container_t *container, uint16_t x) {
692 // credit: aqrit
693 int sum = 0;
694 int i = 0;
695 for (int end = x / 64; i < end; i++){
696 sum += hamming(container->array[i]);
697 }
698 uint64_t lastword = container->array[i];
699 uint64_t lastpos = UINT64_C(1) << (x % 64);
700 uint64_t mask = lastpos + lastpos - 1; // smear right
701 sum += hamming(lastword & mask);
702 return sum;
703}
704
705/* Returns the index of the first value equal or larger than x, or -1 */
706int bitset_container_index_equalorlarger(const bitset_container_t *container, uint16_t x) {
707 uint32_t x32 = x;
708 uint32_t k = x32 / 64;
709 uint64_t word = container->array[k];
710 const int diff = x32 - k * 64; // in [0,64)
711 word = (word >> diff) << diff; // a mask is faster, but we don't care
712 while(word == 0) {
713 k++;
714 if(k == BITSET_CONTAINER_SIZE_IN_WORDS) return -1;
715 word = container->array[k];
716 }
717 return k * 64 + __builtin_ctzll(word);
718}
719