/*
 * Copyright (c) 2015-2018, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Multibit: fast bitset structure, main runtime.
 *
 * *Structure*
 *
 * For sizes <= MMB_FLAT_MAX_BITS, a flat bit vector is used, stored as N
 * 64-bit blocks followed by one "runt block".
 *
 * For larger sizes, we use a sequence of blocks forming a tree. Each bit in an
 * internal block indicates whether its child block contains valid data. Every
 * level except the last is complete. The last level is just a basic bit
 * vector. A worked example of this layout follows this comment.
 *
 * -----------------------------------------------------------------------------
 * WARNING:
 *
 * mmbit code assumes that it is legal to load the 8 bytes immediately
 * preceding the end of the mmbit. This means that for small mmbits (< 8
 * bytes), data may be read from before the base pointer. It is the user's
 * responsibility to ensure that this is possible.
 * -----------------------------------------------------------------------------
 */
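
/*
 * Worked example of the tree layout (illustrative only; the exact per-level
 * offsets come from mmbit_root_offset_from_level in multibit.c):
 *
 * A multibit of 10000 bits is too large for the flat model, so it is stored
 * as a three-level tree of 64-bit blocks:
 *
 *   level 0: one summary block; each bit covers a subtree of 4096 keys
 *            (only the first ceil(10000/4096) = 3 bits can ever be set)
 *   level 1: 64 blocks (the level is allocated complete); each bit covers
 *            64 keys, and only the first ceil(10000/64) = 157 bits are used
 *   level 2: the leaf bit vector, one bit per key
 *
 * A bit in levels 0 and 1 merely indicates that the child block below it
 * contains valid data.
 */
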
#ifndef MULTIBIT_H
#define MULTIBIT_H

#include "config.h"
#include "ue2common.h"
#include "bitutils.h"
#include "partial_store.h"
#include "unaligned.h"
#include "multibit_internal.h"

#include <string.h>

#ifdef __cplusplus
extern "C" {
#endif

#define MMB_ONE (1ULL)
#define MMB_ALL_ONES (0xffffffffffffffffULL)

/** \brief Number of bits in a block. */
#define MMB_KEY_BITS (sizeof(MMB_TYPE) * 8)

#define MMB_KEY_MASK (MMB_KEY_BITS - 1)

// Key structure defines
#define MMB_KEY_SHIFT 6

/** \brief Max size of a flat multibit. */
#define MMB_FLAT_MAX_BITS 256

// Utility functions and data
// see multibit.c for contents
extern const u8 mmbit_keyshift_lut[32];
extern const u8 mmbit_maxlevel_from_keyshift_lut[32];
extern const u8 mmbit_maxlevel_direct_lut[32];
extern const u32 mmbit_root_offset_from_level[7];
extern const u64a mmbit_zero_to_lut[65];

static really_inline
MMB_TYPE mmb_load(const u8 * bits) {
    return unaligned_load_u64a(bits);
}

static really_inline
void mmb_store(u8 *bits, MMB_TYPE val) {
    unaligned_store_u64a(bits, val);
}

static really_inline
void mmb_store_partial(u8 *bits, MMB_TYPE val, u32 block_bits) {
    assert(block_bits <= MMB_KEY_BITS);
    partial_store_u64a(bits, val, ROUNDUP_N(block_bits, 8U) / 8U);
}

static really_inline
MMB_TYPE mmb_single_bit(u32 bit) {
    assert(bit < MMB_KEY_BITS);
    return MMB_ONE << bit;
}

static really_inline
MMB_TYPE mmb_mask_zero_to(u32 bit) {
    assert(bit <= MMB_KEY_BITS);
#ifdef ARCH_32_BIT
    return mmbit_zero_to_lut[bit];
#else
    if (bit == MMB_KEY_BITS) {
        return MMB_ALL_ONES;
    } else {
        return mmb_single_bit(bit) - MMB_ONE;
    }
#endif
}

/** \brief Returns a mask of set bits up to position \a bit. Does not handle
 * the case where bit == MMB_KEY_BITS. */
static really_inline
MMB_TYPE mmb_mask_zero_to_nocheck(u32 bit) {
    assert(bit < MMB_KEY_BITS);
#ifdef ARCH_32_BIT
    return mmbit_zero_to_lut[bit];
#else
    return mmb_single_bit(bit) - MMB_ONE;
#endif
}
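
/*
 * A few illustrative values for the mask helpers above (assuming a 64-bit
 * MMB_TYPE):
 *
 *   mmb_single_bit(3)           == 0x0000000000000008ULL
 *   mmb_mask_zero_to(3)         == 0x0000000000000007ULL  // bits 0..2
 *   mmb_mask_zero_to(64)        == MMB_ALL_ONES
 *   mmb_mask_zero_to_nocheck(3) == 0x0000000000000007ULL  // caller must keep bit < 64
 */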

static really_inline
u32 mmb_test(MMB_TYPE val, u32 bit) {
    assert(bit < MMB_KEY_BITS);
    return (val >> bit) & MMB_ONE;
}

static really_inline
void mmb_set(MMB_TYPE * val, u32 bit) {
    assert(bit < MMB_KEY_BITS);
    *val |= mmb_single_bit(bit);
}

static really_inline
void mmb_clear(MMB_TYPE * val, u32 bit) {
    assert(bit < MMB_KEY_BITS);
    *val &= ~mmb_single_bit(bit);
}

static really_inline
u32 mmb_ctz(MMB_TYPE val) {
    return ctz64(val);
}

static really_inline
u32 mmb_popcount(MMB_TYPE val) {
    return popcount64(val);
}

#ifndef MMMB_DEBUG
#define MDEBUG_PRINTF(x, ...) do { } while(0)
#else
#define MDEBUG_PRINTF DEBUG_PRINTF
#endif

// Switch the following define on to trace writes to multibit.
//#define MMB_TRACE_WRITES
#ifdef MMB_TRACE_WRITES
#define MMB_TRACE(format, ...) \
    printf("mmb [%u bits @ %p] " format, total_bits, bits, ##__VA_ARGS__)
#else
#define MMB_TRACE(format, ...) \
    do { \
    } while (0)
#endif

static really_inline
u32 mmbit_keyshift(u32 total_bits) {
    assert(total_bits > 1);
    u32 n = clz32(total_bits - 1); // subtract one as we're rounding down
    return mmbit_keyshift_lut[n];
}

static really_inline
u32 mmbit_maxlevel(u32 total_bits) {
    assert(total_bits > 1);
    u32 n = clz32(total_bits - 1); // subtract one as we're rounding down
    u32 max_level = mmbit_maxlevel_direct_lut[n];
    assert(max_level <= MMB_MAX_LEVEL);
    return max_level;
}

static really_inline
u32 mmbit_maxlevel_from_keyshift(u32 ks) {
    assert(ks <= 30);
    assert(ks % MMB_KEY_SHIFT == 0);

    u32 max_level = mmbit_maxlevel_from_keyshift_lut[ks];
    assert(max_level <= MMB_MAX_LEVEL);
    return max_level;
}

/** \brief get our keyshift for the current level */
static really_inline
u32 mmbit_get_ks(u32 max_level, u32 level) {
    assert(max_level <= MMB_MAX_LEVEL);
    assert(level <= max_level);
    return (max_level - level) * MMB_KEY_SHIFT;
}

/** \brief get our key value for the current level */
static really_inline
u32 mmbit_get_key_val(u32 max_level, u32 level, u32 key) {
    return (key >> mmbit_get_ks(max_level, level)) & MMB_KEY_MASK;
}
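
/*
 * Illustrative decomposition, assuming a tree with max_level == 2 (i.e. a
 * multibit of more than 4096 and at most 262144 bits): key 150000 breaks into
 * the per-level values
 *
 *   mmbit_get_key_val(2, 0, 150000) == 36   // 150000 >> 12
 *   mmbit_get_key_val(2, 1, 150000) == 39   // (150000 >> 6) & 63
 *   mmbit_get_key_val(2, 2, 150000) == 48   // 150000 & 63
 *
 * since 36 * 4096 + 39 * 64 + 48 == 150000.
 */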

/** \brief get the level root for the current level */
static really_inline
u8 *mmbit_get_level_root(u8 *bits, u32 level) {
    assert(level < ARRAY_LENGTH(mmbit_root_offset_from_level));
    return bits + mmbit_root_offset_from_level[level] * sizeof(MMB_TYPE);
}

/** \brief get the level root for the current level as const */
static really_inline
const u8 *mmbit_get_level_root_const(const u8 *bits, u32 level) {
    assert(level < ARRAY_LENGTH(mmbit_root_offset_from_level));
    return bits + mmbit_root_offset_from_level[level] * sizeof(MMB_TYPE);
}

/** \brief get the block for this key on the current level as a u8 ptr */
static really_inline
u8 *mmbit_get_block_ptr(u8 *bits, u32 max_level, u32 level, u32 key) {
    u8 *level_root = mmbit_get_level_root(bits, level);
    u32 ks = mmbit_get_ks(max_level, level);
    return level_root + ((u64a)key >> (ks + MMB_KEY_SHIFT)) * sizeof(MMB_TYPE);
}

/** \brief get the block for this key on the current level as a const u8 ptr */
static really_inline
const u8 *mmbit_get_block_ptr_const(const u8 *bits, u32 max_level, u32 level,
                                    u32 key) {
    const u8 *level_root = mmbit_get_level_root_const(bits, level);
    u32 ks = mmbit_get_ks(max_level, level);
    return level_root + ((u64a)key >> (ks + MMB_KEY_SHIFT)) * sizeof(MMB_TYPE);
}

/** \brief get the _byte_ for this key on the current level as a u8 ptr */
static really_inline
u8 *mmbit_get_byte_ptr(u8 *bits, u32 max_level, u32 level, u32 key) {
    u8 *level_root = mmbit_get_level_root(bits, level);
    u32 ks = mmbit_get_ks(max_level, level);
    return level_root + ((u64a)key >> (ks + MMB_KEY_SHIFT - 3));
}

/** \brief get our key value for the current level */
static really_inline
u32 mmbit_get_key_val_byte(u32 max_level, u32 level, u32 key) {
    return (key >> (mmbit_get_ks(max_level, level))) & 0x7;
}

/** \brief Load a flat bitvector block corresponding to N bits. */
static really_inline
MMB_TYPE mmbit_get_flat_block(const u8 *bits, u32 n_bits) {
    assert(n_bits <= MMB_KEY_BITS);
    u32 n_bytes = ROUNDUP_N(n_bits, 8) / 8;
    switch (n_bytes) {
    case 1:
        return *bits;
    case 2:
        return unaligned_load_u16(bits);
    case 3:
    case 4: {
        u32 rv;
        assert(n_bytes <= sizeof(rv));
        memcpy(&rv, bits + n_bytes - sizeof(rv), sizeof(rv));
        rv >>= (sizeof(rv) - n_bytes) * 8; /* need to shift to get things in
                                            * the right position and remove
                                            * junk */
        assert(rv == partial_load_u32(bits, n_bytes));
        return rv;
    }
    default: {
        u64a rv;
        assert(n_bytes <= sizeof(rv));
        memcpy(&rv, bits + n_bytes - sizeof(rv), sizeof(rv));
        rv >>= (sizeof(rv) - n_bytes) * 8; /* need to shift to get things in
                                            * the right position and remove
                                            * junk */
        assert(rv == partial_load_u64a(bits, n_bytes));
        return rv;
    }
    }
}

/** \brief True if this multibit is small enough to use a flat model */
static really_inline
u32 mmbit_is_flat_model(u32 total_bits) {
    return total_bits <= MMB_FLAT_MAX_BITS;
}

static really_inline
u32 mmbit_flat_size(u32 total_bits) {
    assert(mmbit_is_flat_model(total_bits));
    return ROUNDUP_N(total_bits, 8) / 8;
}

static really_inline
u32 mmbit_flat_select_byte(u32 key, UNUSED u32 total_bits) {
    return key / 8;
}

/** \brief returns the dense index of the bit in the given mask. */
static really_inline
u32 mmbit_mask_index(u32 bit, MMB_TYPE mask) {
    assert(bit < MMB_KEY_BITS);
    assert(mmb_test(mask, bit));

    mask &= mmb_mask_zero_to(bit);
    if (mask == 0ULL) {
        return 0; // Common case.
    }
    return mmb_popcount(mask);
}
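
/*
 * For example, with mask == 0x16 (bits 1, 2 and 4 set), mmbit_mask_index()
 * returns the rank of the queried bit among the set bits:
 *
 *   mmbit_mask_index(1, 0x16) == 0
 *   mmbit_mask_index(2, 0x16) == 1
 *   mmbit_mask_index(4, 0x16) == 2
 */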

/** \brief Clear all bits. */
static really_inline
void mmbit_clear(u8 *bits, u32 total_bits) {
    MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits);
    MMB_TRACE("CLEAR\n");
    if (!total_bits) {
        return;
    }
    if (mmbit_is_flat_model(total_bits)) {
        memset(bits, 0, mmbit_flat_size(total_bits));
        return;
    }
    mmb_store(bits, 0);
}

/** \brief Specialisation of \ref mmbit_set for flat models. */
static really_inline
char mmbit_set_flat(u8 *bits, u32 total_bits, u32 key) {
    bits += mmbit_flat_select_byte(key, total_bits);
    u8 mask = 1U << (key % 8);
    char was_set = !!(*bits & mask);
    *bits |= mask;
    return was_set;
}

static really_inline
char mmbit_set_big(u8 *bits, u32 total_bits, u32 key) {
    const u32 max_level = mmbit_maxlevel(total_bits);
    u32 level = 0;
    do {
        u8 *byte_ptr = mmbit_get_byte_ptr(bits, max_level, level, key);
        u8 keymask = 1U << mmbit_get_key_val_byte(max_level, level, key);
        u8 byte = *byte_ptr;
        if (likely(!(byte & keymask))) {
            *byte_ptr = byte | keymask;
            while (level++ != max_level) {
                u8 *block_ptr_1 = mmbit_get_block_ptr(bits, max_level, level, key);
                MMB_TYPE keymask_1 = mmb_single_bit(mmbit_get_key_val(max_level, level, key));
                mmb_store(block_ptr_1, keymask_1);
            }
            return 0;
        }
    } while (level++ != max_level);
    return 1;
}

/** Internal version of \ref mmbit_set without MMB_TRACE, so it can be used by
 * \ref mmbit_sparse_iter_dump. */
static really_inline
char mmbit_set_i(u8 *bits, u32 total_bits, u32 key) {
    assert(key < total_bits);
    if (mmbit_is_flat_model(total_bits)) {
        return mmbit_set_flat(bits, total_bits, key);
    } else {
        return mmbit_set_big(bits, total_bits, key);
    }
}

static really_inline
char mmbit_isset(const u8 *bits, u32 total_bits, u32 key);

/** \brief Sets the given key in the multibit. Returns 0 if the key was NOT
 * already set, 1 otherwise. */
static really_inline
char mmbit_set(u8 *bits, u32 total_bits, u32 key) {
    MDEBUG_PRINTF("%p total_bits %u key %u\n", bits, total_bits, key);
    char status = mmbit_set_i(bits, total_bits, key);
    MMB_TRACE("SET %u (prev status: %d)\n", key, (int)status);
    assert(mmbit_isset(bits, total_bits, key));
    return status;
}

/** \brief Specialisation of \ref mmbit_isset for flat models. */
static really_inline
char mmbit_isset_flat(const u8 *bits, u32 total_bits, u32 key) {
    bits += mmbit_flat_select_byte(key, total_bits);
    return !!(*bits & (1U << (key % 8U)));
}

static really_inline
char mmbit_isset_big(const u8 *bits, u32 total_bits, u32 key) {
    const u32 max_level = mmbit_maxlevel(total_bits);
    u32 level = 0;
    do {
        const u8 *block_ptr = mmbit_get_block_ptr_const(bits, max_level, level, key);
        MMB_TYPE block = mmb_load(block_ptr);
        if (!mmb_test(block, mmbit_get_key_val(max_level, level, key))) {
            return 0;
        }
    } while (level++ != max_level);
    return 1;
}

/** \brief Returns whether the given key is set. */
static really_inline
char mmbit_isset(const u8 *bits, u32 total_bits, u32 key) {
    MDEBUG_PRINTF("%p total_bits %u key %u\n", bits, total_bits, key);
    assert(key < total_bits);
    if (mmbit_is_flat_model(total_bits)) {
        return mmbit_isset_flat(bits, total_bits, key);
    } else {
        return mmbit_isset_big(bits, total_bits, key);
    }
}

/** \brief Specialisation of \ref mmbit_unset for flat models. */
static really_inline
void mmbit_unset_flat(u8 *bits, u32 total_bits, u32 key) {
    bits += mmbit_flat_select_byte(key, total_bits);
    *bits &= ~(1U << (key % 8U));
}

// TODO:
// build two versions of this - unset_dangerous that doesn't clear the summary
// block and a regular unset that actually clears ALL the way up the levels if
// possible - might make a utility function for the clear
static really_inline
void mmbit_unset_big(u8 *bits, u32 total_bits, u32 key) {
    /* This function is lazy as it does not clear the summary block
     * entry if the child becomes empty. This is not a correctness problem as the
     * summary block entries are used to mean that their children are valid
     * rather than that they have a set child. */
    const u32 max_level = mmbit_maxlevel(total_bits);
    u32 level = 0;
    do {
        u8 *block_ptr = mmbit_get_block_ptr(bits, max_level, level, key);
        u32 key_val = mmbit_get_key_val(max_level, level, key);
        MMB_TYPE block = mmb_load(block_ptr);
        if (!mmb_test(block, key_val)) {
            return;
        }
        if (level == max_level) {
            mmb_clear(&block, key_val);
            mmb_store(block_ptr, block);
        }
    } while (level++ != max_level);
}

/** \brief Switch off a given key. */
static really_inline
void mmbit_unset(u8 *bits, u32 total_bits, u32 key) {
    MDEBUG_PRINTF("%p total_bits %u key %u\n", bits, total_bits, key);
    assert(key < total_bits);
    MMB_TRACE("UNSET %u (prev status: %d)\n", key,
              (int)mmbit_isset(bits, total_bits, key));

    if (mmbit_is_flat_model(total_bits)) {
        mmbit_unset_flat(bits, total_bits, key);
    } else {
        mmbit_unset_big(bits, total_bits, key);
    }
}
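
/*
 * Minimal usage sketch for the single-key operations (assuming "bits" points
 * at a correctly sized multibit buffer; the sizing helpers live outside this
 * header):
 *
 *   mmbit_clear(bits, 1000);                 // no keys set
 *   char prev = mmbit_set(bits, 1000, 42);   // prev == 0: key was not set
 *   assert(mmbit_isset(bits, 1000, 42));
 *   mmbit_unset(bits, 1000, 42);             // switch the key back off
 */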

/** \brief Specialisation of \ref mmbit_iterate for flat models. */
static really_inline
u32 mmbit_iterate_flat(const u8 *bits, u32 total_bits, u32 it_in) {
    // Short cut for single-block cases.
    if (total_bits <= MMB_KEY_BITS) {
        MMB_TYPE block = mmbit_get_flat_block(bits, total_bits);
        if (it_in != MMB_INVALID) {
            it_in++;
            assert(it_in < total_bits);
            block &= ~mmb_mask_zero_to(it_in);
        }
        if (block) {
            return mmb_ctz(block);
        }
        return MMB_INVALID;
    }

    const u32 last_block = total_bits / MMB_KEY_BITS;
    u32 start; // starting block index

    if (it_in != MMB_INVALID) {
        it_in++;
        assert(it_in < total_bits);

        start = (ROUNDUP_N(it_in, MMB_KEY_BITS) / MMB_KEY_BITS) - 1;
        u32 start_key = start * MMB_KEY_BITS;
        u32 block_size = MIN(MMB_KEY_BITS, total_bits - start_key);
        MMB_TYPE block =
            mmbit_get_flat_block(bits + (start * sizeof(MMB_TYPE)), block_size);
        block &= ~mmb_mask_zero_to(it_in - start_key);

        if (block) {
            return start_key + mmb_ctz(block);
        } else if (start_key + MMB_KEY_BITS >= total_bits) {
            return MMB_INVALID; // That was the final block.
        }
        start++;
    } else {
        start = 0;
    }

    // Remaining full-sized blocks.
    for (; start < last_block; start++) {
        MMB_TYPE block = mmb_load(bits + (start * sizeof(MMB_TYPE)));
        if (block) {
            return (start * MMB_KEY_BITS) + mmb_ctz(block);
        }
    }

    // We may have a final, smaller than full-sized, block to deal with at the
    // end.
    if (total_bits % MMB_KEY_BITS) {
        u32 start_key = start * MMB_KEY_BITS;
        u32 block_size = MIN(MMB_KEY_BITS, total_bits - start_key);
        MMB_TYPE block =
            mmbit_get_flat_block(bits + (start * sizeof(MMB_TYPE)), block_size);
        if (block) {
            return start_key + mmb_ctz(block);
        }
    }

    return MMB_INVALID;
}

static really_inline
u32 mmbit_iterate_big(const u8 * bits, u32 total_bits, u32 it_in) {
    const u32 max_level = mmbit_maxlevel(total_bits);
    u32 level = 0;
    u32 key = 0;
    u32 key_rem = 0;

    if (it_in != MMB_INVALID) {
        // We're continuing a previous iteration, so we need to go
        // to max_level so we can pick up where we left off.
        // NOTE: assumes that we're valid down the whole tree
        key = it_in >> MMB_KEY_SHIFT;
        key_rem = (it_in & MMB_KEY_MASK) + 1;
        level = max_level;
    }
    while (1) {
        if (key_rem < MMB_KEY_BITS) {
            const u8 *block_ptr = mmbit_get_level_root_const(bits, level) +
                                  key * sizeof(MMB_TYPE);
            MMB_TYPE block
                = mmb_load(block_ptr) & ~mmb_mask_zero_to_nocheck(key_rem);
            if (block) {
                key = (key << MMB_KEY_SHIFT) + mmb_ctz(block);
                if (level++ == max_level) {
                    break;
                }
                key_rem = 0;
                continue; // jump the rootwards step if we found a 'tree' non-zero bit
            }
        }
        // rootwards step (block is zero or key_rem == MMB_KEY_BITS)
        if (level-- == 0) {
            return MMB_INVALID; // if we don't find anything and we're at the top level, we're done
        }
        key_rem = (key & MMB_KEY_MASK) + 1;
        key >>= MMB_KEY_SHIFT;
    }
    assert(key < total_bits);
    assert(mmbit_isset(bits, total_bits, key));
    return key;
}

/** \brief Unbounded iterator. Returns the index of the next set bit after \a
 * it_in, or MMB_INVALID.
 *
 * Note: assumes that if you pass in a value of it_in other than MMB_INVALID,
 * that bit must be on (assumes all its summary blocks are set).
 */
static really_inline
u32 mmbit_iterate(const u8 *bits, u32 total_bits, u32 it_in) {
    MDEBUG_PRINTF("%p total_bits %u it_in %u\n", bits, total_bits, it_in);
    assert(it_in < total_bits || it_in == MMB_INVALID);
    if (!total_bits) {
        return MMB_INVALID;
    }
    if (it_in == total_bits - 1) {
        return MMB_INVALID; // it_in is the last key.
    }

    u32 key;
    if (mmbit_is_flat_model(total_bits)) {
        key = mmbit_iterate_flat(bits, total_bits, it_in);
    } else {
        key = mmbit_iterate_big(bits, total_bits, it_in);
    }
    assert(key == MMB_INVALID || mmbit_isset(bits, total_bits, key));
    return key;
}
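
/*
 * Typical iteration pattern (a sketch, assuming "bits" and "total_bits"
 * describe a valid multibit): start from MMB_INVALID and feed each returned
 * key back in until MMB_INVALID comes back.
 *
 *   for (u32 key = mmbit_iterate(bits, total_bits, MMB_INVALID);
 *        key != MMB_INVALID; key = mmbit_iterate(bits, total_bits, key)) {
 *       // ... process set key ...
 *   }
 */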

/** \brief Specialisation of \ref mmbit_any and \ref mmbit_any_precise for flat
 * models. */
static really_inline
char mmbit_any_flat(const u8 *bits, u32 total_bits) {
    if (total_bits <= MMB_KEY_BITS) {
        return !!mmbit_get_flat_block(bits, total_bits);
    }

    const u8 *end = bits + mmbit_flat_size(total_bits);
    for (const u8 *last = end - sizeof(MMB_TYPE); bits < last;
         bits += sizeof(MMB_TYPE)) {
        if (mmb_load(bits)) {
            return 1;
        }
    }

    // Overlapping load at the end.
    return !!mmb_load(end - sizeof(MMB_TYPE));
}

/** \brief True if any keys are (or might be) on in the given multibit.
 *
 * NOTE: mmbit_any is sloppy (may return true when only summary bits are set).
 * Use \ref mmbit_any_precise if you need/want a correct answer.
 */
static really_inline
char mmbit_any(const u8 *bits, u32 total_bits) {
    MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits);
    if (!total_bits) {
        return 0;
    }
    if (mmbit_is_flat_model(total_bits)) {
        return mmbit_any_flat(bits, total_bits);
    }
    return !!mmb_load(bits);
}

/** \brief True if there are any keys on. Guaranteed precise. */
static really_inline
char mmbit_any_precise(const u8 *bits, u32 total_bits) {
    MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits);
    if (!total_bits) {
        return 0;
    }
    if (mmbit_is_flat_model(total_bits)) {
        return mmbit_any_flat(bits, total_bits);
    }

    return mmbit_iterate_big(bits, total_bits, MMB_INVALID) != MMB_INVALID;
}

static really_inline
char mmbit_all_flat(const u8 *bits, u32 total_bits) {
    while (total_bits > MMB_KEY_BITS) {
        if (mmb_load(bits) != MMB_ALL_ONES) {
            return 0;
        }
        bits += sizeof(MMB_TYPE);
        total_bits -= MMB_KEY_BITS;
    }
    while (total_bits > 8) {
        if (*bits != 0xff) {
            return 0;
        }
        bits++;
        total_bits -= 8;
    }
    u8 mask = (u8)mmb_mask_zero_to_nocheck(total_bits);
    return (*bits & mask) == mask;
}

static really_inline
char mmbit_all_big(const u8 *bits, u32 total_bits) {
    u32 ks = mmbit_keyshift(total_bits);

    u32 level = 0;
    for (;;) {
        // Number of bits we expect to see switched on at this level.
        u32 level_bits;
        if (ks != 0) {
            u32 next_level_width = MMB_KEY_BITS << (ks - MMB_KEY_SHIFT);
            level_bits = ROUNDUP_N(total_bits, next_level_width) >> ks;
        } else {
            level_bits = total_bits;
        }

        const u8 *block_ptr = mmbit_get_level_root_const(bits, level);

        // All full-size blocks should be all-ones.
        while (level_bits >= MMB_KEY_BITS) {
            MMB_TYPE block = mmb_load(block_ptr);
            if (block != MMB_ALL_ONES) {
                return 0;
            }
            block_ptr += sizeof(MMB_TYPE);
            level_bits -= MMB_KEY_BITS;
        }

        // If we have bits remaining, we have a runt block on the end.
        if (level_bits > 0) {
            MMB_TYPE block = mmb_load(block_ptr);
            MMB_TYPE mask = mmb_mask_zero_to_nocheck(level_bits);
            if ((block & mask) != mask) {
                return 0;
            }
        }

        if (ks == 0) {
            break;
        }

        ks -= MMB_KEY_SHIFT;
        level++;
    }

    return 1;
}

/** \brief True if all keys are on. Guaranteed precise. */
static really_inline
char mmbit_all(const u8 *bits, u32 total_bits) {
    MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits);

    if (mmbit_is_flat_model(total_bits)) {
        return mmbit_all_flat(bits, total_bits);
    }
    return mmbit_all_big(bits, total_bits);
}

static really_inline
MMB_TYPE get_flat_masks(u32 base, u32 it_start, u32 it_end) {
    if (it_end <= base) {
        return 0;
    }
    u32 udiff = it_end - base;
    MMB_TYPE mask = udiff < 64 ? mmb_mask_zero_to_nocheck(udiff) : MMB_ALL_ONES;
    if (it_start >= base) {
        u32 ldiff = it_start - base;
        MMB_TYPE lmask = ldiff < 64 ? ~mmb_mask_zero_to_nocheck(ldiff) : 0;
        mask &= lmask;
    }
    return mask;
}
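
/*
 * Worked example: for the flat block whose first key is 64, an iteration
 * range of [70, 100) produces
 *
 *   get_flat_masks(64, 70, 100)
 *       == (mmb_mask_zero_to_nocheck(36) & ~mmb_mask_zero_to_nocheck(6))
 *
 * i.e. bits 6..35 of the block, which correspond to keys 70..99.
 */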

/** \brief Specialisation of \ref mmbit_iterate_bounded for flat models. */
static really_inline
u32 mmbit_iterate_bounded_flat(const u8 *bits, u32 total_bits, u32 begin,
                               u32 end) {
    // Short cut for single-block cases.
    if (total_bits <= MMB_KEY_BITS) {
        MMB_TYPE block = mmbit_get_flat_block(bits, total_bits);
        block &= get_flat_masks(0, begin, end);
        if (block) {
            return mmb_ctz(block);
        }
        return MMB_INVALID;
    }

    const u32 last_block = ROUNDDOWN_N(total_bits, MMB_KEY_BITS);

    // Iterate over full-sized blocks.
    for (u32 i = ROUNDDOWN_N(begin, MMB_KEY_BITS), e = MIN(end, last_block);
         i < e; i += MMB_KEY_BITS) {
        const u8 *block_ptr = bits + i / 8;
        MMB_TYPE block = mmb_load(block_ptr);
        block &= get_flat_masks(i, begin, end);
        if (block) {
            return i + mmb_ctz(block);
        }
    }

    // Final block, which is less than full-sized.
    if (end > last_block) {
        const u8 *block_ptr = bits + last_block / 8;
        u32 num_bits = total_bits - last_block;
        MMB_TYPE block = mmbit_get_flat_block(block_ptr, num_bits);
        block &= get_flat_masks(last_block, begin, end);
        if (block) {
            return last_block + mmb_ctz(block);
        }
    }

    return MMB_INVALID;
}

static really_inline
MMB_TYPE get_lowhi_masks(u32 level, u32 max_level, u64a block_min, u64a block_max,
                         u64a block_base) {
    const u32 level_shift = (max_level - level) * MMB_KEY_SHIFT;
    u64a lshift = (block_min - block_base) >> level_shift;
    u64a ushift = (block_max - block_base) >> level_shift;
    MMB_TYPE lmask = lshift < 64 ? ~mmb_mask_zero_to_nocheck(lshift) : 0;
    MMB_TYPE umask =
        ushift < 63 ? mmb_mask_zero_to_nocheck(ushift + 1) : MMB_ALL_ONES;
    return lmask & umask;
}

static really_inline
u32 mmbit_iterate_bounded_big(const u8 *bits, u32 total_bits, u32 it_start, u32 it_end) {
    u64a key = 0;
    u32 ks = mmbit_keyshift(total_bits);
    const u32 max_level = mmbit_maxlevel_from_keyshift(ks);
    u32 level = 0;
    --it_end; // make end-limit inclusive
    for (;;) {
        assert(level <= max_level);

        u64a block_width = MMB_KEY_BITS << ks;
        u64a block_base = key * block_width;
        u64a block_min = MAX(it_start, block_base);
        u64a block_max = MIN(it_end, block_base + block_width - 1);
        const u8 *block_ptr =
            mmbit_get_level_root_const(bits, level) + key * sizeof(MMB_TYPE);
        MMB_TYPE block = mmb_load(block_ptr);
        block &= get_lowhi_masks(level, max_level, block_min, block_max, block_base);
        if (block) {
            // Found a bit, go down a level
            key = (key << MMB_KEY_SHIFT) + mmb_ctz(block);
            if (level++ == max_level) {
                return key;
            }
            ks -= MMB_KEY_SHIFT;
        } else {
            // No bit found, go up a level
            // we know that this block didn't have any answers, so we can push
            // our start iterator forward.
            u64a next_start = block_base + block_width;
            if (next_start > it_end) {
                break;
            }
            if (level-- == 0) {
                break;
            }
            it_start = next_start;
            key >>= MMB_KEY_SHIFT;
            ks += MMB_KEY_SHIFT;
        }
    }
    return MMB_INVALID;
}

/** \brief Bounded iterator. Returns the index of the first set bit between
 * it_start (inclusive) and it_end (exclusive) or MMB_INVALID if no bits are
 * set in that range.
 */
static really_inline
u32 mmbit_iterate_bounded(const u8 *bits, u32 total_bits, u32 it_start,
                          u32 it_end) {
    MDEBUG_PRINTF("%p total_bits %u it_start %u it_end %u\n", bits, total_bits,
                  it_start, it_end);
    assert(it_start <= it_end);
    assert(it_end <= total_bits);
    if (!total_bits || it_end == it_start) {
        return MMB_INVALID;
    }
    assert(it_start < total_bits);
    u32 key;
    if (mmbit_is_flat_model(total_bits)) {
        key = mmbit_iterate_bounded_flat(bits, total_bits, it_start, it_end);
    } else {
        key = mmbit_iterate_bounded_big(bits, total_bits, it_start, it_end);
    }
    assert(key == MMB_INVALID || mmbit_isset(bits, total_bits, key));
    return key;
}
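
/*
 * Sketch of counting the set keys in [begin, end), in the same style as the
 * loop in mmbit_unset_range_big() below:
 *
 *   u32 count = 0;
 *   for (u32 i = begin; i < end;) {
 *       i = mmbit_iterate_bounded(bits, total_bits, i, end);
 *       if (i == MMB_INVALID) {
 *           break;
 *       }
 *       count++;
 *       i++;
 *   }
 */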

/** \brief Specialisation of \ref mmbit_unset_range for flat models. */
static really_inline
void mmbit_unset_range_flat(u8 *bits, u32 total_bits, u32 begin, u32 end) {
    const u32 last_block = ROUNDDOWN_N(total_bits, MMB_KEY_BITS);

    // Iterate over full-sized blocks.
    for (u32 i = ROUNDDOWN_N(begin, MMB_KEY_BITS), e = MIN(end, last_block);
         i < e; i += MMB_KEY_BITS) {
        u8 *block_ptr = bits + i / 8;
        MMB_TYPE block = mmb_load(block_ptr);
        MMB_TYPE mask = get_flat_masks(i, begin, end);
        mmb_store(block_ptr, block & ~mask);
    }

    // Final block, which is less than full-sized.
    if (end > last_block) {
        u8 *block_ptr = bits + last_block / 8;
        u32 num_bits = total_bits - last_block;
        MMB_TYPE block = mmbit_get_flat_block(block_ptr, num_bits);
        MMB_TYPE mask = get_flat_masks(last_block, begin, end);
        mmb_store_partial(block_ptr, block & ~mask, num_bits);
    }
}

static really_inline
void mmbit_unset_range_big(u8 *bits, const u32 total_bits, u32 begin,
                           u32 end) {
    // TODO: combine iterator and unset operation; completely replace this
    u32 i = begin;
    for (;;) {
        i = mmbit_iterate_bounded(bits, total_bits, i, end);
        if (i == MMB_INVALID) {
            break;
        }
        mmbit_unset_big(bits, total_bits, i);
        if (++i == end) {
            break;
        }
    }
}

/** \brief Unset a whole range of bits. Ensures that all bits between \a begin
 * (inclusive) and \a end (exclusive) are switched off. */
static really_inline
void mmbit_unset_range(u8 *bits, const u32 total_bits, u32 begin, u32 end) {
    MDEBUG_PRINTF("%p total_bits %u begin %u end %u\n", bits, total_bits, begin,
                  end);
    assert(begin <= end);
    assert(end <= total_bits);
    if (mmbit_is_flat_model(total_bits)) {
        mmbit_unset_range_flat(bits, total_bits, begin, end);
    } else {
        mmbit_unset_range_big(bits, total_bits, begin, end);
    }
    // No bits are on in [begin, end) once we're done.
    assert(MMB_INVALID == mmbit_iterate_bounded(bits, total_bits, begin, end));
}

/** \brief Specialisation of \ref mmbit_init_range for flat models. */
static really_inline
void mmbit_init_range_flat(u8 *bits, const u32 total_bits, u32 begin, u32 end) {
    const u32 last_block = ROUNDDOWN_N(total_bits, MMB_KEY_BITS);

    // Iterate over full-sized blocks.
    for (u32 i = 0; i < last_block; i += MMB_KEY_BITS) {
        mmb_store(bits + i / 8, get_flat_masks(i, begin, end));
    }

    // Final block, which is less than full-sized.
    if (total_bits % MMB_KEY_BITS) {
        u32 num_bits = total_bits - last_block;
        MMB_TYPE block = get_flat_masks(last_block, begin, end);
        mmb_store_partial(bits + last_block / 8, block, num_bits);
    }
}

static really_inline
void mmbit_init_range_big(u8 *bits, const u32 total_bits, u32 begin, u32 end) {
    u32 ks = mmbit_keyshift(total_bits);
    u32 level = 0;

    for (;;) {
        u8 *block = mmbit_get_level_root(bits, level);
        u32 k1 = begin >> ks, k2 = end >> ks;

        // Summary blocks need to account for the runt block on the end.
        if ((k2 << ks) != end) {
            k2++;
        }

        // Partial block to deal with beginning.
        block += (k1 / MMB_KEY_BITS) * sizeof(MMB_TYPE);
        if (k1 % MMB_KEY_BITS) {
            u32 idx = k1 / MMB_KEY_BITS;
            u32 block_end = (idx + 1) * MMB_KEY_BITS;

            // Because k1 % MMB_KEY_BITS != 0, we can avoid checking edge cases
            // here (see the branch in mmb_mask_zero_to).
            MMB_TYPE mask = MMB_ALL_ONES << (k1 % MMB_KEY_BITS);

            if (k2 < block_end) {
                assert(k2 % MMB_KEY_BITS);
                mask &= mmb_mask_zero_to_nocheck(k2 % MMB_KEY_BITS);
                mmb_store(block, mask);
                goto next_level;
            } else {
                mmb_store(block, mask);
                k1 = block_end;
                block += sizeof(MMB_TYPE);
            }
        }

        // Write blocks filled with ones until we get to the last block.
        for (; k1 < (k2 & ~MMB_KEY_MASK); k1 += MMB_KEY_BITS) {
            mmb_store(block, MMB_ALL_ONES);
            block += sizeof(MMB_TYPE);
        }

        // Final block.
        if (likely(k1 < k2)) {
            // Again, if k2 was at a block boundary, it would have been handled
            // by the previous loop, so we know k2 % MMB_KEY_BITS != 0 and can
            // avoid the branch in mmb_mask_zero_to here.
            assert(k2 % MMB_KEY_BITS);
            MMB_TYPE mask = mmb_mask_zero_to_nocheck(k2 % MMB_KEY_BITS);
            mmb_store(block, mask);
        }

    next_level:
        if (ks == 0) {
            break; // Last level is done, finished.
        }

        ks -= MMB_KEY_SHIFT;
        level++;
    }
}

/** \brief Initialises the multibit so that only the given range of bits is
 * set.
 *
 * Ensures that all bits between \a begin (inclusive) and \a end (exclusive)
 * are switched on.
 */
static really_inline
void mmbit_init_range(u8 *bits, const u32 total_bits, u32 begin, u32 end) {
    MDEBUG_PRINTF("%p total_bits %u begin %u end %u\n", bits, total_bits, begin,
                  end);
    assert(begin <= end);
    assert(end <= total_bits);

    if (!total_bits) {
        return;
    }

    // Short cut for cases where we're not actually setting any bits; just
    // clear the multibit.
    if (begin == end) {
        mmbit_clear(bits, total_bits);
        return;
    }

    if (mmbit_is_flat_model(total_bits)) {
        mmbit_init_range_flat(bits, total_bits, begin, end);
    } else {
        mmbit_init_range_big(bits, total_bits, begin, end);
    }

    assert(begin == end ||
           mmbit_iterate(bits, total_bits, MMB_INVALID) == begin);
    assert(!end || begin == end ||
           mmbit_iterate(bits, total_bits, end - 1) == MMB_INVALID);
}
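
/*
 * For instance (a sketch): mmbit_init_range(bits, 1000, 0, 1000) switches on
 * every key, mmbit_init_range(bits, 1000, 10, 20) leaves exactly keys 10..19
 * on, and mmbit_init_range(bits, 1000, 0, 0) is equivalent to
 * mmbit_clear(bits, 1000).
 */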

/** \brief Determine the number of \ref mmbit_sparse_state elements required.
 */
static really_inline
u32 mmbit_sparse_iter_state_size(u32 total_bits) {
    if (mmbit_is_flat_model(total_bits)) {
        return 2;
    }
    u32 levels = mmbit_maxlevel(total_bits);
    return levels + 1;
}

#ifdef DUMP_SUPPORT
// Dump function, defined in multibit.c.
void mmbit_sparse_iter_dump(const struct mmbit_sparse_iter *it, u32 total_bits);
#endif

/** Internal: common loop used by mmbit_sparse_iter_{begin,next}_big. Returns
 * matching next key given starting state, or MMB_INVALID. */
static really_inline
u32 mmbit_sparse_iter_exec(const u8 *bits, u32 key, u32 *idx, u32 level,
                           const u32 max_level, struct mmbit_sparse_state *s,
                           const struct mmbit_sparse_iter *it_root,
                           const struct mmbit_sparse_iter *it) {
    for (;;) {
        MMB_TYPE block = s[level].mask;
        if (block) {
            u32 bit = mmb_ctz(block);
            key = (key << MMB_KEY_SHIFT) + bit;
            u32 bit_idx = mmbit_mask_index(bit, it->mask);
            if (level++ == max_level) {
                // we've found a key
                *idx = it->val + bit_idx;
                return key;
            } else {
                // iterator record is the start of the level (current it->val)
                // plus N, where N is the dense index of the bit in the current
                // level's itmask
                u32 iter_key = it->val + bit_idx;
                it = it_root + iter_key;
                MMB_TYPE nextblock =
                    mmb_load(mmbit_get_level_root_const(bits, level) +
                             key * sizeof(MMB_TYPE));
                s[level].mask = nextblock & it->mask;
                s[level].itkey = iter_key;
            }
        } else {
            // No bits set in this block
            if (level-- == 0) {
                break; // no key available
            }
            key >>= MMB_KEY_SHIFT;
            // Update state mask and iterator
            s[level].mask &= (s[level].mask - 1);
            it = it_root + s[level].itkey;
        }
    }
    return MMB_INVALID;
}

static really_inline
u32 mmbit_sparse_iter_begin_big(const u8 *bits, u32 total_bits, u32 *idx,
                                const struct mmbit_sparse_iter *it_root,
                                struct mmbit_sparse_state *s) {
    const struct mmbit_sparse_iter *it = it_root;
    u32 key = 0;
    MMB_TYPE block = mmb_load(bits) & it->mask;
    if (!block) {
        return MMB_INVALID;
    }

    // Load first block into top level state.
    const u32 max_level = mmbit_maxlevel(total_bits);
    s[0].mask = block;
    s[0].itkey = 0;
    return mmbit_sparse_iter_exec(bits, key, idx, 0, max_level,
                                  s, it_root, it);
}

/** \brief Specialisation of \ref mmbit_sparse_iter_begin for flat models. */
static really_inline
u32 mmbit_sparse_iter_begin_flat(const u8 *bits, u32 total_bits, u32 *idx,
                                 const struct mmbit_sparse_iter *it_root,
                                 struct mmbit_sparse_state *s) {
    // Small cases have everything in the root iterator mask.
    if (total_bits <= MMB_KEY_BITS) {
        MMB_TYPE block = mmbit_get_flat_block(bits, total_bits);
        block &= it_root->mask;
        if (!block) {
            return MMB_INVALID;
        }

        s->mask = block;
        u32 key = mmb_ctz(block);
        *idx = mmbit_mask_index(key, it_root->mask);
        return key;
    }

    // Otherwise, the root iterator mask tells us which blocks (which we lay out
    // linearly in the flat model) could contain keys.
    assert(mmbit_maxlevel(total_bits) == 1); // Should only be two levels
    MMB_TYPE root = it_root->mask;
    for (; root; root &= (root - 1)) {
        u32 bit = mmb_ctz(root);
        u32 bit_idx = mmbit_mask_index(bit, it_root->mask);
        u32 iter_key = it_root->val + bit_idx;
        const struct mmbit_sparse_iter *it = it_root + iter_key;
        u32 block_key_min = bit * MMB_KEY_BITS;
        u32 block_key_max = block_key_min + MMB_KEY_BITS;
        MMB_TYPE block;
        if (block_key_max > total_bits) {
            block_key_max = total_bits;
            block = mmbit_get_flat_block(bits + (bit * sizeof(MMB_TYPE)),
                                         block_key_max - block_key_min);
        } else {
            block = mmb_load(bits + (bit * sizeof(MMB_TYPE)));
        }

        block &= it->mask;
        if (block) {
            s[0].mask = root;
            s[1].mask = block;
            s[1].itkey = iter_key;
            u32 key = mmb_ctz(block);
            *idx = it->val + mmbit_mask_index(key, it->mask);
            return key + block_key_min;
        }
    }

    return MMB_INVALID;
}

/** \brief Sparse iterator, find first key.
 *
 * Returns the first of the bits specified by the iterator \a it_root that is
 * on, and initialises the state \a s. If none of the bits specified by the
 * iterator are on, returns MMB_INVALID.
 */
static really_inline
u32 mmbit_sparse_iter_begin(const u8 *bits, u32 total_bits, u32 *idx,
                            const struct mmbit_sparse_iter *it_root,
                            struct mmbit_sparse_state *s) {
    assert(ISALIGNED_N(it_root, alignof(struct mmbit_sparse_iter)));

    // Our state _may_ be on the stack
#ifndef _WIN32
    assert(ISALIGNED_N(s, alignof(struct mmbit_sparse_state)));
#else
    assert(ISALIGNED_N(s, 4));
#endif

    MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits);
    // iterator should have _something_ at the root level
    assert(it_root->mask != 0);
    u32 key;
    if (mmbit_is_flat_model(total_bits)) {
        key = mmbit_sparse_iter_begin_flat(bits, total_bits, idx, it_root, s);
    } else {
        key = mmbit_sparse_iter_begin_big(bits, total_bits, idx, it_root, s);
    }
    if (key != MMB_INVALID) {
        assert(key < total_bits);
        assert(mmbit_isset(bits, total_bits, key));
    }
    return key;
}

static really_inline
u32 mmbit_sparse_iter_next_big(const u8 *bits, u32 total_bits, u32 last_key,
                               u32 *idx,
                               const struct mmbit_sparse_iter *it_root,
                               struct mmbit_sparse_state *s) {
    const u32 max_level = mmbit_maxlevel(total_bits);
    u32 key = last_key >> MMB_KEY_SHIFT;
    s[max_level].mask &= (s[max_level].mask - 1);
    const struct mmbit_sparse_iter *it = it_root + s[max_level].itkey;
    return mmbit_sparse_iter_exec(bits, key, idx, max_level, max_level, s,
                                  it_root, it);
}

/** \brief Specialisation of \ref mmbit_sparse_iter_next for flat models. */
static really_inline
u32 mmbit_sparse_iter_next_flat(const u8 *bits, const u32 total_bits, u32 *idx,
                                const struct mmbit_sparse_iter *it_root,
                                struct mmbit_sparse_state *s) {
    if (total_bits <= MMB_KEY_BITS) {
        // All of our data is already in the s->mask, so we just need to scrape
        // off the next match.
        s->mask &= (s->mask - 1);
        if (s->mask) {
            u32 key = mmb_ctz(s->mask);
            *idx = mmbit_mask_index(key, it_root->mask);
            return key;
        }
    } else {
        assert(s[0].mask);

        s[1].mask &= (s[1].mask - 1); // Remove previous key from iter state.
        u32 bit = mmb_ctz(s[0].mask); // Flat block currently being accessed.

        for (;;) {
            if (s[1].mask) {
                u32 key = mmb_ctz(s[1].mask);
                const struct mmbit_sparse_iter *it = it_root + s[1].itkey;
                *idx = it->val + mmbit_mask_index(key, it->mask);
                key += (bit * MMB_KEY_BITS);
                return key;
            }

            // Otherwise, we have no keys left in this block. Consult the root
            // mask and find the next one.

            s[0].mask &= s[0].mask - 1;
            if (!s[0].mask) {
                break;
            }

            bit = mmb_ctz(s[0].mask);
            u32 bit_idx = mmbit_mask_index(bit, it_root->mask);
            u32 iter_key = it_root->val + bit_idx;
            const struct mmbit_sparse_iter *it = it_root + iter_key;
            u32 block_key_min = bit * MMB_KEY_BITS;
            u32 block_key_max = block_key_min + MMB_KEY_BITS;
            MMB_TYPE block;
            if (block_key_max > total_bits) {
                block_key_max = total_bits;
                block = mmbit_get_flat_block(bits + (bit * sizeof(MMB_TYPE)),
                                             block_key_max - block_key_min);
            } else {
                block = mmb_load(bits + (bit * sizeof(MMB_TYPE)));
            }

            s[1].mask = block & it->mask;
            s[1].itkey = iter_key;
        }
    }

    return MMB_INVALID;
}

/** \brief Sparse iterator, find next key.
 *
 * Takes in a sparse iterator tree structure \a it_root and a state array, and
 * finds the next on bit (from the set of bits specified in the iterator).
 *
 * NOTE: The sparse iterator stores copies of the multibit blocks in its state,
 * so it is not necessarily safe to set or unset bits in the multibit while
 * iterating: the changes you make may or may not be taken into account
 * by the iterator.
 */
static really_inline
u32 mmbit_sparse_iter_next(const u8 *bits, u32 total_bits, u32 last_key,
                           u32 *idx, const struct mmbit_sparse_iter *it_root,
                           struct mmbit_sparse_state *s) {
    assert(ISALIGNED_N(it_root, alignof(struct mmbit_sparse_iter)));

    // Our state _may_ be on the stack
#ifndef _WIN32
    assert(ISALIGNED_N(s, alignof(struct mmbit_sparse_state)));
#else
    assert(ISALIGNED_N(s, 4));
#endif

    MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits);
    MDEBUG_PRINTF("NEXT (total_bits=%u, last_key=%u)\n", total_bits, last_key);
    UNUSED u32 last_idx = *idx; // for assertion at the end
    // our iterator should have _something_ at the root level
    assert(it_root->mask != 0);
    assert(last_key < total_bits);

    u32 key;
    if (mmbit_is_flat_model(total_bits)) {
        key = mmbit_sparse_iter_next_flat(bits, total_bits, idx, it_root, s);
    } else {
        key = mmbit_sparse_iter_next_big(bits, total_bits, last_key, idx,
                                         it_root, s);
    }
    if (key != MMB_INVALID) {
        MDEBUG_PRINTF("END NEXT: key=%u, idx=%u\n", key, *idx);
        assert(key < total_bits);
        assert(key > last_key);
        assert(mmbit_isset(bits, total_bits, key));
        assert(*idx > last_idx);
    } else {
        MDEBUG_PRINTF("END NEXT: no more keys\n");
    }
    return key;
}
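
/*
 * Sketch of a full sparse-iteration loop. The iterator structure "it" is
 * assumed to have been built ahead of time (see mmbit_sparse_iter in
 * multibit_internal.h); the state array must have at least
 * mmbit_sparse_iter_state_size(total_bits) elements.
 *
 *   struct mmbit_sparse_state state[MAX_SPARSE_ITER_STATES];
 *   u32 idx = 0;
 *   for (u32 key = mmbit_sparse_iter_begin(bits, total_bits, &idx, it, state);
 *        key != MMB_INVALID;
 *        key = mmbit_sparse_iter_next(bits, total_bits, key, &idx, it, state)) {
 *       // key is a set bit that is also in the iterator; idx is its dense
 *       // index within the iterator's key set
 *   }
 */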

/** \brief Specialisation of \ref mmbit_sparse_iter_unset for flat models. */
static really_inline
void mmbit_sparse_iter_unset_flat(u8 *bits, u32 total_bits,
                                  const struct mmbit_sparse_iter *it_root) {
    if (total_bits <= MMB_KEY_BITS) {
        // Everything is in the root mask: we can just mask those bits off.
        MMB_TYPE block = mmbit_get_flat_block(bits, total_bits);
        block &= ~it_root->mask;
        mmb_store_partial(bits, block, total_bits);
        return;
    }

    // Larger case, we have two iterator levels to worry about.
    u32 bit_idx = 0;
    for (MMB_TYPE root = it_root->mask; root; root &= (root - 1), bit_idx++) {
        u32 bit = mmb_ctz(root);
        u32 block_key_min = bit * MMB_KEY_BITS;
        u32 block_key_max = block_key_min + MMB_KEY_BITS;
        u8 *block_ptr = bits + (bit * sizeof(MMB_TYPE));
        u32 iter_key = it_root->val + bit_idx;
        const struct mmbit_sparse_iter *it = it_root + iter_key;
        if (block_key_max <= total_bits) {
            // Full-sized block.
            MMB_TYPE block = mmb_load(block_ptr);
            block &= ~it->mask;
            mmb_store(block_ptr, block);
        } else {
            // Runt (final) block.
            u32 num_bits = total_bits - block_key_min;
            MMB_TYPE block = mmbit_get_flat_block(block_ptr, num_bits);
            block &= ~it->mask;
            mmb_store_partial(block_ptr, block, num_bits);
            break; // We know this is the last block.
        }
    }
}

static really_inline
void mmbit_sparse_iter_unset_big(u8 *bits, u32 total_bits,
                                 const struct mmbit_sparse_iter *it_root,
                                 struct mmbit_sparse_state *s) {
    const struct mmbit_sparse_iter *it = it_root;
    MMB_TYPE block = mmb_load(bits) & it->mask;
    if (!block) {
        return;
    }

    u32 key = 0;
    const u32 max_level = mmbit_maxlevel(total_bits);
    u32 level = 0;

    // Load first block into top level state
    s[level].mask = block;
    s[level].itkey = 0;
    for (;;) {
        block = s[level].mask;
        if (block) {
            if (level == max_level) {
                // bottom level block: we want to mask out the bits specified
                // by the iterator mask and then go back up a level.
                u8 *block_ptr =
                    mmbit_get_level_root(bits, level) + key * sizeof(MMB_TYPE);
                MMB_TYPE real_block = mmb_load(block_ptr);
                real_block &= ~(it->mask);
                mmb_store(block_ptr, real_block);
                goto uplevel; // still cheap and nasty
            } else {
                u32 bit = mmb_ctz(block);
                key = (key << MMB_KEY_SHIFT) + bit;
                level++;

                // iterator record is the start of the level (current it->val)
                // plus N, where N is the dense index of the bit in the current
                // level's itmask
                u32 iter_key = it->val + mmbit_mask_index(bit, it->mask);
                it = it_root + iter_key;
                MMB_TYPE nextblock =
                    mmb_load(mmbit_get_level_root_const(bits, level) +
                             key * sizeof(MMB_TYPE));
                s[level].mask = nextblock & it->mask;
                s[level].itkey = iter_key;
            }
        } else {
uplevel:
            // No bits set in this block
            if (level == 0) {
                return; // we are done
            }
            u8 *block_ptr =
                mmbit_get_level_root(bits, level) + key * sizeof(MMB_TYPE);
            MMB_TYPE real_block = mmb_load(block_ptr);
            key >>= MMB_KEY_SHIFT;
            level--;

            if (real_block == 0) {
                // If we've zeroed our block For Real (unmasked by iterator),
                // we can clear the parent bit that led us to it, so that
                // we don't go down this particular garden path again later.
                u32 bit = mmb_ctz(s[level].mask);
                u8 *parent_ptr =
                    mmbit_get_level_root(bits, level) + key * sizeof(MMB_TYPE);
                MMB_TYPE parent_block = mmb_load(parent_ptr);
                mmb_clear(&parent_block, bit);
                mmb_store(parent_ptr, parent_block);
            }

            // Update state mask and iterator
            s[level].mask &= (s[level].mask - 1);
            it = it_root + s[level].itkey;
        }
    }
}

/** \brief Sparse iterator, unset all bits.
 *
 * Takes in a sparse iterator tree structure and switches off any entries found
 * therein.
 */
static really_inline
void mmbit_sparse_iter_unset(u8 *bits, u32 total_bits,
                             const struct mmbit_sparse_iter *it,
                             struct mmbit_sparse_state *s) {
    assert(ISALIGNED_N(it, alignof(struct mmbit_sparse_iter)));

    // Our state _may_ be on the stack
#ifndef _WIN32
    assert(ISALIGNED_N(s, alignof(struct mmbit_sparse_state)));
#else
    assert(ISALIGNED_N(s, 4));
#endif

    MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits);

#ifdef MMB_TRACE_WRITES
    MMB_TRACE("ITER-UNSET iter=[");
    mmbit_sparse_iter_dump(it, total_bits);
    printf("] actually on=[");
    struct mmbit_sparse_state tmp[MAX_SPARSE_ITER_STATES];
    u32 idx = 0;
    u32 i = mmbit_sparse_iter_begin(bits, total_bits, &idx, it, tmp);
    for (; i != MMB_INVALID;
         i = mmbit_sparse_iter_next(bits, total_bits, i, &idx, it, tmp)) {
        printf(" %u", i);
    }
    printf("]\n");
#endif

    if (mmbit_is_flat_model(total_bits)) {
        mmbit_sparse_iter_unset_flat(bits, total_bits, it);
    } else {
        mmbit_sparse_iter_unset_big(bits, total_bits, it, s);
    }
}

#ifdef __cplusplus
} // extern "C"
#endif

#endif // MULTIBIT_H