bitutils.h source code [ClickHouse/contrib/hyperscan/src/util/bitutils.h]

1	/*
2	* Copyright (c) 2015-2017, Intel Corporation
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions are met:
6	*
7	* * Redistributions of source code must retain the above copyright notice,
8	* this list of conditions and the following disclaimer.
9	* * Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	* * Neither the name of Intel Corporation nor the names of its contributors
13	* may be used to endorse or promote products derived from this software
14	* without specific prior written permission.
15	*
16	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20	* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26	* POSSIBILITY OF SUCH DAMAGE.
27	*/
28
29	/* \file*
30	* \brief Bit-twiddling primitives (ctz, compress etc)
31	*/
32
33	#ifndef BITUTILS_H
34	#define BITUTILS_H
35
36	#include "ue2common.h"
37	#include "popcount.h"
38	#include "util/arch.h"
39	#include "util/intrinsics.h"
40
41	#define CASE_BIT 0x20
42	#define CASE_CLEAR 0xdf
43	#define DOUBLE_CASE_CLEAR 0xdfdf
44	#define OCTO_CASE_CLEAR 0xdfdfdfdfdfdfdfdfULL
45
46	static really_inline
47	u32 clz32(u32 x) {
48	assert(x); // behaviour not defined for x == 0
49	#if defined(_WIN32)
50	unsigned long r;
51	_BitScanReverse(&r, x);
52	return `31` - r;
53	#else
54	return (u32)__builtin_clz(x);
55	#endif
56	}
57
58	static really_inline
59	u32 clz64(u64a x) {
60	assert(x); // behaviour not defined for x == 0
61	#if defined(_WIN64)
62	unsigned long r;
63	_BitScanReverse64(&r, x);
64	return `63` - r;
65	#elif defined(_WIN32)
66	unsigned long x1 = (u32)x;
67	unsigned long x2 = (u32)(x >> `32`);
68	unsigned long r;
69	if (x2) {
70	_BitScanReverse(&r, x2);
71	return (u32)(`31` - r);
72	}
73	_BitScanReverse(&r, (u32)x1);
74	return (u32)(`63` - r);
75	#else
76	return (u32)__builtin_clzll(x);
77	#endif
78	}
79
80	// CTZ (count trailing zero) implementations.
81	static really_inline
82	u32 ctz32(u32 x) {
83	assert(x); // behaviour not defined for x == 0
84	#if defined(_WIN32)
85	unsigned long r;
86	_BitScanForward(&r, x);
87	return r;
88	#else
89	return (u32)__builtin_ctz(x);
90	#endif
91	}
92
93	static really_inline
94	u32 ctz64(u64a x) {
95	assert(x); // behaviour not defined for x == 0
96	#if defined(_WIN64)
97	unsigned long r;
98	_BitScanForward64(&r, x);
99	return r;
100	#elif defined(_WIN32)
101	unsigned long r;
102	if (_BitScanForward(&r, (u32)x)) {
103	return (u32)r;
104	}
105	_BitScanForward(&r, x >> `32`);
106	return (u32)(r + `32`);
107	#else
108	return (u32)__builtin_ctzll(x);
109	#endif
110	}
111
112	static really_inline
113	u32 lg2(u32 x) {
114	if (!x) {
115	return `0`;
116	}
117	return `31` - clz32(x);
118	}
119
120	static really_inline
121	u64a lg2_64(u64a x) {
122	if (!x) {
123	return `0`;
124	}
125	return `63` - clz64(x);
126	}
127
128	static really_inline
129	u32 findAndClearLSB_32(u32 *v) {
130	assert(v != `0`); // behaviour not defined in this case*
131	#ifndef NO_ASM
132	u32 val = *v, offset;
133	__asm__ ("bsf %1, %0\n"
134	"btr %0, %1\n"
135	: "=r" (offset), "=r" (val)
136	: "1" (val));
137	*v = val;
138	#else
139	u32 val = *v;
140	u32 offset = ctz32(val);
141	*v = val & (val - `1`);
142	#endif
143
144	assert(offset < `32`);
145	return offset;
146	}
147
148	static really_inline
149	u32 findAndClearLSB_64(u64a *v) {
150	assert(v != `0`); // behaviour not defined in this case*
151
152	#ifdef ARCH_64_BIT
153	#if defined(ARCH_X86_64) && !defined(NO_ASM)
154	u64a val = *v, offset;
155	__asm__ ("bsfq %1, %0\n"
156	"btrq %0, %1\n"
157	: "=r" (offset), "=r" (val)
158	: "1" (val));
159	*v = val;
160	#else
161	// generic variant using gcc's builtin on 64-bit
162	u64a val = *v, offset;
163	offset = ctz64(val);
164	*v = val & (val - `1`);
165	#endif // ARCH_X86_64
166	#else
167	// fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
168	// inline calls to __builtin_ctzll
169	u32 v1 = (u32)*v;
170	u32 v2 = (u32)(*v >> `32`);
171	u32 offset;
172	if (v1) {
173	offset = findAndClearLSB_32(&v1);
174	*v = (u64a)v1 \| ((u64a)v2 << `32`);
175	} else {
176	offset = findAndClearLSB_32(&v2) + `32`;
177	*v = (u64a)v2 << `32`;
178	}
179	#endif
180
181	assert(offset < `64`);
182	return (u32)offset;
183	}
184
185	static really_inline
186	u32 findAndClearMSB_32(u32 *v) {
187	assert(v != `0`); // behaviour not defined in this case*
188	#ifndef NO_ASM
189	u32 val = *v, offset;
190	__asm__ ("bsr %1, %0\n"
191	"btr %0, %1\n"
192	: "=r" (offset), "=r" (val)
193	: "1" (val));
194	*v = val;
195	#else
196	u32 val = *v;
197	u32 offset = `31` - clz32(val);
198	*v = val & ~(`1` << offset);
199	#endif
200	assert(offset < `32`);
201	return offset;
202	}
203
204	static really_inline
205	u32 findAndClearMSB_64(u64a *v) {
206	assert(v != `0`); // behaviour not defined in this case*
207
208	#ifdef ARCH_64_BIT
209	#if defined(ARCH_X86_64) && !defined(NO_ASM)
210	u64a val = *v, offset;
211	__asm__ ("bsrq %1, %0\n"
212	"btrq %0, %1\n"
213	: "=r" (offset), "=r" (val)
214	: "1" (val));
215	*v = val;
216	#else
217	// generic variant using gcc's builtin on 64-bit
218	u64a val = *v, offset;
219	offset = `63` - clz64(val);
220	*v = val & ~(`1ULL` << offset);
221	#endif // ARCH_X86_64
222	#else
223	// fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
224	// inline calls to __builtin_ctzll
225	u32 v1 = (u32)*v;
226	u32 v2 = (*v >> `32`);
227	u32 offset;
228	if (v2) {
229	offset = findAndClearMSB_32(&v2) + `32`;
230	*v = ((u64a)v2 << `32`) \| (u64a)v1;
231	} else {
232	offset = findAndClearMSB_32(&v1);
233	*v = (u64a)v1;
234	}
235	#endif
236
237	assert(offset < `64`);
238	return (u32)offset;
239	}
240
241	static really_inline
242	u32 compress32(u32 x, u32 m) {
243	#if defined(HAVE_BMI2)
244	// BMI2 has a single instruction for this operation.
245	return _pext_u32(x, m);
246	#else
247
248	// Return zero quickly on trivial cases
249	if ((x & m) == `0`) {
250	return `0`;
251	}
252
253	u32 mk, mp, mv, t;
254
255	x &= m; // clear irrelevant bits
256
257	mk = ~m << `1`; // we will count 0's to right
258	for (u32 i = `0`; i < `5`; i++) {
259	mp = mk ^ (mk << `1`);
260	mp ^= mp << `2`;
261	mp ^= mp << `4`;
262	mp ^= mp << `8`;
263	mp ^= mp << `16`;
264
265	mv = mp & m; // bits to move
266	m = (m ^ mv) \| (mv >> (`1` << i)); // compress m
267	t = x & mv;
268	x = (x ^ t) \| (t >> (`1` << i)); // compress x
269	mk = mk & ~mp;
270	}
271
272	return x;
273	#endif
274	}
275
276	static really_inline
277	u64a compress64(u64a x, u64a m) {
278	#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
279	// BMI2 has a single instruction for this operation.
280	return _pext_u64(x, m);
281	#else
282
283	// Return zero quickly on trivial cases
284	if ((x & m) == `0`) {
285	return `0`;
286	}
287
288	u64a mk, mp, mv, t;
289
290	x &= m; // clear irrelevant bits
291
292	mk = ~m << `1`; // we will count 0's to right
293	for (u32 i = `0`; i < `6`; i++) {
294	mp = mk ^ (mk << `1`);
295	mp ^= mp << `2`;
296	mp ^= mp << `4`;
297	mp ^= mp << `8`;
298	mp ^= mp << `16`;
299	mp ^= mp << `32`;
300
301	mv = mp & m; // bits to move
302	m = (m ^ mv) \| (mv >> (`1` << i)); // compress m
303	t = x & mv;
304	x = (x ^ t) \| (t >> (`1` << i)); // compress x
305	mk = mk & ~mp;
306	}
307
308	return x;
309	#endif
310	}
311
312	static really_inline
313	u32 expand32(u32 x, u32 m) {
314	#if defined(HAVE_BMI2)
315	// BMI2 has a single instruction for this operation.
316	return _pdep_u32(x, m);
317	#else
318
319	// Return zero quickly on trivial cases
320	if (!x \|\| !m) {
321	return `0`;
322	}
323
324	u32 m0, mk, mp, mv, t;
325	u32 array[`5`];
326
327	m0 = m; // save original mask
328	mk = ~m << `1`; // we will count 0's to right
329
330	for (int i = `0`; i < `5`; i++) {
331	mp = mk ^ (mk << `1`); // parallel suffix
332	mp = mp ^ (mp << `2`);
333	mp = mp ^ (mp << `4`);
334	mp = mp ^ (mp << `8`);
335	mp = mp ^ (mp << `16`);
336	mv = mp & m; // bits to move
337	array[i] = mv;
338	m = (m ^ mv) \| (mv >> (`1` << i)); // compress m
339	mk = mk & ~mp;
340	}
341
342	for (int i = `4`; i >= `0`; i--) {
343	mv = array[i];
344	t = x << (`1` << i);
345	x = (x & ~mv) \| (t & mv);
346	}
347
348	return x & m0; // clear out extraneous bits
349	#endif
350	}
351
352	static really_inline
353	u64a expand64(u64a x, u64a m) {
354	#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
355	// BMI2 has a single instruction for this operation.
356	return _pdep_u64(x, m);
357	#else
358
359	// Return zero quickly on trivial cases
360	if (!x \|\| !m) {
361	return `0`;
362	}
363
364	u64a m0, mk, mp, mv, t;
365	u64a array[`6`];
366
367	m0 = m; // save original mask
368	mk = ~m << `1`; // we will count 0's to right
369
370	for (int i = `0`; i < `6`; i++) {
371	mp = mk ^ (mk << `1`); // parallel suffix
372	mp = mp ^ (mp << `2`);
373	mp = mp ^ (mp << `4`);
374	mp = mp ^ (mp << `8`);
375	mp = mp ^ (mp << `16`);
376	mp = mp ^ (mp << `32`);
377	mv = mp & m; // bits to move
378	array[i] = mv;
379	m = (m ^ mv) \| (mv >> (`1` << i)); // compress m
380	mk = mk & ~mp;
381	}
382
383	for (int i = `5`; i >= `0`; i--) {
384	mv = array[i];
385	t = x << (`1` << i);
386	x = (x & ~mv) \| (t & mv);
387	}
388
389	return x & m0; // clear out extraneous bits
390	#endif
391	}
392
393
394	/ returns the first set bit after begin (if not ~0U). If no bit is set after*
395	* begin returns ~0U
396	*/
397	static really_inline
398	u32 bf64_iterate(u64a bitfield, u32 begin) {
399	if (begin != ~`0U`) {
400	/ switch off all bits at or below begin. Note: not legal to shift by*
401	* by size of the datatype or larger. */
402	assert(begin <= `63`);
403	bitfield &= ~((`2ULL` << begin) - `1`);
404	}
405
406	if (!bitfield) {
407	return ~`0U`;
408	}
409
410	return ctz64(bitfield);
411	}
412
413	static really_inline
414	char bf64_set(u64a *bitfield, u32 i) {
415	assert(i < `64`);
416	u64a mask = `1ULL` << i;
417	char was_set = !!(*bitfield & mask);
418	*bitfield \|= mask;
419
420	return was_set;
421	}
422
423	static really_inline
424	void bf64_unset(u64a *bitfield, u32 i) {
425	assert(i < `64`);
426	*bitfield &= ~(`1ULL` << i);
427	}
428
429	static really_inline
430	u32 rank_in_mask32(u32 mask, u32 bit) {
431	assert(bit < sizeof(u32) * `8`);
432	assert(mask & (u32)(`1U` << bit));
433	mask &= (u32)(`1U` << bit) - `1`;
434	return popcount32(mask);
435	}
436
437	static really_inline
438	u32 rank_in_mask64(u64a mask, u32 bit) {
439	assert(bit < sizeof(u64a) * `8`);
440	assert(mask & (u64a)(`1ULL` << bit));
441	mask &= (u64a)(`1ULL` << bit) - `1`;
442	return popcount64(mask);
443	}
444
445	static really_inline
446	u32 pext32(u32 x, u32 mask) {
447	#if defined(HAVE_BMI2)
448	// Intel BMI2 can do this operation in one instruction.
449	return _pext_u32(x, mask);
450	#else
451
452	u32 result = `0`, num = `1`;
453	while (mask != `0`) {
454	u32 bit = findAndClearLSB_32(&mask);
455	if (x & (`1U` << bit)) {
456	assert(num != `0`); // more than 32 bits!
457	result \|= num;
458	}
459	num <<= `1`;
460	}
461	return result;
462	#endif
463	}
464
465	static really_inline
466	u64a pext64(u64a x, u64a mask) {
467	#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
468	// Intel BMI2 can do this operation in one instruction.
469	return _pext_u64(x, mask);
470	#else
471
472	u32 result = `0`, num = `1`;
473	while (mask != `0`) {
474	u32 bit = findAndClearLSB_64(&mask);
475	if (x & (`1ULL` << bit)) {
476	assert(num != `0`); // more than 32 bits!
477	result \|= num;
478	}
479	num <<= `1`;
480	}
481	return result;
482	#endif
483	}
484
485	#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
486	static really_inline
487	u64a pdep64(u64a x, u64a mask) {
488	return _pdep_u64(x, mask);
489	}
490	#endif
491
492	#endif // BITUTILS_H
493

Browse the source code of ClickHouse/contrib/hyperscan/src/util/bitutils.h