/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Bit-twiddling primitives (ctz, compress etc)
 */

#ifndef BITUTILS_H
#define BITUTILS_H

#include "ue2common.h"
#include "popcount.h"
#include "util/arch.h"
#include "util/intrinsics.h"

#define CASE_BIT 0x20
#define CASE_CLEAR 0xdf
#define DOUBLE_CASE_CLEAR 0xdfdf
#define OCTO_CASE_CLEAR 0xdfdfdfdfdfdfdfdfULL
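
/* Example (illustrative): for an ASCII letter c, (c & CASE_CLEAR) forces it
 * to upper case and (c | CASE_BIT) forces it to lower case, since the two
 * cases differ only in bit 5:
 *     ('a' & CASE_CLEAR) == 'A'   // 0x61 & 0xdf == 0x41
 *     ('A' | CASE_BIT)   == 'a'   // 0x41 | 0x20 == 0x61
 * DOUBLE_CASE_CLEAR and OCTO_CASE_CLEAR apply the same mask to two and eight
 * packed bytes respectively.
 */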

static really_inline
u32 clz32(u32 x) {
    assert(x); // behaviour not defined for x == 0
#if defined(_WIN32)
    unsigned long r;
    _BitScanReverse(&r, x);
    return 31 - r;
#else
    return (u32)__builtin_clz(x);
#endif
}

static really_inline
u32 clz64(u64a x) {
    assert(x); // behaviour not defined for x == 0
#if defined(_WIN64)
    unsigned long r;
    _BitScanReverse64(&r, x);
    return 63 - r;
#elif defined(_WIN32)
    unsigned long x1 = (u32)x;
    unsigned long x2 = (u32)(x >> 32);
    unsigned long r;
    if (x2) {
        _BitScanReverse(&r, x2);
        return (u32)(31 - r);
    }
    _BitScanReverse(&r, (u32)x1);
    return (u32)(63 - r);
#else
    return (u32)__builtin_clzll(x);
#endif
}

// CTZ (count trailing zero) implementations.
static really_inline
u32 ctz32(u32 x) {
    assert(x); // behaviour not defined for x == 0
#if defined(_WIN32)
    unsigned long r;
    _BitScanForward(&r, x);
    return r;
#else
    return (u32)__builtin_ctz(x);
#endif
}

static really_inline
u32 ctz64(u64a x) {
    assert(x); // behaviour not defined for x == 0
#if defined(_WIN64)
    unsigned long r;
    _BitScanForward64(&r, x);
    return r;
#elif defined(_WIN32)
    unsigned long r;
    if (_BitScanForward(&r, (u32)x)) {
        return (u32)r;
    }
    _BitScanForward(&r, x >> 32);
    return (u32)(r + 32);
#else
    return (u32)__builtin_ctzll(x);
#endif
}
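
/* Illustrative values (sketch): for an argument with only bit 16 set,
 *     clz32(0x00010000) == 15   // 15 leading zero bits above bit 16
 *     ctz32(0x00010000) == 16   // 16 trailing zero bits below bit 16
 * Both families require a non-zero argument.
 */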

static really_inline
u32 lg2(u32 x) {
    if (!x) {
        return 0;
    }
    return 31 - clz32(x);
}

static really_inline
u64a lg2_64(u64a x) {
    if (!x) {
        return 0;
    }
    return 63 - clz64(x);
}
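
/* lg2() and lg2_64() return floor(log2(x)) for non-zero x, and 0 for x == 0.
 * Illustrative values: lg2(1) == 0, lg2(1000) == 9, lg2(1024) == 10.
 */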

static really_inline
u32 findAndClearLSB_32(u32 *v) {
    assert(*v != 0); // behaviour not defined in this case
#ifndef NO_ASM
    u32 val = *v, offset;
    __asm__ ("bsf %1, %0\n"
             "btr %0, %1\n"
             : "=r" (offset), "=r" (val)
             : "1" (val));
    *v = val;
#else
    u32 val = *v;
    u32 offset = ctz32(val);
    *v = val & (val - 1);
#endif

    assert(offset < 32);
    return offset;
}

static really_inline
u32 findAndClearLSB_64(u64a *v) {
    assert(*v != 0); // behaviour not defined in this case

#ifdef ARCH_64_BIT
#if defined(ARCH_X86_64) && !defined(NO_ASM)
    u64a val = *v, offset;
    __asm__ ("bsfq %1, %0\n"
             "btrq %0, %1\n"
             : "=r" (offset), "=r" (val)
             : "1" (val));
    *v = val;
#else
    // generic variant using gcc's builtin on 64-bit
    u64a val = *v, offset;
    offset = ctz64(val);
    *v = val & (val - 1);
#endif // ARCH_X86_64
#else
    // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
    // inline calls to __builtin_ctzll
    u32 v1 = (u32)*v;
    u32 v2 = (u32)(*v >> 32);
    u32 offset;
    if (v1) {
        offset = findAndClearLSB_32(&v1);
        *v = (u64a)v1 | ((u64a)v2 << 32);
    } else {
        offset = findAndClearLSB_32(&v2) + 32;
        *v = (u64a)v2 << 32;
    }
#endif

    assert(offset < 64);
    return (u32)offset;
}
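
/* Illustrative usage (sketch): findAndClearLSB_32 visits set bits from lowest
 * to highest, clearing each one as it is returned:
 *
 *     u32 bits = 0x28;                         // bits 3 and 5 set
 *     while (bits) {
 *         u32 idx = findAndClearLSB_32(&bits); // yields 3, then 5
 *     }
 *
 * findAndClearMSB_32/_64 below behave the same way, starting from the high end.
 */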

static really_inline
u32 findAndClearMSB_32(u32 *v) {
    assert(*v != 0); // behaviour not defined in this case
#ifndef NO_ASM
    u32 val = *v, offset;
    __asm__ ("bsr %1, %0\n"
             "btr %0, %1\n"
             : "=r" (offset), "=r" (val)
             : "1" (val));
    *v = val;
#else
    u32 val = *v;
    u32 offset = 31 - clz32(val);
    *v = val & ~(1U << offset);
#endif
    assert(offset < 32);
    return offset;
}

static really_inline
u32 findAndClearMSB_64(u64a *v) {
    assert(*v != 0); // behaviour not defined in this case

#ifdef ARCH_64_BIT
#if defined(ARCH_X86_64) && !defined(NO_ASM)
    u64a val = *v, offset;
    __asm__ ("bsrq %1, %0\n"
             "btrq %0, %1\n"
             : "=r" (offset), "=r" (val)
             : "1" (val));
    *v = val;
#else
    // generic variant using gcc's builtin on 64-bit
    u64a val = *v, offset;
    offset = 63 - clz64(val);
    *v = val & ~(1ULL << offset);
#endif // ARCH_X86_64
#else
    // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
    // inline calls to __builtin_clzll
    u32 v1 = (u32)*v;
    u32 v2 = (u32)(*v >> 32);
    u32 offset;
    if (v2) {
        offset = findAndClearMSB_32(&v2) + 32;
        *v = ((u64a)v2 << 32) | (u64a)v1;
    } else {
        offset = findAndClearMSB_32(&v1);
        *v = (u64a)v1;
    }
#endif

    assert(offset < 64);
    return (u32)offset;
}

static really_inline
u32 compress32(u32 x, u32 m) {
#if defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pext_u32(x, m);
#else

    // Return zero quickly on trivial cases
    if ((x & m) == 0) {
        return 0;
    }

    u32 mk, mp, mv, t;

    x &= m; // clear irrelevant bits

    mk = ~m << 1; // we will count 0's to right
    for (u32 i = 0; i < 5; i++) {
        mp = mk ^ (mk << 1);
        mp ^= mp << 2;
        mp ^= mp << 4;
        mp ^= mp << 8;
        mp ^= mp << 16;

        mv = mp & m; // bits to move
        m = (m ^ mv) | (mv >> (1 << i)); // compress m
        t = x & mv;
        x = (x ^ t) | (t >> (1 << i)); // compress x
        mk = mk & ~mp;
    }

    return x;
#endif
}
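
/* compress32/compress64 gather the bits of x selected by the mask m and pack
 * them into the low-order end of the result, in order (the same semantics as
 * the BMI2 PEXT instruction). Illustrative example:
 *
 *     compress32(0xb0, 0xf0) == 0x0b   // bits 4..7 of x moved down to bits 0..3
 *
 * The non-BMI2 fallback is a parallel-suffix routine (cf. Hacker's Delight).
 */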

static really_inline
u64a compress64(u64a x, u64a m) {
#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pext_u64(x, m);
#else

    // Return zero quickly on trivial cases
    if ((x & m) == 0) {
        return 0;
    }

    u64a mk, mp, mv, t;

    x &= m; // clear irrelevant bits

    mk = ~m << 1; // we will count 0's to right
    for (u32 i = 0; i < 6; i++) {
        mp = mk ^ (mk << 1);
        mp ^= mp << 2;
        mp ^= mp << 4;
        mp ^= mp << 8;
        mp ^= mp << 16;
        mp ^= mp << 32;

        mv = mp & m; // bits to move
        m = (m ^ mv) | (mv >> (1 << i)); // compress m
        t = x & mv;
        x = (x ^ t) | (t >> (1 << i)); // compress x
        mk = mk & ~mp;
    }

    return x;
#endif
}

static really_inline
u32 expand32(u32 x, u32 m) {
#if defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pdep_u32(x, m);
#else

    // Return zero quickly on trivial cases
    if (!x || !m) {
        return 0;
    }

    u32 m0, mk, mp, mv, t;
    u32 array[5];

    m0 = m; // save original mask
    mk = ~m << 1; // we will count 0's to right

    for (int i = 0; i < 5; i++) {
        mp = mk ^ (mk << 1); // parallel suffix
        mp = mp ^ (mp << 2);
        mp = mp ^ (mp << 4);
        mp = mp ^ (mp << 8);
        mp = mp ^ (mp << 16);
        mv = mp & m; // bits to move
        array[i] = mv;
        m = (m ^ mv) | (mv >> (1 << i)); // compress m
        mk = mk & ~mp;
    }

    for (int i = 4; i >= 0; i--) {
        mv = array[i];
        t = x << (1 << i);
        x = (x & ~mv) | (t & mv);
    }

    return x & m0; // clear out extraneous bits
#endif
}
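
/* expand32/expand64 perform the inverse of compress32/compress64: the
 * low-order bits of x are deposited, in order, into the positions selected by
 * the mask m (the same semantics as the BMI2 PDEP instruction). Illustrative
 * example:
 *
 *     expand32(0x0b, 0xf0) == 0xb0   // low nibble of x deposited at bits 4..7
 *
 * Compressing an expanded value with the same mask recovers the low
 * popcount(m) bits of x.
 */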

static really_inline
u64a expand64(u64a x, u64a m) {
#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pdep_u64(x, m);
#else

    // Return zero quickly on trivial cases
    if (!x || !m) {
        return 0;
    }

    u64a m0, mk, mp, mv, t;
    u64a array[6];

    m0 = m; // save original mask
    mk = ~m << 1; // we will count 0's to right

    for (int i = 0; i < 6; i++) {
        mp = mk ^ (mk << 1); // parallel suffix
        mp = mp ^ (mp << 2);
        mp = mp ^ (mp << 4);
        mp = mp ^ (mp << 8);
        mp = mp ^ (mp << 16);
        mp = mp ^ (mp << 32);
        mv = mp & m; // bits to move
        array[i] = mv;
        m = (m ^ mv) | (mv >> (1 << i)); // compress m
        mk = mk & ~mp;
    }

    for (int i = 5; i >= 0; i--) {
        mv = array[i];
        t = x << (1 << i);
        x = (x & ~mv) | (t & mv);
    }

    return x & m0; // clear out extraneous bits
#endif
}

/* Returns the index of the first set bit strictly after position begin, or
 * ~0U if no such bit is set. Pass begin == ~0U to start from bit 0.
 */
static really_inline
u32 bf64_iterate(u64a bitfield, u32 begin) {
    if (begin != ~0U) {
        /* switch off all bits at or below begin. Note: not legal to shift by
         * the size of the datatype or larger. */
        assert(begin <= 63);
        bitfield &= ~((2ULL << begin) - 1);
    }

    if (!bitfield) {
        return ~0U;
    }

    return ctz64(bitfield);
}
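
/* Illustrative usage (sketch): walking every set bit of a bitfield in order:
 *
 *     u64a bits = 0x5; // bits 0 and 2 set
 *     for (u32 i = bf64_iterate(bits, ~0U); i != ~0U; i = bf64_iterate(bits, i)) {
 *         // visits i == 0, then i == 2
 *     }
 */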

static really_inline
char bf64_set(u64a *bitfield, u32 i) {
    assert(i < 64);
    u64a mask = 1ULL << i;
    char was_set = !!(*bitfield & mask);
    *bitfield |= mask;

    return was_set;
}

static really_inline
void bf64_unset(u64a *bitfield, u32 i) {
    assert(i < 64);
    *bitfield &= ~(1ULL << i);
}
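
/* bf64_set sets bit i and reports whether it was already set; bf64_unset
 * clears bit i. Illustrative example:
 *
 *     u64a bf = 0;
 *     bf64_set(&bf, 3);   // returns 0, bf == 0x8
 *     bf64_set(&bf, 3);   // returns 1, bf unchanged
 *     bf64_unset(&bf, 3); // bf == 0
 */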

static really_inline
u32 rank_in_mask32(u32 mask, u32 bit) {
    assert(bit < sizeof(u32) * 8);
    assert(mask & (u32)(1U << bit));
    mask &= (u32)(1U << bit) - 1;
    return popcount32(mask);
}

static really_inline
u32 rank_in_mask64(u64a mask, u32 bit) {
    assert(bit < sizeof(u64a) * 8);
    assert(mask & (u64a)(1ULL << bit));
    mask &= (u64a)(1ULL << bit) - 1;
    return popcount64(mask);
}
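
/* rank_in_mask32/64 return the zero-based rank of a set bit within the mask,
 * i.e. the number of set bits strictly below it. Illustrative example:
 *
 *     rank_in_mask32(0x16, 4) == 2   // 0x16 == 0b10110; bits 1 and 2 lie below bit 4
 */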

static really_inline
u32 pext32(u32 x, u32 mask) {
#if defined(HAVE_BMI2)
    // Intel BMI2 can do this operation in one instruction.
    return _pext_u32(x, mask);
#else

    u32 result = 0, num = 1;
    while (mask != 0) {
        u32 bit = findAndClearLSB_32(&mask);
        if (x & (1U << bit)) {
            assert(num != 0); // more than 32 bits!
            result |= num;
        }
        num <<= 1;
    }
    return result;
#endif
}

static really_inline
u64a pext64(u64a x, u64a mask) {
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
    // Intel BMI2 can do this operation in one instruction.
    return _pext_u64(x, mask);
#else

    u64a result = 0, num = 1;
    while (mask != 0) {
        u32 bit = findAndClearLSB_64(&mask);
        if (x & (1ULL << bit)) {
            assert(num != 0); // more than 64 bits!
            result |= num;
        }
        num <<= 1;
    }
    return result;
#endif
}
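
/* pext32/pext64 compute the same result as compress32/compress64 above (pack
 * the mask-selected bits of x into the low-order end), using a simpler
 * bit-at-a-time fallback when BMI2 is unavailable. Illustrative example:
 *
 *     pext64(0xff00, 0x0f00) == 0xf   // bits 8..11 of x packed into bits 0..3
 */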

#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
static really_inline
u64a pdep64(u64a x, u64a mask) {
    return _pdep_u64(x, mask);
}
#endif

#endif // BITUTILS_H