/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Bit-twiddling primitives (ctz, compress, etc.)
 */

#ifndef BITUTILS_H
#define BITUTILS_H

#include "ue2common.h"
#include "popcount.h"
#include "util/arch.h"
#include "util/intrinsics.h"

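/* CASE_BIT is the ASCII case bit (bit 5): OR-ing it into an ASCII letter
 * yields the lowercase form, while AND-ing with CASE_CLEAR yields the
 * uppercase form, e.g. ('a' & CASE_CLEAR) == 'A' and ('A' | CASE_BIT) == 'a'.
 * DOUBLE_CASE_CLEAR and OCTO_CASE_CLEAR apply the same fold to two or eight
 * bytes at a time. */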
#define CASE_BIT 0x20
#define CASE_CLEAR 0xdf
#define DOUBLE_CASE_CLEAR 0xdfdf
#define OCTO_CASE_CLEAR 0xdfdfdfdfdfdfdfdfULL

static really_inline
u32 clz32(u32 x) {
    assert(x); // behaviour not defined for x == 0
#if defined(_WIN32)
    unsigned long r;
    _BitScanReverse(&r, x);
    return 31 - r;
#else
    return (u32)__builtin_clz(x);
#endif
}

static really_inline
u32 clz64(u64a x) {
    assert(x); // behaviour not defined for x == 0
#if defined(_WIN64)
    unsigned long r;
    _BitScanReverse64(&r, x);
    return 63 - r;
#elif defined(_WIN32)
    unsigned long x1 = (u32)x;
    unsigned long x2 = (u32)(x >> 32);
    unsigned long r;
    if (x2) {
        _BitScanReverse(&r, x2);
        return (u32)(31 - r);
    }
    _BitScanReverse(&r, (u32)x1);
    return (u32)(63 - r);
#else
    return (u32)__builtin_clzll(x);
#endif
}

// CTZ (count trailing zero) implementations.
static really_inline
u32 ctz32(u32 x) {
    assert(x); // behaviour not defined for x == 0
#if defined(_WIN32)
    unsigned long r;
    _BitScanForward(&r, x);
    return r;
#else
    return (u32)__builtin_ctz(x);
#endif
}

static really_inline
u32 ctz64(u64a x) {
    assert(x); // behaviour not defined for x == 0
#if defined(_WIN64)
    unsigned long r;
    _BitScanForward64(&r, x);
    return r;
#elif defined(_WIN32)
    unsigned long r;
    if (_BitScanForward(&r, (u32)x)) {
        return (u32)r;
    }
    _BitScanForward(&r, (u32)(x >> 32));
    return (u32)(r + 32);
#else
    return (u32)__builtin_ctzll(x);
#endif
}

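/* Floor of log2: lg2(x) returns the index of the most significant set bit of
 * x. Note that lg2(0) is defined here to return 0, the same result as
 * lg2(1). */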
static really_inline
u32 lg2(u32 x) {
    if (!x) {
        return 0;
    }
    return 31 - clz32(x);
}

static really_inline
u64a lg2_64(u64a x) {
    if (!x) {
        return 0;
    }
    return 63 - clz64(x);
}

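/* The findAndClear{LSB,MSB}_{32,64} helpers return the index of the
 * least/most significant set bit of *v and clear that bit in *v. Behaviour is
 * undefined for *v == 0. For example, if v == 0x12, findAndClearLSB_32(&v)
 * returns 1 and leaves v == 0x10. */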
static really_inline
u32 findAndClearLSB_32(u32 *v) {
    assert(*v != 0); // behaviour not defined in this case
#ifndef NO_ASM
    u32 val = *v, offset;
    __asm__ ("bsf %1, %0\n"
             "btr %0, %1\n"
             : "=r" (offset), "=r" (val)
             : "1" (val));
    *v = val;
#else
    u32 val = *v;
    u32 offset = ctz32(val);
    *v = val & (val - 1);
#endif

    assert(offset < 32);
    return offset;
}

static really_inline
u32 findAndClearLSB_64(u64a *v) {
    assert(*v != 0); // behaviour not defined in this case

#ifdef ARCH_64_BIT
#if defined(ARCH_X86_64) && !defined(NO_ASM)
    u64a val = *v, offset;
    __asm__ ("bsfq %1, %0\n"
             "btrq %0, %1\n"
             : "=r" (offset), "=r" (val)
             : "1" (val));
    *v = val;
#else
    // generic variant using gcc's builtin on 64-bit
    u64a val = *v, offset;
    offset = ctz64(val);
    *v = val & (val - 1);
#endif // ARCH_X86_64
#else
    // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
    // inline calls to __builtin_ctzll
    u32 v1 = (u32)*v;
    u32 v2 = (u32)(*v >> 32);
    u32 offset;
    if (v1) {
        offset = findAndClearLSB_32(&v1);
        *v = (u64a)v1 | ((u64a)v2 << 32);
    } else {
        offset = findAndClearLSB_32(&v2) + 32;
        *v = (u64a)v2 << 32;
    }
#endif

    assert(offset < 64);
    return (u32)offset;
}

static really_inline
u32 findAndClearMSB_32(u32 *v) {
    assert(*v != 0); // behaviour not defined in this case
#ifndef NO_ASM
    u32 val = *v, offset;
    __asm__ ("bsr %1, %0\n"
             "btr %0, %1\n"
             : "=r" (offset), "=r" (val)
             : "1" (val));
    *v = val;
#else
    u32 val = *v;
    u32 offset = 31 - clz32(val);
    *v = val & ~(1U << offset);
#endif
    assert(offset < 32);
    return offset;
}

static really_inline
u32 findAndClearMSB_64(u64a *v) {
    assert(*v != 0); // behaviour not defined in this case

#ifdef ARCH_64_BIT
#if defined(ARCH_X86_64) && !defined(NO_ASM)
    u64a val = *v, offset;
    __asm__ ("bsrq %1, %0\n"
             "btrq %0, %1\n"
             : "=r" (offset), "=r" (val)
             : "1" (val));
    *v = val;
#else
    // generic variant using gcc's builtin on 64-bit
    u64a val = *v, offset;
    offset = 63 - clz64(val);
    *v = val & ~(1ULL << offset);
#endif // ARCH_X86_64
#else
    // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
    // inline calls to __builtin_clzll
    u32 v1 = (u32)*v;
    u32 v2 = (u32)(*v >> 32);
    u32 offset;
    if (v2) {
        offset = findAndClearMSB_32(&v2) + 32;
        *v = ((u64a)v2 << 32) | (u64a)v1;
    } else {
        offset = findAndClearMSB_32(&v1);
        *v = (u64a)v1;
    }
#endif

    assert(offset < 64);
    return (u32)offset;
}

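/* compress32/compress64 gather the bits of x selected by the mask m into the
 * contiguous low-order bits of the result (the PEXT operation), e.g.
 * compress32(0xabcd, 0x00f0) == 0xc. Without BMI2, the fallback is the
 * branch-free parallel-suffix compress technique (as described in Hacker's
 * Delight). */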
static really_inline
u32 compress32(u32 x, u32 m) {
#if defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pext_u32(x, m);
#else

    // Return zero quickly on trivial cases
    if ((x & m) == 0) {
        return 0;
    }

    u32 mk, mp, mv, t;

    x &= m; // clear irrelevant bits

    mk = ~m << 1; // we will count 0's to right
    for (u32 i = 0; i < 5; i++) {
        mp = mk ^ (mk << 1);
        mp ^= mp << 2;
        mp ^= mp << 4;
        mp ^= mp << 8;
        mp ^= mp << 16;

        mv = mp & m; // bits to move
        m = (m ^ mv) | (mv >> (1 << i)); // compress m
        t = x & mv;
        x = (x ^ t) | (t >> (1 << i)); // compress x
        mk = mk & ~mp;
    }

    return x;
#endif
}

static really_inline
u64a compress64(u64a x, u64a m) {
#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pext_u64(x, m);
#else

    // Return zero quickly on trivial cases
    if ((x & m) == 0) {
        return 0;
    }

    u64a mk, mp, mv, t;

    x &= m; // clear irrelevant bits

    mk = ~m << 1; // we will count 0's to right
    for (u32 i = 0; i < 6; i++) {
        mp = mk ^ (mk << 1);
        mp ^= mp << 2;
        mp ^= mp << 4;
        mp ^= mp << 8;
        mp ^= mp << 16;
        mp ^= mp << 32;

        mv = mp & m; // bits to move
        m = (m ^ mv) | (mv >> (1 << i)); // compress m
        t = x & mv;
        x = (x ^ t) | (t >> (1 << i)); // compress x
        mk = mk & ~mp;
    }

    return x;
#endif
}

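/* expand32/expand64 perform the inverse operation: the contiguous low-order
 * bits of x are scattered to the bit positions selected by the mask m (the
 * PDEP operation), e.g. expand32(0xc, 0x00f0) == 0xc0. */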
static really_inline
u32 expand32(u32 x, u32 m) {
#if defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pdep_u32(x, m);
#else

    // Return zero quickly on trivial cases
    if (!x || !m) {
        return 0;
    }

    u32 m0, mk, mp, mv, t;
    u32 array[5];

    m0 = m; // save original mask
    mk = ~m << 1; // we will count 0's to right

    for (int i = 0; i < 5; i++) {
        mp = mk ^ (mk << 1); // parallel suffix
        mp = mp ^ (mp << 2);
        mp = mp ^ (mp << 4);
        mp = mp ^ (mp << 8);
        mp = mp ^ (mp << 16);
        mv = mp & m; // bits to move
        array[i] = mv;
        m = (m ^ mv) | (mv >> (1 << i)); // compress m
        mk = mk & ~mp;
    }

    for (int i = 4; i >= 0; i--) {
        mv = array[i];
        t = x << (1 << i);
        x = (x & ~mv) | (t & mv);
    }

    return x & m0; // clear out extraneous bits
#endif
}

static really_inline
u64a expand64(u64a x, u64a m) {
#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pdep_u64(x, m);
#else

    // Return zero quickly on trivial cases
    if (!x || !m) {
        return 0;
    }

    u64a m0, mk, mp, mv, t;
    u64a array[6];

    m0 = m; // save original mask
    mk = ~m << 1; // we will count 0's to right

    for (int i = 0; i < 6; i++) {
        mp = mk ^ (mk << 1); // parallel suffix
        mp = mp ^ (mp << 2);
        mp = mp ^ (mp << 4);
        mp = mp ^ (mp << 8);
        mp = mp ^ (mp << 16);
        mp = mp ^ (mp << 32);
        mv = mp & m; // bits to move
        array[i] = mv;
        m = (m ^ mv) | (mv >> (1 << i)); // compress m
        mk = mk & ~mp;
    }

    for (int i = 5; i >= 0; i--) {
        mv = array[i];
        t = x << (1 << i);
        x = (x & ~mv) | (t & mv);
    }

    return x & m0; // clear out extraneous bits
#endif
}

/* Returns the index of the first set bit strictly after begin, or ~0U if no
 * bit is set after begin. Pass begin == ~0U to start the search from bit 0.
 */
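/* A typical pattern for visiting every set bit of a bitfield (illustrative
 * usage only):
 *
 *     for (u32 i = bf64_iterate(bf, ~0U); i != ~0U; i = bf64_iterate(bf, i)) {
 *         // process bit i
 *     }
 */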
static really_inline
u32 bf64_iterate(u64a bitfield, u32 begin) {
    if (begin != ~0U) {
        /* switch off all bits at or below begin. Note: not legal to shift by
         * the size of the datatype or larger. */
        assert(begin <= 63);
        bitfield &= ~((2ULL << begin) - 1);
    }

    if (!bitfield) {
        return ~0U;
    }

    return ctz64(bitfield);
}

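/* Sets bit i in the bitfield; returns non-zero if the bit was already set
 * before the call. */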
static really_inline
char bf64_set(u64a *bitfield, u32 i) {
    assert(i < 64);
    u64a mask = 1ULL << i;
    char was_set = !!(*bitfield & mask);
    *bitfield |= mask;

    return was_set;
}

static really_inline
void bf64_unset(u64a *bitfield, u32 i) {
    assert(i < 64);
    *bitfield &= ~(1ULL << i);
}

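/* rank_in_mask32/64 return the rank of a set bit within the mask: the number
 * of set bits in the mask strictly below position bit. The bit itself must be
 * set in the mask. For example, rank_in_mask32(0x16, 4) == 2, as bits 1 and 2
 * are the set bits below bit 4. */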
static really_inline
u32 rank_in_mask32(u32 mask, u32 bit) {
    assert(bit < sizeof(u32) * 8);
    assert(mask & (u32)(1U << bit));
    mask &= (u32)(1U << bit) - 1;
    return popcount32(mask);
}

static really_inline
u32 rank_in_mask64(u64a mask, u32 bit) {
    assert(bit < sizeof(u64a) * 8);
    assert(mask & (u64a)(1ULL << bit));
    mask &= (u64a)(1ULL << bit) - 1;
    return popcount64(mask);
}

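/* pext32/pext64 compute the same parallel-bits-extract operation as
 * compress32/compress64 above; the non-BMI2 fallback walks the mask one set
 * bit at a time rather than using the parallel-suffix loop. */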
static really_inline
u32 pext32(u32 x, u32 mask) {
#if defined(HAVE_BMI2)
    // Intel BMI2 can do this operation in one instruction.
    return _pext_u32(x, mask);
#else

    u32 result = 0, num = 1;
    while (mask != 0) {
        u32 bit = findAndClearLSB_32(&mask);
        if (x & (1U << bit)) {
            assert(num != 0); // more than 32 bits!
            result |= num;
        }
        num <<= 1;
    }
    return result;
#endif
}

static really_inline
u64a pext64(u64a x, u64a mask) {
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
    // Intel BMI2 can do this operation in one instruction.
    return _pext_u64(x, mask);
#else

    u64a result = 0, num = 1;
    while (mask != 0) {
        u32 bit = findAndClearLSB_64(&mask);
        if (x & (1ULL << bit)) {
            assert(num != 0); // more than 64 bits!
            result |= num;
        }
        num <<= 1;
    }
    return result;
#endif
}

#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
static really_inline
u64a pdep64(u64a x, u64a mask) {
    return _pdep_u64(x, mask);
}
#endif

#endif // BITUTILS_H