// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "platform.h"

#if defined(__WIN32__)
#include <intrin.h>
#endif

#if defined(__ARM_NEON)
#include "../simd/arm/emulation.h"
#else
#include <immintrin.h>
#if defined(__EMSCRIPTEN__)
#include "../simd/wasm/emulation.h"
#endif
#endif

#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
#if !defined(_tzcnt_u32)
#define _tzcnt_u32 __tzcnt_u32
#endif
#if !defined(_tzcnt_u64)
#define _tzcnt_u64 __tzcnt_u64
#endif
#endif

#if defined(__aarch64__)
#if !defined(_lzcnt_u32)
#define _lzcnt_u32 __builtin_clz
#endif
#else
#if defined(__LZCNT__)
#if !defined(_lzcnt_u32)
#define _lzcnt_u32 __lzcnt32
#endif
#if !defined(_lzcnt_u64)
#define _lzcnt_u64 __lzcnt64
#endif
#endif
#endif

#if defined(__WIN32__)
# if !defined(NOMINMAX)
#  define NOMINMAX
# endif
# include <windows.h>
#endif

/* normally defined in pmmintrin.h, but we always need this */
#if !defined(_MM_SET_DENORMALS_ZERO_MODE)
#define _MM_DENORMALS_ZERO_ON   (0x0040)
#define _MM_DENORMALS_ZERO_OFF  (0x0000)
#define _MM_DENORMALS_ZERO_MASK (0x0040)
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
#endif

namespace embree
{

////////////////////////////////////////////////////////////////////////////////
/// Windows Platform
////////////////////////////////////////////////////////////////////////////////

#if defined(__WIN32__)

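  /* note: returns the Windows performance counter, not the raw time stamp counter */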
  __forceinline size_t read_tsc()
  {
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);
    return (size_t)li.QuadPart;
  }

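  /* bit scan forward: index of the lowest set bit (zero input is undefined on the BSF path, returns 32/64 with TZCNT) */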
  __forceinline int bsf(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

  __forceinline unsigned bsf(unsigned v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__)
    return _tzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanForward64(&r,v); return r;
#endif
  }
#endif

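  /* bit scan forward and clear: returns the index of the lowest set bit and clears that bit in v */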
  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline unsigned bscf(unsigned& v)
  {
    unsigned i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__X86_64__)
  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }
#endif

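  /* bit scan reverse: index of the highest set bit (undefined for zero input) */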
  __forceinline int bsr(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__)
    return 63 - _lzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanReverse64(&r, v); return r;
#endif
  }
#endif

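  /* counts leading zero bits; returns 32 for a zero input */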
  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }

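  /* bit test and complement/set/reset: return v with bit i toggled, set, or cleared */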
  __forceinline int btc(int v, int i) {
    long r = v; _bittestandcomplement(&r,i); return r;
  }

  __forceinline int bts(int v, int i) {
    long r = v; _bittestandset(&r,i); return r;
  }

  __forceinline int btr(int v, int i) {
    long r = v; _bittestandreset(&r,i); return r;
  }

#if defined(__X86_64__)

  __forceinline size_t btc(size_t v, size_t i) {
    __int64 r = v; _bittestandcomplement64(&r,i); return r;
  }

  __forceinline size_t bts(size_t v, size_t i) {
    __int64 r = v; _bittestandset64(&r,i); return r;
  }

  __forceinline size_t btr(size_t v, size_t i) {
    __int64 r = v; _bittestandreset64(&r,i); return r;
  }

#endif

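  /* atomically replaces *p with v if it currently equals c; returns the previous value */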
  __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) {
    return _InterlockedCompareExchange((volatile long*)p,v,c);
  }

////////////////////////////////////////////////////////////////////////////////
/// Unix Platform
////////////////////////////////////////////////////////////////////////////////

#else

#if defined(__i386__) && defined(__PIC__)

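  /* in i386 PIC code ebx holds the GOT pointer and may not be clobbered, so it is swapped out around cpuid */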
  __forceinline void __cpuid(int out[4], int op)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
                  : "0" (op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
                  : "0" (op1), "2" (op2));
  }

#elif defined(__X86_ASM__)

  __forceinline void __cpuid(int out[4], int op) {
#if defined(__ARM_NEON)
    if (op == 0) { // get CPU name: 0x41524d20 packs the characters 'A','R','M',' '
      out[0] = 0x41524d20;
      out[1] = 0x41524d20;
      out[2] = 0x41524d20;
      out[3] = 0x41524d20;
    } else {
      out[0] = out[1] = out[2] = out[3] = 0; // other leaves are not emulated; return zeros instead of leaving out uninitialized
    }
#else
    asm volatile ("cpuid" : "=a" (out[0]), "=b" (out[1]), "=c" (out[2]), "=d" (out[3]) : "a" (op));
#endif
  }

#if !defined(__ARM_NEON)
  __forceinline void __cpuid_count(int out[4], int op1, int op2) {
    asm volatile ("cpuid" : "=a" (out[0]), "=b" (out[1]), "=c" (out[2]), "=d" (out[3]) : "a" (op1), "c" (op2));
  }
#endif

#endif

  __forceinline uint64_t read_tsc() {
#if defined(__X86_ASM__)
    uint32_t high,low;
    asm volatile ("rdtsc" : "=d" (high), "=a" (low));
    return (((uint64_t)high) << 32) + (uint64_t)low;
#else
    /* Not supported yet, meaning measuring traversal cost per pixel does not work. */
    return 0;
#endif
  }

  __forceinline int bsf(int v) {
#if defined(__ARM_NEON)
    return __builtin_ctz(v);
#else
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsf %1,%0" : "=r" (r) : "r" (v)); return r;
#else
    return __builtin_ctz(v);
#endif
#endif
  }

#if defined(__64BIT__)
  __forceinline unsigned bsf(unsigned v)
  {
#if defined(__ARM_NEON)
    return __builtin_ctz(v);
#else
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsf %1,%0" : "=r" (r) : "r" (v)); return r;
#else
    return __builtin_ctz(v);
#endif
#endif
  }
#endif

  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__X86_64__)
    return _tzcnt_u64(v);
#else
    return _tzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsf %1,%0" : "=r" (r) : "r" (v)); return r;
#else
    return __builtin_ctzl(v);
#endif
  }

  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__64BIT__)
  __forceinline unsigned int bscf(unsigned int& v)
  {
    unsigned int i = bsf(v);
    v &= v-1;
    return i;
  }
#endif

  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline int bsr(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsr %1,%0" : "=r" (r) : "r" (v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }

#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsr %1,%0" : "=r" (r) : "r" (v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }
#endif

  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__X86_64__)
    return 63 - _lzcnt_u64(v);
#else
    return 31 - _lzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsr %1,%0" : "=r" (r) : "r" (v)); return r;
#else
    return (sizeof(v) * 8 - 1) - __builtin_clzl(v);
#endif
  }

  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }

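  /* clears the lowest set bit of v */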
  __forceinline size_t blsr(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__INTEL_COMPILER)
    return _blsr_u64(v);
#else
#if defined(__X86_64__)
    return __blsr_u64(v);
#else
    return __blsr_u32(v);
#endif
#endif
#else
    return v & (v-1);
#endif
  }

  __forceinline int btc(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btc %1,%0" : "=r" (r) : "r" (i), "0" (v) : "flags" ); return r;
#else
    return (v ^ (1 << i));
#endif
  }

  __forceinline int bts(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("bts %1,%0" : "=r" (r) : "r" (i), "0" (v) : "flags" ); return r;
#else
    return (v | (1 << i));
#endif
  }

  __forceinline int btr(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btr %1,%0" : "=r" (r) : "r" (i), "0" (v) : "flags" ); return r;
#else
    return (v & ~(1 << i));
#endif
  }

  __forceinline size_t btc(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btc %1,%0" : "=r" (r) : "r" (i), "0" (v) : "flags" ); return r;
#else
    return (v ^ (size_t(1) << i)); // shift a size_t so that bit indices >= 32 work on 64-bit targets
#endif
  }

  __forceinline size_t bts(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("bts %1,%0" : "=r" (r) : "r" (i), "0" (v) : "flags" ); return r;
#else
    return (v | (size_t(1) << i));
#endif
  }

  __forceinline size_t btr(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btr %1,%0" : "=r" (r) : "r" (i), "0" (v) : "flags" ); return r;
#else
    return (v & ~(size_t(1) << i));
#endif
  }

  __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) {
    return __sync_val_compare_and_swap(value, comparand, input);
  }

#endif

////////////////////////////////////////////////////////////////////////////////
/// All Platforms
////////////////////////////////////////////////////////////////////////////////

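/* some compiler versions lack the _mm*_undefined intrinsics; fall back to zero-initialized registers */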
#if defined(__clang__) || defined(__GNUC__)
#if !defined(_mm_undefined_ps)
  __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); }
#endif
#if !defined(_mm_undefined_si128)
  __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); }
#endif
#if !defined(_mm256_undefined_ps) && defined(__AVX__)
  __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); }
#endif
#if !defined(_mm256_undefined_si256) && defined(__AVX__)
  __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); }
#endif
#if !defined(_mm512_undefined_ps) && defined(__AVX512F__)
  __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); }
#endif
#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__)
  __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); }
#endif
#endif

#if defined(__SSE4_2__) || defined(__ARM_NEON)

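  /* population count: number of set bits in the argument */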
  __forceinline int popcnt(int in) {
    return _mm_popcnt_u32(in);
  }

  __forceinline unsigned popcnt(unsigned in) {
    return _mm_popcnt_u32(in);
  }

#if defined(__64BIT__)
  __forceinline size_t popcnt(size_t in) {
    return _mm_popcnt_u64(in);
  }
#endif

#endif

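  /* serialized time stamp read: the surrounding cpuid calls act as barriers against out-of-order execution */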
#if defined(__X86_ASM__)
  __forceinline uint64_t rdtsc()
  {
    int dummy[4];
    __cpuid(dummy,0);
    uint64_t clock = read_tsc();
    __cpuid(dummy,0);
    return clock;
  }
#endif

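  /* issues a burst of PAUSE instructions to back off inside spin-wait loops */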
  __forceinline void pause_cpu(const size_t N = 8)
  {
    for (size_t i=0; i<N; i++)
      _mm_pause();
  }

  /* prefetches */
  __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); }
  __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); }
  __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); }
  __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); }
  __forceinline void prefetchEX (const void* ptr) {
#if defined(__INTEL_COMPILER)
    _mm_prefetch((const char*)ptr,_MM_HINT_ET0);
#else
    _mm_prefetch((const char*)ptr,_MM_HINT_T0);
#endif
  }

  __forceinline void prefetchL1EX(const void* ptr) {
    prefetchEX(ptr);
  }

  __forceinline void prefetchL2EX(const void* ptr) {
    prefetchEX(ptr);
  }
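
/* parallel bit extract/deposit (BMI2): pext gathers the bits of a selected by mask b, pdep scatters them */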
#if defined(__AVX2__) && !defined(__aarch64__)
  __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); }
  __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); }
#if defined(__X86_64__)
  __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); }
  __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); }
#endif
#endif

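/* portable wrappers around AVX-512 helpers whose names differ between compilers */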
#if defined(__AVX512F__)
#if defined(__INTEL_COMPILER)
  __forceinline float mm512_cvtss_f32(__m512 v) {
    return _mm512_cvtss_f32(v);
  }
  __forceinline int mm512_mask2int(__mmask16 k1) {
    return _mm512_mask2int(k1);
  }
  __forceinline __mmask16 mm512_int2mask(int mask) {
    return _mm512_int2mask(mask);
  }
#else
  __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3
    return _mm_cvtss_f32(_mm512_castps512_ps128(v));
  }
  __forceinline int mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3
    return (int)k1;
  }
  __forceinline __mmask16 mm512_int2mask(int mask) { // FIXME: _mm512_int2mask not yet supported by GCC 6.3
    return (__mmask16)mask;
  }
#endif
#endif
}