// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "platform.h"

#if defined(__WIN32__)
#include <intrin.h>
#endif

#if defined(__ARM_NEON)
#include "../simd/arm/emulation.h"
#else
#include <immintrin.h>
#if defined(__EMSCRIPTEN__)
#include "../simd/wasm/emulation.h"
#endif
#endif

#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
  #if !defined(_tzcnt_u32)
    #define _tzcnt_u32 __tzcnt_u32
  #endif
  #if !defined(_tzcnt_u64)
    #define _tzcnt_u64 __tzcnt_u64
  #endif
#endif

#if defined(__aarch64__)
  #if !defined(_lzcnt_u32)
    #define _lzcnt_u32 __builtin_clz
  #endif
#else
  #if defined(__LZCNT__)
    #if !defined(_lzcnt_u32)
      #define _lzcnt_u32 __lzcnt32
    #endif
    #if !defined(_lzcnt_u64)
      #define _lzcnt_u64 __lzcnt64
    #endif
  #endif
#endif

#if defined(__WIN32__)
#  if !defined(NOMINMAX)
#    define NOMINMAX
#  endif
#  include <windows.h>
#endif

/* normally defined in pmmintrin.h, but we always need this */
#if !defined(_MM_SET_DENORMALS_ZERO_MODE)
#define _MM_DENORMALS_ZERO_ON   (0x0040)
#define _MM_DENORMALS_ZERO_OFF  (0x0000)
#define _MM_DENORMALS_ZERO_MASK (0x0040)
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
#endif
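
/* Illustrative usage: enabling flush-to-zero together with denormals-are-zero
 * before rendering avoids denormal-float slowdowns:
 *   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
 *   _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
 */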

namespace embree
{

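/* Overview of the scalar bit-manipulation helpers defined below (identical
 * semantics on all platforms, implementation selected per compiler/ISA):
 *   bsf/bsr      - index of the lowest/highest set bit
 *   bscf         - bsf that also clears the found bit in its argument
 *   lzcnt        - number of leading zero bits
 *   btc/bts/btr  - return the value with bit i toggled/set/cleared
 *   blsr         - clear the lowest set bit
 *   popcnt       - number of set bits
 */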
////////////////////////////////////////////////////////////////////////////////
/// Windows Platform
////////////////////////////////////////////////////////////////////////////////

#if defined(__WIN32__)

  /* note: on Windows this returns the QueryPerformanceCounter value rather than the raw TSC */
  __forceinline size_t read_tsc()
  {
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);
    return (size_t)li.QuadPart;
  }

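  /* bsf: returns the index of the lowest set bit. The result for a zero input
   * is unspecified (the TZCNT and BitScanForward paths disagree). */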
  __forceinline int bsf(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

  __forceinline unsigned bsf(unsigned v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__)
    return _tzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanForward64(&r,v); return r;
#endif
  }
#endif

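  /* bscf ("bit scan and clear forward"): returns the index of the lowest set
   * bit and clears that bit in the argument. */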
  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline unsigned bscf(unsigned& v)
  {
    unsigned i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__X86_64__)
  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }
#endif

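  /* bsr: returns the index of the highest set bit. The result for a zero input
   * is unspecified. */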
  __forceinline int bsr(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__)
    return 63 - _lzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanReverse64(&r, v); return r;
#endif
  }
#endif

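  /* lzcnt: number of leading zero bits; returns 32 for a zero input. */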
  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }

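  /* btc/bts/btr: return the value with bit i toggled, set, or cleared,
   * respectively; the argument itself is not modified. */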
  __forceinline int btc(int v, int i) {
    long r = v; _bittestandcomplement(&r,i); return r;
  }

  __forceinline int bts(int v, int i) {
    long r = v; _bittestandset(&r,i); return r;
  }

  __forceinline int btr(int v, int i) {
    long r = v; _bittestandreset(&r,i); return r;
  }

#if defined(__X86_64__)

  __forceinline size_t btc(size_t v, size_t i) {
    size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
  }

  __forceinline size_t bts(size_t v, size_t i) {
    __int64 r = v; _bittestandset64(&r,i); return r;
  }

  __forceinline size_t btr(size_t v, size_t i) {
    __int64 r = v; _bittestandreset64(&r,i); return r;
  }

#endif

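  /* atomic_cmpxchg: atomically stores v into *p if *p equals c;
   * returns the previous value of *p. */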
  __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) {
    return _InterlockedCompareExchange((volatile long*)p,v,c);
  }

////////////////////////////////////////////////////////////////////////////////
/// Unix Platform
////////////////////////////////////////////////////////////////////////////////

#else

#if defined(__i386__) && defined(__PIC__)

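  /* In 32-bit PIC code %ebx holds the GOT pointer and must not be clobbered,
   * so it is saved and restored around CPUID via xchg. */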
  __forceinline void __cpuid(int out[4], int op)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                  : "0"(op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
                  : "0" (op1), "2" (op2));
  }

#elif defined(__X86_ASM__)

  __forceinline void __cpuid(int out[4], int op) {
#if defined(__ARM_NEON)
    if (op == 0) { // Get CPU name
      out[0] = 0x41524d20; // "ARM " in ASCII; other leaves are left untouched
      out[1] = 0x41524d20;
      out[2] = 0x41524d20;
      out[3] = 0x41524d20;
    }
#else
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
#endif
  }

#if !defined(__ARM_NEON)
  __forceinline void __cpuid_count(int out[4], int op1, int op2) {
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2));
  }
#endif

#endif

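  /* read_tsc: reads the CPU timestamp counter (RDTSC returns the 64-bit
   * counter split across EDX:EAX). */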
  __forceinline uint64_t read_tsc() {
#if defined(__X86_ASM__)
    uint32_t high,low;
    asm volatile ("rdtsc" : "=d"(high), "=a"(low));
    return (((uint64_t)high) << 32) + (uint64_t)low;
#else
    /* Not supported yet; this means measuring traversal cost per pixel does not work. */
    return 0;
#endif
  }

  __forceinline int bsf(int v) {
#if defined(__ARM_NEON)
    return __builtin_ctz(v);
#else
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctz(v);
#endif
#endif
  }

#if defined(__64BIT__)
  __forceinline unsigned bsf(unsigned v)
  {
#if defined(__ARM_NEON)
    return __builtin_ctz(v);
#else
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctz(v);
#endif
#endif
  }
#endif

  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__X86_64__)
    return _tzcnt_u64(v);
#else
    return _tzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctzl(v);
#endif
  }

  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__64BIT__)
  __forceinline unsigned int bscf(unsigned int& v)
  {
    unsigned int i = bsf(v);
    v &= v-1;
    return i;
  }
#endif

  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline int bsr(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }

#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }
#endif

  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__X86_64__)
    return 63 - _lzcnt_u64(v);
#else
    return 31 - _lzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return (sizeof(v) * 8 - 1) - __builtin_clzl(v);
#endif
  }

  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }

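  /* blsr: returns v with its lowest set bit cleared (equivalent to v & (v-1)). */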
  __forceinline size_t blsr(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
  #if defined(__INTEL_COMPILER)
    return _blsr_u64(v);
  #else
    #if defined(__X86_64__)
      return __blsr_u64(v);
    #else
      return __blsr_u32(v);
    #endif
  #endif
#else
    return v & (v-1);
#endif
  }

  __forceinline int btc(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
#else
    return (v ^ (1 << i));
#endif
  }

  __forceinline int bts(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v | (1 << i));
#endif
  }

  __forceinline int btr(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v & ~(1 << i));
#endif
  }

  __forceinline size_t btc(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
#else
    return (v ^ (size_t(1) << i)); // shift a size_t-wide one so bit indices >= 32 stay valid
#endif
  }

  __forceinline size_t bts(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v | (size_t(1) << i));
#endif
  }

  __forceinline size_t btr(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v & ~(size_t(1) << i));
#endif
  }

  __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) {
    return __sync_val_compare_and_swap(value, comparand, input);
  }

#endif

////////////////////////////////////////////////////////////////////////////////
/// All Platforms
////////////////////////////////////////////////////////////////////////////////

#if defined(__clang__) || defined(__GNUC__)
#if !defined(_mm_undefined_ps)
  __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); }
#endif
#if !defined(_mm_undefined_si128)
  __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); }
#endif
#if !defined(_mm256_undefined_ps) && defined(__AVX__)
  __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); }
#endif
#if !defined(_mm256_undefined_si256) && defined(__AVX__)
  __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); }
#endif
#if !defined(_mm512_undefined_ps) && defined(__AVX512F__)
  __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); }
#endif
#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__)
  __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); }
#endif
#endif

#if defined(__SSE4_2__) || defined(__ARM_NEON)

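  /* popcnt: number of set bits. */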
  __forceinline int popcnt(int in) {
    return _mm_popcnt_u32(in);
  }

  __forceinline unsigned popcnt(unsigned in) {
    return _mm_popcnt_u32(in);
  }

#if defined(__64BIT__)
  __forceinline size_t popcnt(size_t in) {
    return _mm_popcnt_u64(in);
  }
#endif

#endif

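  /* rdtsc: reads the timestamp counter with CPUID issued as a serializing
   * instruction before and after, so the measurement is not reordered with
   * surrounding code. */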
#if defined(__X86_ASM__)
  __forceinline uint64_t rdtsc()
  {
    int dummy[4];
    __cpuid(dummy,0);
    uint64_t clock = read_tsc();
    __cpuid(dummy,0);
    return clock;
  }
#endif

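  /* pause_cpu: issues the PAUSE hint N times; intended for spin-wait loops to
   * reduce power use and pipeline flush penalties. */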
  __forceinline void pause_cpu(const size_t N = 8)
  {
    for (size_t i=0; i<N; i++)
      _mm_pause();
  }

  /* prefetches */
  __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); }
  __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); }
  __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); }
  __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); }
  __forceinline void prefetchEX (const void* ptr) {
#if defined(__INTEL_COMPILER)
    _mm_prefetch((const char*)ptr,_MM_HINT_ET0);
#else
    _mm_prefetch((const char*)ptr,_MM_HINT_T0);
#endif
  }

  __forceinline void prefetchL1EX(const void* ptr) {
    prefetchEX(ptr);
  }

  __forceinline void prefetchL2EX(const void* ptr) {
    prefetchEX(ptr);
  }
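
  /* pext/pdep (BMI2): pext gathers the bits of a selected by mask b into the
   * low bits of the result; pdep scatters the low bits of a to the positions
   * set in b. E.g. pext(0b10110010, 0b11110000) == 0b1011. */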
#if defined(__AVX2__) && !defined(__aarch64__)
  __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); }
  __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); }
#if defined(__X86_64__)
  __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); }
  __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); }
#endif
#endif

#if defined(__AVX512F__)
#if defined(__INTEL_COMPILER)
  __forceinline float mm512_cvtss_f32(__m512 v) {
    return _mm512_cvtss_f32(v);
  }
  __forceinline int mm512_mask2int(__mmask16 k1) {
    return _mm512_mask2int(k1);
  }
  __forceinline __mmask16 mm512_int2mask(int mask) {
    return _mm512_int2mask(mask);
  }
#else
  __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3
    return _mm_cvtss_f32(_mm512_castps512_ps128(v));
  }
  __forceinline int mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3
    return (int)k1;
  }
  __forceinline __mmask16 mm512_int2mask(int mask) { // FIXME: _mm512_int2mask not yet supported by GCC 6.3
    return (__mmask16)mask;
  }
#endif
#endif
}