| 1 | // Copyright 2009-2021 Intel Corporation | 
|---|
| 2 | // SPDX-License-Identifier: Apache-2.0 | 
|---|
| 3 |  | 
|---|
| 4 | #pragma once | 
|---|
| 5 |  | 
|---|
| 6 | #include "../sys/platform.h" | 
|---|
| 7 | #include "../sys/intrinsics.h" | 
|---|
| 8 | #include "constants.h" | 
|---|
| 9 | #include <cmath> | 
|---|
| 10 |  | 
|---|
| 11 | #if defined(__ARM_NEON) | 
|---|
| 12 | #include "../simd/arm/emulation.h" | 
|---|
| 13 | #else | 
|---|
| 14 | #include <emmintrin.h> | 
|---|
| 15 | #include <xmmintrin.h> | 
|---|
| 16 | #include <immintrin.h> | 
|---|
| 17 | #endif | 
|---|
| 18 |  | 
|---|
| 19 | #if defined(__WIN32__) | 
|---|
| 20 | #if defined(_MSC_VER) && (_MSC_VER <= 1700) | 
|---|
| 21 | namespace std | 
|---|
| 22 | { | 
|---|
| 23 | __forceinline bool isinf ( const float x ) { return _finite(x) == 0; } | 
|---|
| 24 | __forceinline bool isnan ( const float x ) { return _isnan(x) != 0; } | 
|---|
| 25 | __forceinline bool isfinite (const float x) { return _finite(x) != 0; } | 
|---|
| 26 | } | 
|---|
| 27 | #endif | 
|---|
| 28 | #endif | 
|---|
| 29 |  | 
|---|
| 30 | namespace embree | 
|---|
| 31 | { | 
|---|
| 32 | __forceinline bool isvalid ( const float& v ) { | 
|---|
| 33 | return (v > -FLT_LARGE) & (v < +FLT_LARGE); | 
|---|
| 34 | } | 
|---|
| 35 |  | 
|---|
| 36 | __forceinline int cast_f2i(float f) { | 
|---|
| 37 | union { float f; int i; } v; v.f = f; return v.i; | 
|---|
| 38 | } | 
|---|
| 39 |  | 
|---|
| 40 | __forceinline float cast_i2f(int i) { | 
|---|
| 41 | union { float f; int i; } v; v.i = i; return v.f; | 
|---|
| 42 | } | 
|---|
| 43 |  | 
|---|
| 44 | __forceinline int   toInt  (const float& a) { return int(a); } | 
|---|
| 45 | __forceinline float toFloat(const int&   a) { return float(a); } | 
|---|
| 46 |  | 
|---|
| 47 | #if defined(__WIN32__) | 
|---|
| 48 | __forceinline bool finite ( const float x ) { return _finite(x) != 0; } | 
|---|
| 49 | #endif | 
|---|
| 50 |  | 
|---|
| 51 | __forceinline float sign ( const float x ) { return x<0?-1.0f:1.0f; } | 
|---|
| 52 | __forceinline float sqr  ( const float x ) { return x*x; } | 
|---|
| 53 |  | 
|---|
| 54 | __forceinline float rcp  ( const float x ) | 
|---|
| 55 | { | 
|---|
| 56 | #if defined(__aarch64__) | 
|---|
| 57 | // Move scalar to vector register and do rcp. | 
|---|
| 58 | __m128 a; | 
|---|
| 59 | a[0] = x; | 
|---|
| 60 | float32x4_t reciprocal = vrecpeq_f32(a); | 
|---|
| 61 | reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); | 
|---|
| 62 | reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); | 
|---|
| 63 | return reciprocal[0]; | 
|---|
| 64 | #else | 
|---|
| 65 |  | 
|---|
| 66 | const __m128 a = _mm_set_ss(x); | 
|---|
| 67 |  | 
|---|
| 68 | #if defined(__AVX512VL__) | 
|---|
| 69 | const __m128 r = _mm_rcp14_ss(_mm_set_ss(0.0f),a); | 
|---|
| 70 | #else | 
|---|
| 71 | const __m128 r = _mm_rcp_ss(a); | 
|---|
| 72 | #endif | 
|---|
| 73 |  | 
|---|
| 74 | #if defined(__AVX2__) | 
|---|
| 75 | return _mm_cvtss_f32(_mm_mul_ss(r,_mm_fnmadd_ss(r, a, _mm_set_ss(2.0f)))); | 
|---|
| 76 | #else | 
|---|
| 77 | return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a)))); | 
|---|
| 78 | #endif | 
|---|
| 79 |  | 
|---|
| 80 | #endif  //defined(__aarch64__) | 
|---|
| 81 | } | 
|---|
| 82 |  | 
|---|
| 83 | __forceinline float signmsk ( const float x ) { | 
|---|
| 84 | #if defined(__aarch64__) | 
|---|
| 85 | // FP and Neon shares same vector register in arm64 | 
|---|
| 86 | __m128 a; | 
|---|
| 87 | __m128i b; | 
|---|
| 88 | a[0] = x; | 
|---|
| 89 | b[0] = 0x80000000; | 
|---|
| 90 | a = _mm_and_ps(a, vreinterpretq_f32_s32(b)); | 
|---|
| 91 | return a[0]; | 
|---|
| 92 | #else | 
|---|
| 93 | return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); | 
|---|
| 94 | #endif | 
|---|
| 95 | } | 
|---|
| 96 | __forceinline float xorf( const float x, const float y ) { | 
|---|
| 97 | #if defined(__aarch64__) | 
|---|
| 98 | // FP and Neon shares same vector register in arm64 | 
|---|
| 99 | __m128 a; | 
|---|
| 100 | __m128 b; | 
|---|
| 101 | a[0] = x; | 
|---|
| 102 | b[0] = y; | 
|---|
| 103 | a = _mm_xor_ps(a, b); | 
|---|
| 104 | return a[0]; | 
|---|
| 105 | #else | 
|---|
| 106 | return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y))); | 
|---|
| 107 | #endif | 
|---|
| 108 | } | 
|---|
| 109 | __forceinline float andf( const float x, const unsigned y ) { | 
|---|
| 110 | #if defined(__aarch64__) | 
|---|
| 111 | // FP and Neon shares same vector register in arm64 | 
|---|
| 112 | __m128 a; | 
|---|
| 113 | __m128i b; | 
|---|
| 114 | a[0] = x; | 
|---|
| 115 | b[0] = y; | 
|---|
| 116 | a = _mm_and_ps(a, vreinterpretq_f32_s32(b)); | 
|---|
| 117 | return a[0]; | 
|---|
| 118 | #else | 
|---|
| 119 | return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y)))); | 
|---|
| 120 | #endif | 
|---|
| 121 | } | 
|---|
| 122 | __forceinline float rsqrt( const float x ) | 
|---|
| 123 | { | 
|---|
| 124 | #if defined(__aarch64__) | 
|---|
| 125 | // FP and Neon shares same vector register in arm64 | 
|---|
| 126 | __m128 a; | 
|---|
| 127 | a[0] = x; | 
|---|
| 128 | __m128 value = _mm_rsqrt_ps(a); | 
|---|
| 129 | value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); | 
|---|
| 130 | value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); | 
|---|
| 131 | return value[0]; | 
|---|
| 132 | #else | 
|---|
| 133 |  | 
|---|
| 134 | const __m128 a = _mm_set_ss(x); | 
|---|
| 135 | #if defined(__AVX512VL__) | 
|---|
| 136 | __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a); | 
|---|
| 137 | #else | 
|---|
| 138 | __m128 r = _mm_rsqrt_ss(a); | 
|---|
| 139 | #endif | 
|---|
| 140 | const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), | 
|---|
| 141 | _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); | 
|---|
| 142 | return _mm_cvtss_f32(c); | 
|---|
| 143 | #endif | 
|---|
| 144 | } | 
|---|
| 145 |  | 
|---|
| 146 | #if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700) | 
|---|
| 147 | __forceinline float nextafter(float x, float y) { if ((x<y) == (x>0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); } | 
|---|
| 148 | __forceinline double nextafter(double x, double y) { return _nextafter(x, y); } | 
|---|
| 149 | __forceinline int roundf(float f) { return (int)(f + 0.5f); } | 
|---|
| 150 | #else | 
|---|
| 151 | __forceinline float nextafter(float x, float y) { return ::nextafterf(x, y); } | 
|---|
| 152 | __forceinline double nextafter(double x, double y) { return ::nextafter(x, y); } | 
|---|
| 153 | #endif | 
|---|
| 154 |  | 
|---|
| 155 | __forceinline float abs  ( const float x ) { return ::fabsf(x); } | 
|---|
| 156 | __forceinline float acos ( const float x ) { return ::acosf (x); } | 
|---|
| 157 | __forceinline float asin ( const float x ) { return ::asinf (x); } | 
|---|
| 158 | __forceinline float atan ( const float x ) { return ::atanf (x); } | 
|---|
| 159 | __forceinline float atan2( const float y, const float x ) { return ::atan2f(y, x); } | 
|---|
| 160 | __forceinline float cos  ( const float x ) { return ::cosf  (x); } | 
|---|
| 161 | __forceinline float cosh ( const float x ) { return ::coshf (x); } | 
|---|
| 162 | __forceinline float exp  ( const float x ) { return ::expf  (x); } | 
|---|
| 163 | __forceinline float fmod ( const float x, const float y ) { return ::fmodf (x, y); } | 
|---|
| 164 | __forceinline float log  ( const float x ) { return ::logf  (x); } | 
|---|
| 165 | __forceinline float log10( const float x ) { return ::log10f(x); } | 
|---|
| 166 | __forceinline float pow  ( const float x, const float y ) { return ::powf  (x, y); } | 
|---|
| 167 | __forceinline float sin  ( const float x ) { return ::sinf  (x); } | 
|---|
| 168 | __forceinline float sinh ( const float x ) { return ::sinhf (x); } | 
|---|
| 169 | __forceinline float sqrt ( const float x ) { return ::sqrtf (x); } | 
|---|
| 170 | __forceinline float tan  ( const float x ) { return ::tanf  (x); } | 
|---|
| 171 | __forceinline float tanh ( const float x ) { return ::tanhf (x); } | 
|---|
| 172 | __forceinline float floor( const float x ) { return ::floorf (x); } | 
|---|
| 173 | __forceinline float ceil ( const float x ) { return ::ceilf (x); } | 
|---|
| 174 | __forceinline float frac ( const float x ) { return x-floor(x); } | 
|---|
| 175 |  | 
|---|
| 176 | __forceinline double abs  ( const double x ) { return ::fabs(x); } | 
|---|
| 177 | __forceinline double sign ( const double x ) { return x<0?-1.0:1.0; } | 
|---|
| 178 | __forceinline double acos ( const double x ) { return ::acos (x); } | 
|---|
| 179 | __forceinline double asin ( const double x ) { return ::asin (x); } | 
|---|
| 180 | __forceinline double atan ( const double x ) { return ::atan (x); } | 
|---|
| 181 | __forceinline double atan2( const double y, const double x ) { return ::atan2(y, x); } | 
|---|
| 182 | __forceinline double cos  ( const double x ) { return ::cos  (x); } | 
|---|
| 183 | __forceinline double cosh ( const double x ) { return ::cosh (x); } | 
|---|
| 184 | __forceinline double exp  ( const double x ) { return ::exp  (x); } | 
|---|
| 185 | __forceinline double fmod ( const double x, const double y ) { return ::fmod (x, y); } | 
|---|
| 186 | __forceinline double log  ( const double x ) { return ::log  (x); } | 
|---|
| 187 | __forceinline double log10( const double x ) { return ::log10(x); } | 
|---|
| 188 | __forceinline double pow  ( const double x, const double y ) { return ::pow  (x, y); } | 
|---|
| 189 | __forceinline double rcp  ( const double x ) { return 1.0/x; } | 
|---|
| 190 | __forceinline double rsqrt( const double x ) { return 1.0/::sqrt(x); } | 
|---|
| 191 | __forceinline double sin  ( const double x ) { return ::sin  (x); } | 
|---|
| 192 | __forceinline double sinh ( const double x ) { return ::sinh (x); } | 
|---|
| 193 | __forceinline double sqr  ( const double x ) { return x*x; } | 
|---|
| 194 | __forceinline double sqrt ( const double x ) { return ::sqrt (x); } | 
|---|
| 195 | __forceinline double tan  ( const double x ) { return ::tan  (x); } | 
|---|
| 196 | __forceinline double tanh ( const double x ) { return ::tanh (x); } | 
|---|
| 197 | __forceinline double floor( const double x ) { return ::floor (x); } | 
|---|
| 198 | __forceinline double ceil ( const double x ) { return ::ceil (x); } | 
|---|
| 199 |  | 
|---|
| 200 | #if defined(__aarch64__) | 
|---|
| 201 | __forceinline float mini(float a, float b) { | 
|---|
| 202 | // FP and Neon shares same vector register in arm64 | 
|---|
| 203 | __m128 x; | 
|---|
| 204 | __m128 y; | 
|---|
| 205 | x[0] = a; | 
|---|
| 206 | y[0] = b; | 
|---|
| 207 | x = _mm_min_ps(x, y); | 
|---|
| 208 | return x[0]; | 
|---|
| 209 | } | 
|---|
| 210 | #elif defined(__SSE4_1__) | 
|---|
| 211 | __forceinline float mini(float a, float b) { | 
|---|
| 212 | const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); | 
|---|
| 213 | const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); | 
|---|
| 214 | const __m128i ci = _mm_min_epi32(ai,bi); | 
|---|
| 215 | return _mm_cvtss_f32(_mm_castsi128_ps(ci)); | 
|---|
| 216 | } | 
|---|
| 217 | #endif | 
|---|
| 218 |  | 
|---|
| 219 | #if defined(__aarch64__) | 
|---|
| 220 | __forceinline float maxi(float a, float b) { | 
|---|
| 221 | // FP and Neon shares same vector register in arm64 | 
|---|
| 222 | __m128 x; | 
|---|
| 223 | __m128 y; | 
|---|
| 224 | x[0] = a; | 
|---|
| 225 | y[0] = b; | 
|---|
| 226 | x = _mm_max_ps(x, y); | 
|---|
| 227 | return x[0]; | 
|---|
| 228 | } | 
|---|
| 229 | #elif defined(__SSE4_1__) | 
|---|
| 230 | __forceinline float maxi(float a, float b) { | 
|---|
| 231 | const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); | 
|---|
| 232 | const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); | 
|---|
| 233 | const __m128i ci = _mm_max_epi32(ai,bi); | 
|---|
| 234 | return _mm_cvtss_f32(_mm_castsi128_ps(ci)); | 
|---|
| 235 | } | 
|---|
| 236 | #endif | 
|---|
| 237 |  | 
|---|
| 238 | template<typename T> | 
|---|
| 239 | __forceinline T twice(const T& a) { return a+a; } | 
|---|
| 240 |  | 
|---|
| 241 | __forceinline      int min(int      a, int      b) { return a<b ? a:b; } | 
|---|
| 242 | __forceinline unsigned min(unsigned a, unsigned b) { return a<b ? a:b; } | 
|---|
| 243 | __forceinline  int64_t min(int64_t  a, int64_t  b) { return a<b ? a:b; } | 
|---|
| 244 | __forceinline    float min(float    a, float    b) { return a<b ? a:b; } | 
|---|
| 245 | __forceinline   double min(double   a, double   b) { return a<b ? a:b; } | 
|---|
| 246 | #if defined(__64BIT__) || defined(__EMSCRIPTEN__) | 
|---|
| 247 | __forceinline   size_t min(size_t   a, size_t   b) { return a<b ? a:b; } | 
|---|
| 248 | #endif | 
|---|
| 249 | #if defined(__EMSCRIPTEN__) | 
|---|
| 250 | __forceinline   long   min(long     a, long     b) { return a<b ? a:b; } | 
|---|
| 251 | #endif | 
|---|
| 252 |  | 
|---|
| 253 | template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); } | 
|---|
| 254 | template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); } | 
|---|
| 255 | template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d, const T& e) { return min(min(min(a,b),min(c,d)),e); } | 
|---|
| 256 |  | 
|---|
| 257 | template<typename T> __forceinline T mini(const T& a, const T& b, const T& c) { return mini(mini(a,b),c); } | 
|---|
| 258 | template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d) { return mini(mini(a,b),mini(c,d)); } | 
|---|
| 259 | template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d, const T& e) { return mini(mini(mini(a,b),mini(c,d)),e); } | 
|---|
| 260 |  | 
|---|
| 261 | __forceinline      int max(int      a, int      b) { return a<b ? b:a; } | 
|---|
| 262 | __forceinline unsigned max(unsigned a, unsigned b) { return a<b ? b:a; } | 
|---|
| 263 | __forceinline  int64_t max(int64_t  a, int64_t  b) { return a<b ? b:a; } | 
|---|
| 264 | __forceinline    float max(float    a, float    b) { return a<b ? b:a; } | 
|---|
| 265 | __forceinline   double max(double   a, double   b) { return a<b ? b:a; } | 
|---|
| 266 | #if defined(__64BIT__) || defined(__EMSCRIPTEN__) | 
|---|
| 267 | __forceinline   size_t max(size_t   a, size_t   b) { return a<b ? b:a; } | 
|---|
| 268 | #endif | 
|---|
| 269 | #if defined(__EMSCRIPTEN__) | 
|---|
| 270 | __forceinline   long   max(long     a, long     b) { return a<b ? b:a; } | 
|---|
| 271 | #endif | 
|---|
| 272 |  | 
|---|
| 273 | template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); } | 
|---|
| 274 | template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); } | 
|---|
| 275 | template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d, const T& e) { return max(max(max(a,b),max(c,d)),e); } | 
|---|
| 276 |  | 
|---|
| 277 | template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c) { return maxi(maxi(a,b),c); } | 
|---|
| 278 | template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d) { return maxi(maxi(a,b),maxi(c,d)); } | 
|---|
| 279 | template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d, const T& e) { return maxi(maxi(maxi(a,b),maxi(c,d)),e); } | 
|---|
| 280 |  | 
|---|
| 281 | #if defined(__MACOSX__) | 
|---|
| 282 | __forceinline ssize_t min(ssize_t a, ssize_t b) { return a<b ? a:b; } | 
|---|
| 283 | __forceinline ssize_t max(ssize_t a, ssize_t b) { return a<b ? b:a; } | 
|---|
| 284 | #endif | 
|---|
| 285 |  | 
|---|
| 286 | #if defined(__MACOSX__) && !defined(__INTEL_COMPILER) | 
|---|
| 287 | __forceinline void sincosf(float x, float *sin, float *cos) { | 
|---|
| 288 | __sincosf(x,sin,cos); | 
|---|
| 289 | } | 
|---|
| 290 | #endif | 
|---|
| 291 |  | 
|---|
| 292 | #if defined(__WIN32__) || defined(__FreeBSD__) | 
|---|
| 293 | __forceinline void sincosf(float x, float *s, float *c) { | 
|---|
| 294 | *s = sinf(x); *c = cosf(x); | 
|---|
| 295 | } | 
|---|
| 296 | #endif | 
|---|
| 297 |  | 
|---|
| 298 | template<typename T> __forceinline T clamp(const T& x, const T& lower = T(zero), const T& upper = T(one)) { return max(min(x,upper),lower); } | 
|---|
| 299 | template<typename T> __forceinline T clampz(const T& x, const T& upper) { return max(T(zero), min(x,upper)); } | 
|---|
| 300 |  | 
|---|
| 301 | template<typename T> __forceinline T  deg2rad ( const T& x )  { return x * T(1.74532925199432957692e-2f); } | 
|---|
| 302 | template<typename T> __forceinline T  rad2deg ( const T& x )  { return x * T(5.72957795130823208768e1f); } | 
|---|
| 303 | template<typename T> __forceinline T  sin2cos ( const T& x )  { return sqrt(max(T(zero),T(one)-x*x)); } | 
|---|
| 304 | template<typename T> __forceinline T  cos2sin ( const T& x )  { return sin2cos(x); } | 
|---|
| 305 |  | 
|---|
| 306 | #if defined(__AVX2__) | 
|---|
| 307 | __forceinline float madd  ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } | 
|---|
| 308 | __forceinline float msub  ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } | 
|---|
| 309 | __forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } | 
|---|
| 310 | __forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } | 
|---|
| 311 |  | 
|---|
| 312 | #elif defined (__aarch64__) && defined(__clang__) | 
|---|
| 313 | #pragma clang fp contract(fast) | 
|---|
| 314 | __forceinline float madd  ( const float a, const float b, const float c) { return a*b + c; } | 
|---|
| 315 | __forceinline float msub  ( const float a, const float b, const float c) { return a*b - c; } | 
|---|
| 316 | __forceinline float nmadd ( const float a, const float b, const float c) { return c - a*b; } | 
|---|
| 317 | __forceinline float nmsub ( const float a, const float b, const float c) { return -(c + a*b); } | 
|---|
| 318 | #pragma clang fp contract(on) | 
|---|
| 319 |  | 
|---|
| 320 | #else | 
|---|
| 321 | __forceinline float madd  ( const float a, const float b, const float c) { return a*b+c; } | 
|---|
| 322 | __forceinline float msub  ( const float a, const float b, const float c) { return a*b-c; } | 
|---|
| 323 | __forceinline float nmadd ( const float a, const float b, const float c) { return -a*b+c;} | 
|---|
| 324 | __forceinline float nmsub ( const float a, const float b, const float c) { return -a*b-c; } | 
|---|
| 325 | #endif | 
|---|
| 326 |  | 
|---|
| 327 | /*! random functions */ | 
|---|
| 328 | template<typename T> T random() { return T(0); } | 
|---|
| 329 | #if defined(_WIN32) | 
|---|
| 330 | template<> __forceinline int      random() { return int(rand()) ^ (int(rand()) << 8) ^ (int(rand()) << 16); } | 
|---|
| 331 | template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 8) ^ (uint32_t(rand()) << 16); } | 
|---|
| 332 | #else | 
|---|
| 333 | template<> __forceinline int      random() { return int(rand()); } | 
|---|
| 334 | template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 16); } | 
|---|
| 335 | #endif | 
|---|
| 336 | template<> __forceinline float  random() { return rand()/float(RAND_MAX); } | 
|---|
| 337 | template<> __forceinline double random() { return rand()/double(RAND_MAX); } | 
|---|
| 338 |  | 
|---|
| 339 | #if _WIN32 | 
|---|
| 340 | __forceinline double drand48() { | 
|---|
| 341 | return double(rand())/double(RAND_MAX); | 
|---|
| 342 | } | 
|---|
| 343 |  | 
|---|
| 344 | __forceinline void srand48(long seed) { | 
|---|
| 345 | return srand(seed); | 
|---|
| 346 | } | 
|---|
| 347 | #endif | 
|---|
| 348 |  | 
|---|
| 349 | /*! selects */ | 
|---|
| 350 | __forceinline bool  select(bool s, bool  t , bool f) { return s ? t : f; } | 
|---|
| 351 | __forceinline int   select(bool s, int   t,   int f) { return s ? t : f; } | 
|---|
| 352 | __forceinline float select(bool s, float t, float f) { return s ? t : f; } | 
|---|
| 353 |  | 
|---|
| 354 | __forceinline bool all(bool s) { return s; } | 
|---|
| 355 |  | 
|---|
| 356 | __forceinline float lerp(const float v0, const float v1, const float t) { | 
|---|
| 357 | return madd(1.0f-t,v0,t*v1); | 
|---|
| 358 | } | 
|---|
| 359 |  | 
|---|
| 360 | template<typename T> | 
|---|
| 361 | __forceinline T lerp2(const float x0, const float x1, const float x2, const float x3, const T& u, const T& v) { | 
|---|
| 362 | return madd((1.0f-u),madd((1.0f-v),T(x0),v*T(x2)),u*madd((1.0f-v),T(x1),v*T(x3))); | 
|---|
| 363 | } | 
|---|
| 364 |  | 
|---|
| 365 | /*! exchange */ | 
|---|
| 366 | template<typename T> __forceinline void xchg ( T& a, T& b ) { const T tmp = a; a = b; b = tmp; } | 
|---|
| 367 |  | 
|---|
| 368 | /*  load/store */ | 
|---|
| 369 | template<typename Ty> struct mem; | 
|---|
| 370 |  | 
|---|
| 371 | template<> struct mem<float> { | 
|---|
| 372 | static __forceinline float load (bool mask, const void* ptr) { return mask ? *(float*)ptr : 0.0f; } | 
|---|
| 373 | static __forceinline float loadu(bool mask, const void* ptr) { return mask ? *(float*)ptr : 0.0f; } | 
|---|
| 374 |  | 
|---|
| 375 | static __forceinline void store (bool mask, void* ptr, const float v) { if (mask) *(float*)ptr = v; } | 
|---|
| 376 | static __forceinline void storeu(bool mask, void* ptr, const float v) { if (mask) *(float*)ptr = v; } | 
|---|
| 377 | }; | 
|---|
| 378 |  | 
|---|
| 379 | /*! bit reverse operation */ | 
|---|
| 380 | template<class T> | 
|---|
| 381 | __forceinline T bitReverse(const T& vin) | 
|---|
| 382 | { | 
|---|
| 383 | T v = vin; | 
|---|
| 384 | v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); | 
|---|
| 385 | v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); | 
|---|
| 386 | v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); | 
|---|
| 387 | v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); | 
|---|
| 388 | v = ( v >> 16             ) | ( v               << 16); | 
|---|
| 389 | return v; | 
|---|
| 390 | } | 
|---|
| 391 |  | 
|---|
| 392 | /*! bit interleave operation */ | 
|---|
| 393 | template<class T> | 
|---|
| 394 | __forceinline T bitInterleave(const T& xin, const T& yin, const T& zin) | 
|---|
| 395 | { | 
|---|
| 396 | T x = xin, y = yin, z = zin; | 
|---|
| 397 | x = (x | (x << 16)) & 0x030000FF; | 
|---|
| 398 | x = (x | (x <<  8)) & 0x0300F00F; | 
|---|
| 399 | x = (x | (x <<  4)) & 0x030C30C3; | 
|---|
| 400 | x = (x | (x <<  2)) & 0x09249249; | 
|---|
| 401 |  | 
|---|
| 402 | y = (y | (y << 16)) & 0x030000FF; | 
|---|
| 403 | y = (y | (y <<  8)) & 0x0300F00F; | 
|---|
| 404 | y = (y | (y <<  4)) & 0x030C30C3; | 
|---|
| 405 | y = (y | (y <<  2)) & 0x09249249; | 
|---|
| 406 |  | 
|---|
| 407 | z = (z | (z << 16)) & 0x030000FF; | 
|---|
| 408 | z = (z | (z <<  8)) & 0x0300F00F; | 
|---|
| 409 | z = (z | (z <<  4)) & 0x030C30C3; | 
|---|
| 410 | z = (z | (z <<  2)) & 0x09249249; | 
|---|
| 411 |  | 
|---|
| 412 | return x | (y << 1) | (z << 2); | 
|---|
| 413 | } | 
|---|
| 414 |  | 
|---|
| 415 | #if defined(__AVX2__) && !defined(__aarch64__) | 
|---|
| 416 |  | 
|---|
| 417 | template<> | 
|---|
| 418 | __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi) | 
|---|
| 419 | { | 
|---|
| 420 | const unsigned int xx = pdep(xi,0x49249249 /* 0b01001001001001001001001001001001 */ ); | 
|---|
| 421 | const unsigned int yy = pdep(yi,0x92492492 /* 0b10010010010010010010010010010010 */); | 
|---|
| 422 | const unsigned int zz = pdep(zi,0x24924924 /* 0b00100100100100100100100100100100 */); | 
|---|
| 423 | return xx | yy | zz; | 
|---|
| 424 | } | 
|---|
| 425 |  | 
|---|
| 426 | #endif | 
|---|
| 427 |  | 
|---|
| 428 | /*! bit interleave operation for 64bit data types*/ | 
|---|
| 429 | template<class T> | 
|---|
| 430 | __forceinline T bitInterleave64(const T& xin, const T& yin, const T& zin){ | 
|---|
| 431 | T x = xin & 0x1fffff; | 
|---|
| 432 | T y = yin & 0x1fffff; | 
|---|
| 433 | T z = zin & 0x1fffff; | 
|---|
| 434 |  | 
|---|
| 435 | x = (x | x << 32) & 0x1f00000000ffff; | 
|---|
| 436 | x = (x | x << 16) & 0x1f0000ff0000ff; | 
|---|
| 437 | x = (x | x << 8) & 0x100f00f00f00f00f; | 
|---|
| 438 | x = (x | x << 4) & 0x10c30c30c30c30c3; | 
|---|
| 439 | x = (x | x << 2) & 0x1249249249249249; | 
|---|
| 440 |  | 
|---|
| 441 | y = (y | y << 32) & 0x1f00000000ffff; | 
|---|
| 442 | y = (y | y << 16) & 0x1f0000ff0000ff; | 
|---|
| 443 | y = (y | y << 8) & 0x100f00f00f00f00f; | 
|---|
| 444 | y = (y | y << 4) & 0x10c30c30c30c30c3; | 
|---|
| 445 | y = (y | y << 2) & 0x1249249249249249; | 
|---|
| 446 |  | 
|---|
| 447 | z = (z | z << 32) & 0x1f00000000ffff; | 
|---|
| 448 | z = (z | z << 16) & 0x1f0000ff0000ff; | 
|---|
| 449 | z = (z | z << 8) & 0x100f00f00f00f00f; | 
|---|
| 450 | z = (z | z << 4) & 0x10c30c30c30c30c3; | 
|---|
| 451 | z = (z | z << 2) & 0x1249249249249249; | 
|---|
| 452 |  | 
|---|
| 453 | return x | (y << 1) | (z << 2); | 
|---|
| 454 | } | 
|---|
| 455 | } | 
|---|
| 456 |  | 
|---|