| 1 | // Copyright 2009-2021 Intel Corporation |
| 2 | // SPDX-License-Identifier: Apache-2.0 |
| 3 | |
| 4 | #pragma once |
| 5 | |
| 6 | // Transcendental functions from "ispc": https://github.com/ispc/ispc/ |
| 7 | // Most of the transcendental implementations in ispc code come from |
| 8 | // Solomon Boulos's "syrah": https://github.com/boulos/syrah/ |
| 9 | |
| 10 | #include "../simd/simd.h" |
| 11 | |
| 12 | namespace embree |
| 13 | { |
| 14 | |
| 15 | namespace fastapprox |
| 16 | { |
| 17 | |
| 18 | template <typename T> |
| 19 | __forceinline T sin(const T &v) |
| 20 | { |
| 21 | static const float piOverTwoVec = 1.57079637050628662109375; |
| 22 | static const float twoOverPiVec = 0.636619746685028076171875; |
| 23 | auto scaled = v * twoOverPiVec; |
| 24 | auto kReal = floor(scaled); |
| 25 | auto k = toInt(kReal); |
| 26 | |
| 27 | // Reduced range version of x |
| 28 | auto x = v - kReal * piOverTwoVec; |
| 29 | auto kMod4 = k & 3; |
| 30 | auto sinUseCos = (kMod4 == 1) | (kMod4 == 3); |
| 31 | auto flipSign = (kMod4 > 1); |
| 32 | |
| 33 | // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2, |
| 34 | // 4, 6, 8, 10|], [|single...|], [0;Pi/2]); |
| 35 | static const float sinC2 = -0.16666667163372039794921875; |
| 36 | static const float sinC4 = +8.333347737789154052734375e-3; |
| 37 | static const float sinC6 = -1.9842604524455964565277099609375e-4; |
| 38 | static const float sinC8 = +2.760012648650445044040679931640625e-6; |
| 39 | static const float sinC10 = -2.50293279435709337121807038784027099609375e-8; |
| 40 | |
| 41 | static const float cosC2 = -0.5; |
| 42 | static const float cosC4 = +4.166664183139801025390625e-2; |
| 43 | static const float cosC6 = -1.388833043165504932403564453125e-3; |
| 44 | static const float cosC8 = +2.47562347794882953166961669921875e-5; |
| 45 | static const float cosC10 = -2.59630184018533327616751194000244140625e-7; |
| 46 | |
| 47 | auto outside = select(sinUseCos, 1., x); |
| 48 | auto c2 = select(sinUseCos, T(cosC2), T(sinC2)); |
| 49 | auto c4 = select(sinUseCos, T(cosC4), T(sinC4)); |
| 50 | auto c6 = select(sinUseCos, T(cosC6), T(sinC6)); |
| 51 | auto c8 = select(sinUseCos, T(cosC8), T(sinC8)); |
| 52 | auto c10 = select(sinUseCos, T(cosC10), T(sinC10)); |
| 53 | |
| 54 | auto x2 = x * x; |
| 55 | auto formula = x2 * c10 + c8; |
| 56 | formula = x2 * formula + c6; |
| 57 | formula = x2 * formula + c4; |
| 58 | formula = x2 * formula + c2; |
| 59 | formula = x2 * formula + 1.; |
| 60 | formula *= outside; |
| 61 | |
| 62 | formula = select(flipSign, -formula, formula); |
| 63 | return formula; |
| 64 | } |
| 65 | |
| 66 | template <typename T> |
| 67 | __forceinline T cos(const T &v) |
| 68 | { |
| 69 | static const float piOverTwoVec = 1.57079637050628662109375; |
| 70 | static const float twoOverPiVec = 0.636619746685028076171875; |
| 71 | auto scaled = v * twoOverPiVec; |
| 72 | auto kReal = floor(scaled); |
| 73 | auto k = toInt(kReal); |
| 74 | |
| 75 | // Reduced range version of x |
| 76 | auto x = v - kReal * piOverTwoVec; |
| 77 | |
| 78 | auto kMod4 = k & 3; |
| 79 | auto cosUseCos = (kMod4 == 0) | (kMod4 == 2); |
| 80 | auto flipSign = (kMod4 == 1) | (kMod4 == 2); |
| 81 | |
| 82 | const float sinC2 = -0.16666667163372039794921875; |
| 83 | const float sinC4 = +8.333347737789154052734375e-3; |
| 84 | const float sinC6 = -1.9842604524455964565277099609375e-4; |
| 85 | const float sinC8 = +2.760012648650445044040679931640625e-6; |
| 86 | const float sinC10 = -2.50293279435709337121807038784027099609375e-8; |
| 87 | |
| 88 | const float cosC2 = -0.5; |
| 89 | const float cosC4 = +4.166664183139801025390625e-2; |
| 90 | const float cosC6 = -1.388833043165504932403564453125e-3; |
| 91 | const float cosC8 = +2.47562347794882953166961669921875e-5; |
| 92 | const float cosC10 = -2.59630184018533327616751194000244140625e-7; |
| 93 | |
| 94 | auto outside = select(cosUseCos, 1., x); |
| 95 | auto c2 = select(cosUseCos, T(cosC2), T(sinC2)); |
| 96 | auto c4 = select(cosUseCos, T(cosC4), T(sinC4)); |
| 97 | auto c6 = select(cosUseCos, T(cosC6), T(sinC6)); |
| 98 | auto c8 = select(cosUseCos, T(cosC8), T(sinC8)); |
| 99 | auto c10 = select(cosUseCos, T(cosC10), T(sinC10)); |
| 100 | |
| 101 | auto x2 = x * x; |
| 102 | auto formula = x2 * c10 + c8; |
| 103 | formula = x2 * formula + c6; |
| 104 | formula = x2 * formula + c4; |
| 105 | formula = x2 * formula + c2; |
| 106 | formula = x2 * formula + 1.; |
| 107 | formula *= outside; |
| 108 | |
| 109 | formula = select(flipSign, -formula, formula); |
| 110 | return formula; |
| 111 | } |
| 112 | |
| 113 | template <typename T> |
| 114 | __forceinline void sincos(const T &v, T &sinResult, T &cosResult) |
| 115 | { |
| 116 | const float piOverTwoVec = 1.57079637050628662109375; |
| 117 | const float twoOverPiVec = 0.636619746685028076171875; |
| 118 | auto scaled = v * twoOverPiVec; |
| 119 | auto kReal = floor(scaled); |
| 120 | auto k = toInt(kReal); |
| 121 | |
| 122 | // Reduced range version of x |
| 123 | auto x = v - kReal * piOverTwoVec; |
| 124 | auto kMod4 = k & 3; |
| 125 | auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2)); |
| 126 | auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3)); |
| 127 | auto sinFlipSign = (kMod4 > 1); |
| 128 | auto cosFlipSign = ((kMod4 == 1) | (kMod4 == 2)); |
| 129 | |
| 130 | const float oneVec = +1.; |
| 131 | const float sinC2 = -0.16666667163372039794921875; |
| 132 | const float sinC4 = +8.333347737789154052734375e-3; |
| 133 | const float sinC6 = -1.9842604524455964565277099609375e-4; |
| 134 | const float sinC8 = +2.760012648650445044040679931640625e-6; |
| 135 | const float sinC10 = -2.50293279435709337121807038784027099609375e-8; |
| 136 | |
| 137 | const float cosC2 = -0.5; |
| 138 | const float cosC4 = +4.166664183139801025390625e-2; |
| 139 | const float cosC6 = -1.388833043165504932403564453125e-3; |
| 140 | const float cosC8 = +2.47562347794882953166961669921875e-5; |
| 141 | const float cosC10 = -2.59630184018533327616751194000244140625e-7; |
| 142 | |
| 143 | auto x2 = x * x; |
| 144 | |
| 145 | auto sinFormula = x2 * sinC10 + sinC8; |
| 146 | auto cosFormula = x2 * cosC10 + cosC8; |
| 147 | sinFormula = x2 * sinFormula + sinC6; |
| 148 | cosFormula = x2 * cosFormula + cosC6; |
| 149 | |
| 150 | sinFormula = x2 * sinFormula + sinC4; |
| 151 | cosFormula = x2 * cosFormula + cosC4; |
| 152 | |
| 153 | sinFormula = x2 * sinFormula + sinC2; |
| 154 | cosFormula = x2 * cosFormula + cosC2; |
| 155 | |
| 156 | sinFormula = x2 * sinFormula + oneVec; |
| 157 | cosFormula = x2 * cosFormula + oneVec; |
| 158 | |
| 159 | sinFormula *= x; |
| 160 | |
| 161 | sinResult = select(sinUseCos, cosFormula, sinFormula); |
| 162 | cosResult = select(cosUseCos, cosFormula, sinFormula); |
| 163 | |
| 164 | sinResult = select(sinFlipSign, -sinResult, sinResult); |
| 165 | cosResult = select(cosFlipSign, -cosResult, cosResult); |
| 166 | } |
| 167 | |
| 168 | template <typename T> |
| 169 | __forceinline T tan(const T &v) |
| 170 | { |
| 171 | const float piOverFourVec = 0.785398185253143310546875; |
| 172 | const float fourOverPiVec = 1.27323949337005615234375; |
| 173 | |
| 174 | auto xLt0 = v < 0.; |
| 175 | auto y = select(xLt0, -v, v); |
| 176 | auto scaled = y * fourOverPiVec; |
| 177 | |
| 178 | auto kReal = floor(scaled); |
| 179 | auto k = toInt(kReal); |
| 180 | |
| 181 | auto x = y - kReal * piOverFourVec; |
| 182 | |
| 183 | // If k & 1, x -= Pi/4 |
| 184 | auto needOffset = (k & 1) != 0; |
| 185 | x = select(needOffset, x - piOverFourVec, x); |
| 186 | |
| 187 | // If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To... |
| 188 | auto kMod4 = k & 3; |
| 189 | auto useCotan = (kMod4 == 1) | (kMod4 == 2); |
| 190 | |
| 191 | const float oneVec = 1.0; |
| 192 | |
| 193 | const float tanC2 = +0.33333075046539306640625; |
| 194 | const float tanC4 = +0.13339905440807342529296875; |
| 195 | const float tanC6 = +5.3348250687122344970703125e-2; |
| 196 | const float tanC8 = +2.46033705770969390869140625e-2; |
| 197 | const float tanC10 = +2.892402000725269317626953125e-3; |
| 198 | const float tanC12 = +9.500005282461643218994140625e-3; |
| 199 | |
| 200 | const float cotC2 = -0.3333333432674407958984375; |
| 201 | const float cotC4 = -2.222204394638538360595703125e-2; |
| 202 | const float cotC6 = -2.11752182804048061370849609375e-3; |
| 203 | const float cotC8 = -2.0846328698098659515380859375e-4; |
| 204 | const float cotC10 = -2.548247357481159269809722900390625e-5; |
| 205 | const float cotC12 = -3.5257363606433500535786151885986328125e-7; |
| 206 | |
| 207 | auto x2 = x * x; |
| 208 | T z; |
| 209 | if (any(useCotan)) |
| 210 | { |
| 211 | auto cotVal = x2 * cotC12 + cotC10; |
| 212 | cotVal = x2 * cotVal + cotC8; |
| 213 | cotVal = x2 * cotVal + cotC6; |
| 214 | cotVal = x2 * cotVal + cotC4; |
| 215 | cotVal = x2 * cotVal + cotC2; |
| 216 | cotVal = x2 * cotVal + oneVec; |
| 217 | // The equation is for x * cot(x) but we need -x * cot(x) for the tan part. |
| 218 | cotVal /= -x; |
| 219 | z = cotVal; |
| 220 | } |
| 221 | auto useTan = !useCotan; |
| 222 | if (any(useTan)) |
| 223 | { |
| 224 | auto tanVal = x2 * tanC12 + tanC10; |
| 225 | tanVal = x2 * tanVal + tanC8; |
| 226 | tanVal = x2 * tanVal + tanC6; |
| 227 | tanVal = x2 * tanVal + tanC4; |
| 228 | tanVal = x2 * tanVal + tanC2; |
| 229 | tanVal = x2 * tanVal + oneVec; |
| 230 | // Equation was for tan(x)/x |
| 231 | tanVal *= x; |
| 232 | z = select(useTan, tanVal, z); |
| 233 | } |
| 234 | return select(xLt0, -z, z); |
| 235 | } |
| 236 | |
| 237 | template <typename T> |
| 238 | __forceinline T asin(const T &x0) |
| 239 | { |
| 240 | auto isneg = (x0 < 0.f); |
| 241 | auto x = abs(x0); |
| 242 | auto isnan = (x > 1.f); |
| 243 | |
| 244 | // sollya |
| 245 | // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|], |
| 246 | // [1e-20;.9999999999999999]); |
| 247 | // avg error: 1.1105439e-06, max error 1.3187528e-06 |
| 248 | auto v = 1.57079517841339111328125f + |
| 249 | x * (-0.21450997889041900634765625f + |
| 250 | x * (8.78556668758392333984375e-2f + |
| 251 | x * (-4.489909112453460693359375e-2f + |
| 252 | x * (1.928029954433441162109375e-2f + |
| 253 | x * (-4.3095736764371395111083984375e-3f))))); |
| 254 | |
| 255 | v *= -sqrt(1.f - x); |
| 256 | v = v + 1.57079637050628662109375f; |
| 257 | |
| 258 | v = select(v < 0.f, T(0.f), v); |
| 259 | v = select(isneg, -v, v); |
| 260 | v = select(isnan, T(cast_i2f(0x7fc00000)), v); |
| 261 | |
| 262 | return v; |
| 263 | } |
| 264 | |
| 265 | template <typename T> |
| 266 | __forceinline T acos(const T &v) |
| 267 | { |
| 268 | return 1.57079637050628662109375f - asin(v); |
| 269 | } |
| 270 | |
| 271 | template <typename T> |
| 272 | __forceinline T atan(const T &v) |
| 273 | { |
| 274 | const float piOverTwoVec = 1.57079637050628662109375; |
| 275 | // atan(-x) = -atan(x) (so flip from negative to positive first) |
| 276 | // If x > 1 -> atan(x) = Pi/2 - atan(1/x) |
| 277 | auto xNeg = v < 0.f; |
| 278 | auto xFlipped = select(xNeg, -v, v); |
| 279 | |
| 280 | auto xGt1 = xFlipped > 1.; |
| 281 | auto x = select(xGt1, rcpSafe(xFlipped), xFlipped); |
| 282 | |
| 283 | // These coefficients approximate atan(x)/x |
| 284 | const float atanC0 = +0.99999988079071044921875; |
| 285 | const float atanC2 = -0.3333191573619842529296875; |
| 286 | const float atanC4 = +0.199689209461212158203125; |
| 287 | const float atanC6 = -0.14015688002109527587890625; |
| 288 | const float atanC8 = +9.905083477497100830078125e-2; |
| 289 | const float atanC10 = -5.93664981424808502197265625e-2; |
| 290 | const float atanC12 = +2.417283318936824798583984375e-2; |
| 291 | const float atanC14 = -4.6721356920897960662841796875e-3; |
| 292 | |
| 293 | auto x2 = x * x; |
| 294 | auto result = x2 * atanC14 + atanC12; |
| 295 | result = x2 * result + atanC10; |
| 296 | result = x2 * result + atanC8; |
| 297 | result = x2 * result + atanC6; |
| 298 | result = x2 * result + atanC4; |
| 299 | result = x2 * result + atanC2; |
| 300 | result = x2 * result + atanC0; |
| 301 | result *= x; |
| 302 | |
| 303 | result = select(xGt1, piOverTwoVec - result, result); |
| 304 | result = select(xNeg, -result, result); |
| 305 | return result; |
| 306 | } |
| 307 | |
| 308 | template <typename T> |
| 309 | __forceinline T atan2(const T &y, const T &x) |
| 310 | { |
| 311 | const float piVec = 3.1415926536; |
| 312 | // atan2(y, x) = |
| 313 | // |
| 314 | // atan2(y > 0, x = +-0) -> Pi/2 |
| 315 | // atan2(y < 0, x = +-0) -> -Pi/2 |
| 316 | // atan2(y = +-0, x < +0) -> +-Pi |
| 317 | // atan2(y = +-0, x >= +0) -> +-0 |
| 318 | // |
| 319 | // atan2(y >= 0, x < 0) -> Pi + atan(y/x) |
| 320 | // atan2(y < 0, x < 0) -> -Pi + atan(y/x) |
| 321 | // atan2(y, x > 0) -> atan(y/x) |
| 322 | // |
| 323 | // and then a bunch of code for dealing with infinities. |
| 324 | auto yOverX = y * rcpSafe(x); |
| 325 | auto atanArg = atan(yOverX); |
| 326 | auto xLt0 = x < 0.f; |
| 327 | auto yLt0 = y < 0.f; |
| 328 | auto offset = select(xLt0, |
| 329 | select(yLt0, T(-piVec), T(piVec)), 0.f); |
| 330 | return offset + atanArg; |
| 331 | } |
| 332 | |
| 333 | template <typename T> |
| 334 | __forceinline T exp(const T &v) |
| 335 | { |
| 336 | const float ln2Part1 = 0.6931457519; |
| 337 | const float ln2Part2 = 1.4286067653e-6; |
| 338 | const float oneOverLn2 = 1.44269502162933349609375; |
| 339 | |
| 340 | auto scaled = v * oneOverLn2; |
| 341 | auto kReal = floor(scaled); |
| 342 | auto k = toInt(kReal); |
| 343 | |
| 344 | // Reduced range version of x |
| 345 | auto x = v - kReal * ln2Part1; |
| 346 | x -= kReal * ln2Part2; |
| 347 | |
| 348 | // These coefficients are for e^x in [0, ln(2)] |
| 349 | const float one = 1.; |
| 350 | const float c2 = 0.4999999105930328369140625; |
| 351 | const float c3 = 0.166668415069580078125; |
| 352 | const float c4 = 4.16539050638675689697265625e-2; |
| 353 | const float c5 = 8.378830738365650177001953125e-3; |
| 354 | const float c6 = 1.304379315115511417388916015625e-3; |
| 355 | const float c7 = 2.7555381529964506626129150390625e-4; |
| 356 | |
| 357 | auto result = x * c7 + c6; |
| 358 | result = x * result + c5; |
| 359 | result = x * result + c4; |
| 360 | result = x * result + c3; |
| 361 | result = x * result + c2; |
| 362 | result = x * result + one; |
| 363 | result = x * result + one; |
| 364 | |
| 365 | // Compute 2^k (should differ for float and double, but I'll avoid |
| 366 | // it for now and just do floats) |
| 367 | const int fpbias = 127; |
| 368 | auto biasedN = k + fpbias; |
| 369 | auto overflow = kReal > fpbias; |
| 370 | // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0) |
| 371 | // we've got underflow. -127 * ln(2) -> -88.02. So the most |
| 372 | // negative float input that doesn't result in zero is like -88. |
| 373 | auto underflow = kReal <= -fpbias; |
| 374 | const int infBits = 0x7f800000; |
| 375 | biasedN <<= 23; |
| 376 | // Reinterpret this thing as float |
| 377 | auto twoToTheN = asFloat(biasedN); |
| 378 | // Handle both doubles and floats (hopefully eliding the copy for float) |
| 379 | auto elemtype2n = twoToTheN; |
| 380 | result *= elemtype2n; |
| 381 | result = select(overflow, cast_i2f(infBits), result); |
| 382 | result = select(underflow, 0., result); |
| 383 | return result; |
| 384 | } |
| 385 | |
| 386 | // Range reduction for logarithms takes log(x) -> log(2^n * y) -> n |
| 387 | // * log(2) + log(y) where y is the reduced range (usually in [1/2, 1)). |
| 388 | template <typename T, typename R> |
| 389 | __forceinline void __rangeReduceLog(const T &input, |
| 390 | T &reduced, |
| 391 | R &exponent) |
| 392 | { |
| 393 | auto intVersion = asInt(input); |
| 394 | // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM |
| 395 | // exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000 |
| 396 | // 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0 |
| 397 | // non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111 |
| 398 | // = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF |
| 399 | |
| 400 | //const int exponentMask(0x7F800000) |
| 401 | static const int nonexponentMask = 0x807FFFFF; |
| 402 | |
| 403 | // We want the reduced version to have an exponent of -1 which is |
| 404 | // -1 + 127 after biasing or 126 |
| 405 | static const int exponentNeg1 = (126l << 23); |
| 406 | // NOTE(boulos): We don't need to mask anything out since we know |
| 407 | // the sign bit has to be 0. If it's 1, we need to return infinity/nan |
| 408 | // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN). |
| 409 | auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128] |
| 410 | |
| 411 | auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2 |
| 412 | exponent = offsetExponent - 127; // get the real value |
| 413 | |
| 414 | // Blend the offset_exponent with the original input (do this in |
| 415 | // int for now, until I decide if float can have & and ¬) |
| 416 | auto blended = (intVersion & nonexponentMask) | (exponentNeg1); |
| 417 | reduced = asFloat(blended); |
| 418 | } |
| 419 | |
| 420 | template <typename T> struct ExponentType { }; |
| 421 | template <int N> struct ExponentType<vfloat_impl<N>> { typedef vint<N> Ty; }; |
| 422 | template <> struct ExponentType<float> { typedef int Ty; }; |
| 423 | |
| 424 | template <typename T> |
| 425 | __forceinline T log(const T &v) |
| 426 | { |
| 427 | T reduced; |
| 428 | typename ExponentType<T>::Ty exponent; |
| 429 | |
| 430 | const int nanBits = 0x7fc00000; |
| 431 | const int negInfBits = 0xFF800000; |
| 432 | const float nan = cast_i2f(nanBits); |
| 433 | const float negInf = cast_i2f(negInfBits); |
| 434 | auto useNan = v < 0.; |
| 435 | auto useInf = v == 0.; |
| 436 | auto exceptional = useNan | useInf; |
| 437 | const float one = 1.0; |
| 438 | |
| 439 | auto patched = select(exceptional, one, v); |
| 440 | __rangeReduceLog(patched, reduced, exponent); |
| 441 | |
| 442 | const float ln2 = 0.693147182464599609375; |
| 443 | |
| 444 | auto x1 = one - reduced; |
| 445 | const float c1 = +0.50000095367431640625; |
| 446 | const float c2 = +0.33326041698455810546875; |
| 447 | const float c3 = +0.2519190013408660888671875; |
| 448 | const float c4 = +0.17541764676570892333984375; |
| 449 | const float c5 = +0.3424419462680816650390625; |
| 450 | const float c6 = -0.599632322788238525390625; |
| 451 | const float c7 = +1.98442304134368896484375; |
| 452 | const float c8 = -2.4899270534515380859375; |
| 453 | const float c9 = +1.7491014003753662109375; |
| 454 | |
| 455 | auto result = x1 * c9 + c8; |
| 456 | result = x1 * result + c7; |
| 457 | result = x1 * result + c6; |
| 458 | result = x1 * result + c5; |
| 459 | result = x1 * result + c4; |
| 460 | result = x1 * result + c3; |
| 461 | result = x1 * result + c2; |
| 462 | result = x1 * result + c1; |
| 463 | result = x1 * result + one; |
| 464 | |
| 465 | // Equation was for -(ln(red)/(1-red)) |
| 466 | result *= -x1; |
| 467 | result += toFloat(exponent) * ln2; |
| 468 | |
| 469 | return select(exceptional, |
| 470 | select(useNan, T(nan), T(negInf)), |
| 471 | result); |
| 472 | } |
| 473 | |
| 474 | template <typename T> |
| 475 | __forceinline T pow(const T &x, const T &y) |
| 476 | { |
| 477 | auto x1 = abs(x); |
| 478 | auto z = exp(y * log(x1)); |
| 479 | |
| 480 | // Handle special cases |
| 481 | const float twoOver23 = 8388608.0f; |
| 482 | auto yInt = y == round(y); |
| 483 | auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit |
| 484 | |
| 485 | // x == 0 |
| 486 | z = select(x == 0.0f, |
| 487 | select(y < 0.0f, T(inf) | signmsk(x), |
| 488 | select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z); |
| 489 | |
| 490 | // x < 0 |
| 491 | auto xNegative = x < 0.0f; |
| 492 | if (any(xNegative)) |
| 493 | { |
| 494 | auto z1 = z | asFloat(yOddInt); |
| 495 | z1 = select(yInt, z1, std::numeric_limits<float>::quiet_NaN()); |
| 496 | z = select(xNegative, z1, z); |
| 497 | } |
| 498 | |
| 499 | auto xFinite = isfinite(x); |
| 500 | auto yFinite = isfinite(y); |
| 501 | if (all(xFinite & yFinite)) |
| 502 | return z; |
| 503 | |
| 504 | // x finite and y infinite |
| 505 | z = select(andn(xFinite, yFinite), |
| 506 | select(x1 == 1.0f, 1.0f, |
| 507 | select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z); |
| 508 | |
| 509 | // x infinite |
| 510 | z = select(xFinite, z, |
| 511 | select(y == 0.0f, 1.0f, |
| 512 | select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x))); |
| 513 | |
| 514 | return z; |
| 515 | } |
| 516 | |
| 517 | template <typename T> |
| 518 | __forceinline T pow(const T &x, float y) |
| 519 | { |
| 520 | return pow(x, T(y)); |
| 521 | } |
| 522 | |
| 523 | } // namespace fastapprox |
| 524 | |
| 525 | } // namespace embree |
| 526 | |