// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited
// Copyright 2008 Jose Fonseca
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/*
 * This module implements vector support for floats, ints, and vector lane
 * control masks. It provides access to both explicit vector width types, and
 * flexible N-wide types where N can be determined at compile time.
 *
 * The design of this module encourages use of vector length agnostic code, via
 * the vint, vfloat, and vmask types. These will take on the widest SIMD vector
 * width that is available at compile time. The current vector width is
 * accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
 *
 * Explicit scalar types are accessible via the vint1, vfloat1, and vmask1
 * types. These are provided primarily for prototyping and algorithm debugging
 * of VLA implementations.
 *
 * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
 * types. These are provided for use by VLA code, but are also expected to be
 * used as a fixed-width type, and are supported by a reference C++ fallback
 * for use on platforms without SIMD intrinsics.
 *
 * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
 * types. These are provided for use by VLA code, and are not expected to be
 * used as a fixed-width type in normal code. No reference C++ implementation
 * is provided on platforms without underlying SIMD intrinsics.
 *
 * With the current implementation ISA support is provided for:
 *
 *     * 1-wide for scalar reference.
 *     * 4-wide for Armv8-A NEON.
 *     * 4-wide for x86-64 SSE2.
 *     * 4-wide for x86-64 SSE4.1.
 *     * 8-wide for x86-64 AVX2.
 */

#ifndef ASTC_VECMATHLIB_H_INCLUDED
#define ASTC_VECMATHLIB_H_INCLUDED

#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
	#include <immintrin.h>
#elif ASTCENC_NEON != 0
	#include <arm_neon.h>
#endif

#if !defined(__clang__) && defined(_MSC_VER)
	#define ASTCENC_SIMD_INLINE __forceinline
	#define ASTCENC_NO_INLINE
#elif defined(__GNUC__) && !defined(__clang__)
	#define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline
	#define ASTCENC_NO_INLINE __attribute__ ((noinline))
#else
	#define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline
	#define ASTCENC_NO_INLINE __attribute__ ((noinline))
#endif

#if ASTCENC_AVX >= 2
	/* If we have AVX2, expose 8-wide VLA. */
	#include "astcenc_vecmathlib_sse_4.h"
	#include "astcenc_vecmathlib_common_4.h"
	#include "astcenc_vecmathlib_avx2_8.h"

	#define ASTCENC_SIMD_WIDTH 8

	using vfloat = vfloat8;

	#if defined(ASTCENC_NO_INVARIANCE)
		using vfloatacc = vfloat8;
	#else
		using vfloatacc = vfloat4;
	#endif

	using vint = vint8;
	using vmask = vmask8;

	constexpr auto loada = vfloat8::loada;
	constexpr auto load1 = vfloat8::load1;

#elif ASTCENC_SSE >= 20
	/* If we have SSE, expose 4-wide VLA and 4-wide fixed width. */
	#include "astcenc_vecmathlib_sse_4.h"
	#include "astcenc_vecmathlib_common_4.h"

	#define ASTCENC_SIMD_WIDTH 4

	using vfloat = vfloat4;
	using vfloatacc = vfloat4;
	using vint = vint4;
	using vmask = vmask4;

	constexpr auto loada = vfloat4::loada;
	constexpr auto load1 = vfloat4::load1;

#elif ASTCENC_NEON > 0
	/* If we have NEON, expose 4-wide VLA. */
	#include "astcenc_vecmathlib_neon_4.h"
	#include "astcenc_vecmathlib_common_4.h"

	#define ASTCENC_SIMD_WIDTH 4

	using vfloat = vfloat4;
	using vfloatacc = vfloat4;
	using vint = vint4;
	using vmask = vmask4;

	constexpr auto loada = vfloat4::loada;
	constexpr auto load1 = vfloat4::load1;

#else
	// If we have nothing, expose 4-wide VLA and 4-wide fixed width.

	// Note: We no longer expose the 1-wide scalar fallback because it is not
	// invariant with the 4-wide path due to algorithms that use horizontal
	// operations that accumulate a local vector sum before accumulating into
	// a running sum.
	//
	// For 4 items adding into an accumulator using 1-wide vectors, the sum is:
	//
	//     result = ((((sum + l0) + l1) + l2) + l3)
	//
	// ... whereas the accumulator for a 4-wide vector sum is:
	//
	//     result = sum + ((l0 + l2) + (l1 + l3))
	//
	// In "normal maths" this is the same, but the floating point reassociation
	// differences mean that these will not produce the same result.

	#include "astcenc_vecmathlib_none_4.h"
	#include "astcenc_vecmathlib_common_4.h"

	#define ASTCENC_SIMD_WIDTH 4

	using vfloat = vfloat4;
	using vfloatacc = vfloat4;
	using vint = vint4;
	using vmask = vmask4;

	constexpr auto loada = vfloat4::loada;
	constexpr auto load1 = vfloat4::load1;
#endif

/**
 * @brief Round a count down to the largest multiple of 8.
 *
 * @param count   The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int count)
{
	return count & static_cast<unsigned int>(~(8 - 1));
}

/**
 * @brief Round a count down to the largest multiple of 4.
 *
 * @param count   The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int count)
{
	return count & static_cast<unsigned int>(~(4 - 1));
}

/**
 * @brief Round a count down to the largest multiple of the SIMD width.
 *
 * Assumes that the vector width is a power of two.
 *
 * @param count   The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int count)
{
	return count & static_cast<unsigned int>(~(ASTCENC_SIMD_WIDTH - 1));
}

/**
 * @brief Round a count up to the smallest multiple of the SIMD width.
 *
 * Assumes that the vector width is a power of two.
 *
 * @param count   The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE unsigned int round_up_to_simd_multiple_vla(unsigned int count)
{
	unsigned int multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
	return multiples * ASTCENC_SIMD_WIDTH;
}
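
// A minimal sketch of how the VLA types and the rounding helpers above are
// intended to compose. It assumes the hadd_s() horizontal add and the
// vfloat::zero() factory provided by the per-width vector headers, and a
// SIMD-aligned data pointer (loada() is an aligned load); the function name
// sum_array is purely illustrative:
//
//     float sum_array(const float* data, unsigned int count)
//     {
//         // Sum whole vectors using the compile-time SIMD width
//         vfloat vsum = vfloat::zero();
//         unsigned int clipped = round_down_to_simd_multiple_vla(count);
//         for (unsigned int i = 0; i < clipped; i += ASTCENC_SIMD_WIDTH)
//         {
//             vsum = vsum + loada(data + i);
//         }
//
//         // Fold the vector accumulator, then add the scalar tail
//         float sum = hadd_s(vsum);
//         for (unsigned int i = clipped; i < count; i++)
//         {
//             sum += data[i];
//         }
//
//         return sum;
//     }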

/**
 * @brief Return @c a with lanes negated if the @c b lane is negative.
 */
ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
{
	vint ia = float_as_int(a);
	vint ib = float_as_int(b);
	vint sign_mask(static_cast<int>(0x80000000));
	vint r = ia ^ (ib & sign_mask);
	return int_as_float(r);
}

/**
 * @brief Return fast, but approximate, vector atan(x).
 *
 * Max error of this implementation is 0.004883.
 */
ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
{
	vmask c = abs(x) > vfloat(1.0f);
	vfloat z = change_sign(vfloat(astc::PI_OVER_TWO), x);
	vfloat y = select(x, vfloat(1.0f) / x, c);
	y = y / (y * y * vfloat(0.28f) + vfloat(1.0f));
	return select(y, z - y, c);
}

/**
 * @brief Return fast, but approximate, vector atan2(y, x).
 */
ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
{
	vfloat z = atan(abs(y / x));
	vmask xmask = vmask(float_as_int(x).m);
	return change_sign(select_msb(z, vfloat(astc::PI) - z, xmask), y);
}

/**
 * @brief Factory that returns a unit length 4 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit4()
{
	return vfloat4(0.5f);
}

/**
 * @brief Factory that returns a unit length 3 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit3()
{
	float val = 0.577350258827209473f;
	return vfloat4(val, val, val, 0.0f);
}

/**
 * @brief Factory that returns a unit length 2 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit2()
{
	float val = 0.707106769084930420f;
	return vfloat4(val, val, 0.0f, 0.0f);
}

/**
 * @brief Factory that returns a 3 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
{
	return vfloat4(a, b, c, 0.0f);
}

/**
 * @brief Factory that returns a 2 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
{
	return vfloat4(a, b, 0.0f, 0.0f);
}

/**
 * @brief Normalize a non-zero length vector to unit length.
 */
static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
{
	vfloat4 length = dot(a, a);
	return a / sqrt(length);
}

/**
 * @brief Normalize a vector, returning @c safe if the length is zero.
 */
static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
{
	vfloat4 length = dot(a, a);
	if (length.lane<0>() != 0.0f)
	{
		return a / sqrt(length);
	}

	return safe;
}

#define POLY0(x, c0)                     (                                 c0)
#define POLY1(x, c0, c1)                 ((POLY0(x, c1) * x)             + c0)
#define POLY2(x, c0, c1, c2)             ((POLY1(x, c1, c2) * x)         + c0)
#define POLY3(x, c0, c1, c2, c3)         ((POLY2(x, c1, c2, c3) * x)     + c0)
#define POLY4(x, c0, c1, c2, c3, c4)     ((POLY3(x, c1, c2, c3, c4) * x) + c0)
#define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4(x, c1, c2, c3, c4, c5) * x) + c0)
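
// These macros expand to a Horner-form polynomial evaluation, which nests the
// multiply-adds to minimize the operation count. As a worked example, POLY2
// expands as:
//
//     POLY2(x, c0, c1, c2) = ((c2 * x + c1) * x) + c0
//                          = c2 * x^2 + c1 * x + c0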

/**
 * @brief Compute an approximate exp2(x) for each lane in the vector.
 *
 * Based on 5th degree minimax polynomials, ported from this blog
 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 */
static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
{
	x = clamp(-126.99999f, 129.0f, x);

	vint4 ipart = float_to_int(x - 0.5f);
	vfloat4 fpart = x - int_to_float(ipart);

	// Integer contrib, using 1 << ipart
	vfloat4 iexp = int_as_float(lsl<23>(ipart + 127));

	// Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5)
	vfloat4 fexp = POLY5(fpart,
	                     9.9999994e-1f,
	                     6.9315308e-1f,
	                     2.4015361e-1f,
	                     5.5826318e-2f,
	                     8.9893397e-3f,
	                     1.8775767e-3f);

	return iexp * fexp;
}
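
// As a worked example of the exp2() decomposition, x = 3.7 splits into
// ipart = 3 and fpart = 0.7. The integer contribution is built directly in
// the float exponent bits, giving iexp = 2^3 = 8.0f, and the polynomial
// gives fexp ~= 2^0.7 ~= 1.6245, so the result is ~12.996 against a true
// exp2(3.7) of ~12.9960.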

/**
 * @brief Compute an approximate log2(x) for each lane in the vector.
 *
 * Based on 5th degree minimax polynomials, ported from this blog
 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 */
static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
{
	vint4 exp(0x7F800000);
	vint4 mant(0x007FFFFF);
	vint4 one(0x3F800000);

	vint4 i = float_as_int(x);

	vfloat4 e = int_to_float(lsr<23>(i & exp) - 127);

	vfloat4 m = int_as_float((i & mant) | one);

	// Polynomial fit of log2(x)/(x - 1), for x in range [1, 2)
	vfloat4 p = POLY4(m,
	                  2.8882704548164776201f,
	                  -2.52074962577807006663f,
	                  1.48116647521213171641f,
	                  -0.465725644288844778798f,
	                  0.0596515482674574969533f);

	// Increases the polynomial degree, but ensures that log2(1) == 0
	p = p * (m - 1.0f);

	return p + e;
}
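
// As a worked example, x = 12.0f decomposes into e = 3.0f and m = 1.5f
// (12 = 1.5 * 2^3). The polynomial approximates log2(m) / (m - 1), so after
// the (m - 1.0f) multiply it yields ~0.585, giving ~3.585 against a true
// log2(12) of ~3.58496.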

/**
 * @brief Compute an approximate pow(x, y) for each lane in the vector.
 *
 * Power function based on the exp2(log2(x) * y) transform.
 */
static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
{
	vmask4 zero_mask = y == vfloat4(0.0f);
	vfloat4 estimate = exp2(log2(x) * y);

	// Guarantee that y == 0 returns exactly 1.0f
	return select(estimate, vfloat4(1.0f), zero_mask);
}

/**
 * @brief Count the leading zeros for each lane in @c a.
 *
 * Valid for all data values of @c a; will return a per-lane value [0, 32].
 */
static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
{
	// This function is a horrible abuse of floating point exponents to convert
	// the original integer value into a 2^N encoding we can recover easily.

	// Convert to float without risk of rounding up by keeping only top 8 bits.
	// This trick is guaranteed to keep the top 8 bits and clear the 9th.
	a = (~lsr<8>(a)) & a;
	a = float_as_int(int_to_float(a));

	// Extract and unbias exponent
	a = vint4(127 + 31) - lsr<23>(a);

	// Clamp result to a valid 32-bit range
	return clamp(0, 32, a);
}
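
// As a worked example of the clz() trick, a = 0x00010000 passes the masking
// step unchanged, converts to 65536.0f with a biased exponent of
// 16 + 127 = 143, and so returns (127 + 31) - 143 = 15, the leading zero
// count for bit 16.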

/**
 * @brief Return lanewise 2^a for each lane in @c a.
 *
 * Use of signed int means that this is only valid for values in range [0, 30].
 */
static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
{
	// 2^30 is the largest signed power of two that can be represented
	assert(all(a < vint4(31)));

	// This function is a horrible abuse of floating point to use the exponent
	// and float conversion to generate a 2^N multiple.

	// Bias the exponent
	vint4 exp = a + 127;
	exp = lsl<23>(exp);

	// Reinterpret the bits as a float, and then convert to an int
	vfloat4 f = int_as_float(exp);
	return float_to_int(f);
}
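
// As a worked example of the two_to_the_n() trick, a = 5 biases to
// exp = 132, and 132 << 23 is 0x42000000, which reinterprets as the float
// 32.0f and converts back to the integer 32 == 2^5.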

/**
 * @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
 */
static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
{
	vint4 fp16_one = vint4(0x3C00);
	vint4 fp16_small = lsl<8>(p);

	vmask4 is_one = p == vint4(0xFFFF);
	vmask4 is_small = p < vint4(4);

	// Manually inline clz() on Visual Studio to avoid release build codegen bug
	// see https://github.com/ARM-software/astc-encoder/issues/259
#if !defined(__clang__) && defined(_MSC_VER)
	vint4 a = (~lsr<8>(p)) & p;
	a = float_as_int(int_to_float(a));
	a = vint4(127 + 31) - lsr<23>(a);
	vint4 lz = clamp(0, 32, a) - 16;
#else
	vint4 lz = clz(p) - 16;
#endif

	p = p * two_to_the_n(lz + 1);
	p = p & vint4(0xFFFF);

	p = lsr<6>(p);

	p = p | lsl<10>(vint4(14) - lz);

	vint4 r = select(p, fp16_one, is_one);
	r = select(r, fp16_small, is_small);
	return r;
}

/**
 * @brief Convert 16-bit LNS to float16.
 */
static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
{
	vint4 mc = p & 0x7FF;
	vint4 ec = lsr<11>(p);

	vint4 mc_512 = mc * 3;
	vmask4 mask_512 = mc < vint4(512);

	vint4 mc_1536 = mc * 4 - 512;
	vmask4 mask_1536 = mc < vint4(1536);

	vint4 mc_else = mc * 5 - 2048;

	vint4 mt = mc_else;
	mt = select(mt, mc_1536, mask_1536);
	mt = select(mt, mc_512, mask_512);

	vint4 res = lsl<10>(ec) | lsr<3>(mt);
	return min(res, vint4(0x7BFF));
}

/**
 * @brief Extract mantissa and exponent of a float value.
 *
 * @param      a     The input value.
 * @param[out] exp   The output exponent.
 *
 * @return The mantissa.
 */
static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
{
	// Interpret the bits as an integer
	vint4 ai = float_as_int(a);

	// Extract and unbias the exponent
	exp = (lsr<23>(ai) & 0xFF) - 126;

	// Extract and unbias the mantissa
	vint4 manti = (ai & static_cast<int>(0x807FFFFF)) | 0x3F000000;
	return int_as_float(manti);
}
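
// Note: for positive, normal inputs this follows the std::frexp convention,
// returning a mantissa in [0.5, 1) with a == mantissa * 2^exp. As a worked
// example, frexp() of a = 12.0f returns a mantissa of 0.75f and sets exp to
// 4, and 0.75 * 2^4 == 12.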

/**
 * @brief Convert float to 16-bit LNS.
 */
static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
{
	vint4 exp;
	vfloat4 mant = frexp(a, exp);

	// Do these early before we start messing about ...
	vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f));
	vmask4 mask_infinity = a >= vfloat4(65536.0f);

	// If input is smaller than 2^-14, multiply by 2^25 and don't bias.
	vmask4 exp_lt_m13 = exp < vint4(-13);

	vfloat4 a1a = a * 33554432.0f;
	vint4 expa = vint4::zero();

	vfloat4 a1b = (mant - 0.5f) * 4096;
	vint4 expb = exp + 14;

	a = select(a1b, a1a, exp_lt_m13);
	exp = select(expb, expa, exp_lt_m13);

	vmask4 a_lt_384 = a < vfloat4(384.0f);
	vmask4 a_lt_1408 = a <= vfloat4(1408.0f);

	vfloat4 a2a = a * (4.0f / 3.0f);
	vfloat4 a2b = a + 128.0f;
	vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f);

	a = a2c;
	a = select(a, a2b, a_lt_1408);
	a = select(a, a2a, a_lt_384);

	a = a + (int_to_float(exp) * 2048.0f) + 1.0f;

	a = select(a, vfloat4(65535.0f), mask_infinity);
	a = select(a, vfloat4::zero(), mask_underflow_nan);

	return a;
}

namespace astc
{

static ASTCENC_SIMD_INLINE float pow(float x, float y)
{
	return pow(vfloat4(x), vfloat4(y)).lane<0>();
}

}

#endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED