| 1 | /* Implement powl for x86 using extra-precision log. |
| 2 | Copyright (C) 2012-2020 Free Software Foundation, Inc. |
| 3 | This file is part of the GNU C Library. |
| 4 | |
| 5 | The GNU C Library is free software; you can redistribute it and/or |
| 6 | modify it under the terms of the GNU Lesser General Public |
| 7 | License as published by the Free Software Foundation; either |
| 8 | version 2.1 of the License, or (at your option) any later version. |
| 9 | |
| 10 | The GNU C Library is distributed in the hope that it will be useful, |
| 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 13 | Lesser General Public License for more details. |
| 14 | |
| 15 | You should have received a copy of the GNU Lesser General Public |
| 16 | License along with the GNU C Library; if not, see |
| 17 | <https://www.gnu.org/licenses/>. */ |
| 18 | |
| 19 | #include <math.h> |
| 20 | #include <math_private.h> |
| 21 | #include <math-underflow.h> |
| 22 | #include <stdbool.h> |
| 23 | |
| 24 | /* High parts and low parts of -log (k/16), for integer k from 12 to |
| 25 | 24. */ |
| 26 | |
| 27 | static const long double powl_log_table[] = |
| 28 | { |
| 29 | 0x4.9a58844d36e49e1p-4L, -0x1.0522624fd558f574p-68L, |
| 30 | 0x3.527da7915b3c6de4p-4L, 0x1.7d4ef4b901b99b9ep-68L, |
| 31 | 0x2.22f1d044fc8f7bc8p-4L, -0x1.8e97c071a42fc388p-68L, |
| 32 | 0x1.08598b59e3a0688ap-4L, 0x3.fd9bf503372c12fcp-72L, |
| 33 | -0x0p+0L, 0x0p+0L, |
| 34 | -0xf.85186008b15330cp-8L, 0x1.9b47488a6687672cp-72L, |
| 35 | -0x1.e27076e2af2e5e9ep-4L, -0xa.87ffe1fe9e155dcp-72L, |
| 36 | -0x2.bfe60e14f27a791p-4L, 0x1.83bebf1bdb88a032p-68L, |
| 37 | -0x3.91fef8f353443584p-4L, -0xb.b03de5ff734495cp-72L, |
| 38 | -0x4.59d72aeae98380e8p-4L, 0xc.e0aa3be4747dc1p-72L, |
| 39 | -0x5.1862f08717b09f4p-4L, -0x2.decdeccf1cd10578p-68L, |
| 40 | -0x5.ce75fdaef401a738p-4L, -0x9.314feb4fbde5aaep-72L, |
| 41 | -0x6.7cc8fb2fe612fcbp-4L, 0x2.5ca2642feb779f98p-68L, |
| 42 | }; |
| 43 | |
| 44 | /* High 32 bits of log2 (e), and remainder rounded to 64 bits. */ |
| 45 | static const long double log2e_hi = 0x1.71547652p+0L; |
| 46 | static const long double log2e_lo = 0xb.82fe1777d0ffda1p-36L; |
| 47 | |
| 48 | /* Given a number with high part HI and low part LO, add the number X |
| 49 | to it and store the result in *RHI and *RLO. It is given that |
| 50 | either |X| < |0.7 * HI|, or HI == LO == 0, and that the values are |
| 51 | small enough that no overflow occurs. The result does not need to |
| 52 | be exact to 128 bits; 78-bit accuracy of the final accumulated |
| 53 | result suffices. */ |
| 54 | |
| 55 | static inline void |
| 56 | acc_split (long double *rhi, long double *rlo, long double hi, long double lo, |
| 57 | long double x) |
| 58 | { |
| 59 | long double thi = hi + x; |
| 60 | long double tlo = (hi - thi) + x + lo; |
| 61 | *rhi = thi + tlo; |
| 62 | *rlo = (thi - *rhi) + tlo; |
| 63 | } |
| 64 | |
| 65 | extern long double __powl_helper (long double x, long double y); |
| 66 | libm_hidden_proto (__powl_helper) |
| 67 | |
| 68 | /* Given X a value that is finite and nonzero, or a NaN, and Y a |
| 69 | finite nonzero value with 0x1p-79 <= |Y| <= 0x1p78, compute X to |
| 70 | the power Y. */ |
| 71 | |
| 72 | long double |
| 73 | __powl_helper (long double x, long double y) |
| 74 | { |
| 75 | if (isnan (x)) |
| 76 | return __ieee754_expl (y * __ieee754_logl (x)); |
| 77 | bool negate; |
| 78 | if (x < 0) |
| 79 | { |
| 80 | long double absy = fabsl (y); |
| 81 | if (absy >= 0x1p64L) |
| 82 | negate = false; |
| 83 | else |
| 84 | { |
| 85 | unsigned long long yll = absy; |
| 86 | if (yll != absy) |
| 87 | return __ieee754_expl (y * __ieee754_logl (x)); |
| 88 | negate = (yll & 1) != 0; |
| 89 | } |
| 90 | x = fabsl (x); |
| 91 | } |
| 92 | else |
| 93 | negate = false; |
| 94 | |
| 95 | /* We need to compute Y * log2 (X) to at least 64 bits after the |
| 96 | point for normal results (that is, to at least 78 bits |
| 97 | precision). */ |
| 98 | int x_int_exponent; |
| 99 | long double x_frac; |
| 100 | x_frac = __frexpl (x, &x_int_exponent); |
| 101 | if (x_frac <= 0x0.aaaaaaaaaaaaaaaap0L) /* 2.0L / 3.0L, rounded down */ |
| 102 | { |
| 103 | x_frac *= 2.0; |
| 104 | x_int_exponent--; |
| 105 | } |
| 106 | |
| 107 | long double log_x_frac_hi, log_x_frac_lo; |
| 108 | /* Determine an initial approximation to log (X_FRAC) using |
| 109 | POWL_LOG_TABLE, and multiply by a value K/16 to reduce to an |
| 110 | interval (24/25, 26/25). */ |
| 111 | int k = (int) ((16.0L / x_frac) + 0.5L); |
| 112 | log_x_frac_hi = powl_log_table[2 * k - 24]; |
| 113 | log_x_frac_lo = powl_log_table[2 * k - 23]; |
| 114 | long double x_frac_low; |
| 115 | if (k == 16) |
| 116 | x_frac_low = 0.0L; |
| 117 | else |
| 118 | { |
| 119 | /* Mask off low 5 bits of X_FRAC so the multiplication by K/16 |
| 120 | is exact. These bits are small enough that they can be |
| 121 | corrected for by adding log2 (e) * X_FRAC_LOW to the final |
| 122 | result. */ |
| 123 | int32_t se; |
| 124 | uint32_t i0, i1; |
| 125 | GET_LDOUBLE_WORDS (se, i0, i1, x_frac); |
| 126 | x_frac_low = x_frac; |
| 127 | i1 &= 0xffffffe0; |
| 128 | SET_LDOUBLE_WORDS (x_frac, se, i0, i1); |
| 129 | x_frac_low -= x_frac; |
| 130 | x_frac_low /= x_frac; |
| 131 | x_frac *= k / 16.0L; |
| 132 | } |
| 133 | |
| 134 | /* Now compute log (X_FRAC) for X_FRAC in (24/25, 26/25). Separate |
| 135 | W = X_FRAC - 1 into high 16 bits and remaining bits, so that |
| 136 | multiplications for low-order power series terms are exact. The |
| 137 | remaining bits are small enough that adding a 64-bit value of |
| 138 | log2 (1 + W_LO / (1 + W_HI)) will be a sufficient correction for |
| 139 | them. */ |
| 140 | long double w = x_frac - 1; |
| 141 | long double w_hi, w_lo; |
| 142 | int32_t se; |
| 143 | uint32_t i0, i1; |
| 144 | GET_LDOUBLE_WORDS (se, i0, i1, w); |
| 145 | i0 &= 0xffff0000; |
| 146 | i1 = 0; |
| 147 | SET_LDOUBLE_WORDS (w_hi, se, i0, i1); |
| 148 | w_lo = w - w_hi; |
| 149 | long double wp = w_hi; |
| 150 | acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, wp); |
| 151 | wp *= -w_hi; |
| 152 | acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, |
| 153 | wp / 2.0L); |
| 154 | wp *= -w_hi; |
| 155 | acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, |
| 156 | wp * 0x0.5555p0L); /* -W_HI**3 / 3, high part. */ |
| 157 | acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, |
| 158 | wp * 0x0.5555555555555555p-16L); /* -W_HI**3 / 3, low part. */ |
| 159 | wp *= -w_hi; |
| 160 | acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, |
| 161 | wp / 4.0L); |
| 162 | /* Subsequent terms are small enough that they only need be computed |
| 163 | to 64 bits. */ |
| 164 | for (int i = 5; i <= 17; i++) |
| 165 | { |
| 166 | wp *= -w_hi; |
| 167 | acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, |
| 168 | wp / i); |
| 169 | } |
| 170 | |
| 171 | /* Convert LOG_X_FRAC_HI + LOG_X_FRAC_LO to a base-2 logarithm. */ |
| 172 | long double log2_x_frac_hi, log2_x_frac_lo; |
| 173 | long double log_x_frac_hi32, log_x_frac_lo64; |
| 174 | GET_LDOUBLE_WORDS (se, i0, i1, log_x_frac_hi); |
| 175 | i1 = 0; |
| 176 | SET_LDOUBLE_WORDS (log_x_frac_hi32, se, i0, i1); |
| 177 | log_x_frac_lo64 = (log_x_frac_hi - log_x_frac_hi32) + log_x_frac_lo; |
| 178 | long double log2_x_frac_hi1 = log_x_frac_hi32 * log2e_hi; |
| 179 | long double log2_x_frac_lo1 |
| 180 | = log_x_frac_lo64 * log2e_hi + log_x_frac_hi * log2e_lo; |
| 181 | log2_x_frac_hi = log2_x_frac_hi1 + log2_x_frac_lo1; |
| 182 | log2_x_frac_lo = (log2_x_frac_hi1 - log2_x_frac_hi) + log2_x_frac_lo1; |
| 183 | |
| 184 | /* Correct for the masking off of W_LO. */ |
| 185 | long double log2_1p_w_lo; |
| 186 | asm ("fyl2xp1" |
| 187 | : "=t" (log2_1p_w_lo) |
| 188 | : "0" (w_lo / (1.0L + w_hi)), "u" (1.0L) |
| 189 | : "st(1)" ); |
| 190 | acc_split (&log2_x_frac_hi, &log2_x_frac_lo, log2_x_frac_hi, log2_x_frac_lo, |
| 191 | log2_1p_w_lo); |
| 192 | |
| 193 | /* Correct for the masking off of X_FRAC_LOW. */ |
| 194 | acc_split (&log2_x_frac_hi, &log2_x_frac_lo, log2_x_frac_hi, log2_x_frac_lo, |
| 195 | x_frac_low * M_LOG2El); |
| 196 | |
| 197 | /* Add the integer and fractional parts of the base-2 logarithm. */ |
| 198 | long double log2_x_hi, log2_x_lo; |
| 199 | log2_x_hi = x_int_exponent + log2_x_frac_hi; |
| 200 | log2_x_lo = ((x_int_exponent - log2_x_hi) + log2_x_frac_hi) + log2_x_frac_lo; |
| 201 | |
| 202 | /* Compute the base-2 logarithm of the result. */ |
| 203 | long double log2_res_hi, log2_res_lo; |
| 204 | long double log2_x_hi32, log2_x_lo64; |
| 205 | GET_LDOUBLE_WORDS (se, i0, i1, log2_x_hi); |
| 206 | i1 = 0; |
| 207 | SET_LDOUBLE_WORDS (log2_x_hi32, se, i0, i1); |
| 208 | log2_x_lo64 = (log2_x_hi - log2_x_hi32) + log2_x_lo; |
| 209 | long double y_hi32, y_lo32; |
| 210 | GET_LDOUBLE_WORDS (se, i0, i1, y); |
| 211 | i1 = 0; |
| 212 | SET_LDOUBLE_WORDS (y_hi32, se, i0, i1); |
| 213 | y_lo32 = y - y_hi32; |
| 214 | log2_res_hi = log2_x_hi32 * y_hi32; |
| 215 | log2_res_lo = log2_x_hi32 * y_lo32 + log2_x_lo64 * y; |
| 216 | |
| 217 | /* Split the base-2 logarithm of the result into integer and |
| 218 | fractional parts. */ |
| 219 | long double log2_res_int = roundl (log2_res_hi); |
| 220 | long double log2_res_frac = log2_res_hi - log2_res_int + log2_res_lo; |
| 221 | /* If the integer part is very large, the computed fractional part |
| 222 | may be outside the valid range for f2xm1. */ |
| 223 | if (fabsl (log2_res_int) > 16500) |
| 224 | log2_res_frac = 0; |
| 225 | |
| 226 | /* Compute the final result. */ |
| 227 | long double res; |
| 228 | asm ("f2xm1" : "=t" (res) : "0" (log2_res_frac)); |
| 229 | res += 1.0L; |
| 230 | if (negate) |
| 231 | res = -res; |
| 232 | asm ("fscale" : "=t" (res) : "0" (res), "u" (log2_res_int)); |
| 233 | math_check_force_underflow (res); |
| 234 | return res; |
| 235 | } |
| 236 | |
| 237 | libm_hidden_def (__powl_helper) |
| 238 | |