| 1 | /* |
| 2 | * IBM Accurate Mathematical Library |
| 3 | * written by International Business Machines Corp. |
| 4 | * Copyright (C) 2001-2020 Free Software Foundation, Inc. |
| 5 | * |
| 6 | * This program is free software; you can redistribute it and/or modify |
| 7 | * it under the terms of the GNU Lesser General Public License as published by |
| 8 | * the Free Software Foundation; either version 2.1 of the License, or |
| 9 | * (at your option) any later version. |
| 10 | * |
| 11 | * This program is distributed in the hope that it will be useful, |
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 14 | * GNU Lesser General Public License for more details. |
| 15 | * |
| 16 | * You should have received a copy of the GNU Lesser General Public License |
| 17 | * along with this program; if not, see <https://www.gnu.org/licenses/>. |
| 18 | */ |
| 19 | /************************************************************************/ |
| 20 | /* MODULE_NAME: mpa.c */ |
| 21 | /* */ |
| 22 | /* FUNCTIONS: */ |
| 23 | /* mcr */ |
| 24 | /* acr */ |
| 25 | /* cpy */ |
| 26 | /* norm */ |
| 27 | /* denorm */ |
| 28 | /* mp_dbl */ |
| 29 | /* dbl_mp */ |
| 30 | /* add_magnitudes */ |
| 31 | /* sub_magnitudes */ |
| 32 | /* add */ |
| 33 | /* sub */ |
| 34 | /* mul */ |
| 35 | /* inv */ |
| 36 | /* dvd */ |
| 37 | /* */ |
| 38 | /* Arithmetic functions for multiple precision numbers. */ |
| 39 | /* Relative errors are bounded */ |
| 40 | /************************************************************************/ |
| 41 | |
| 42 | |
| 43 | #include "endian.h" |
| 44 | #include "mpa.h" |
| 45 | #include <sys/param.h> |
| 46 | #include <alloca.h> |
| 47 | |
| 48 | #ifndef SECTION |
| 49 | # define SECTION |
| 50 | #endif |
| 51 | |
| 52 | #ifndef NO__CONST |
| 53 | const mp_no __mpone = { 1, { 1.0, 1.0 } }; |
| 54 | const mp_no __mptwo = { 1, { 1.0, 2.0 } }; |
| 55 | #endif |
| 56 | |
| 57 | #ifndef NO___ACR |
| 58 | /* Compare mantissa of two multiple precision numbers regardless of the sign |
| 59 | and exponent of the numbers. */ |
| 60 | static int |
| 61 | mcr (const mp_no *x, const mp_no *y, int p) |
| 62 | { |
| 63 | long i; |
| 64 | long p2 = p; |
| 65 | for (i = 1; i <= p2; i++) |
| 66 | { |
| 67 | if (X[i] == Y[i]) |
| 68 | continue; |
| 69 | else if (X[i] > Y[i]) |
| 70 | return 1; |
| 71 | else |
| 72 | return -1; |
| 73 | } |
| 74 | return 0; |
| 75 | } |
| 76 | |
| 77 | /* Compare the absolute values of two multiple precision numbers. */ |
| 78 | int |
| 79 | __acr (const mp_no *x, const mp_no *y, int p) |
| 80 | { |
| 81 | long i; |
| 82 | |
| 83 | if (X[0] == 0) |
| 84 | { |
| 85 | if (Y[0] == 0) |
| 86 | i = 0; |
| 87 | else |
| 88 | i = -1; |
| 89 | } |
| 90 | else if (Y[0] == 0) |
| 91 | i = 1; |
| 92 | else |
| 93 | { |
| 94 | if (EX > EY) |
| 95 | i = 1; |
| 96 | else if (EX < EY) |
| 97 | i = -1; |
| 98 | else |
| 99 | i = mcr (x, y, p); |
| 100 | } |
| 101 | |
| 102 | return i; |
| 103 | } |
| 104 | #endif |
| 105 | |
| 106 | #ifndef NO___CPY |
| 107 | /* Copy multiple precision number X into Y. They could be the same |
| 108 | number. */ |
| 109 | void |
| 110 | __cpy (const mp_no *x, mp_no *y, int p) |
| 111 | { |
| 112 | long i; |
| 113 | |
| 114 | EY = EX; |
| 115 | for (i = 0; i <= p; i++) |
| 116 | Y[i] = X[i]; |
| 117 | } |
| 118 | #endif |
| 119 | |
| 120 | #ifndef NO___MP_DBL |
| 121 | /* Convert a multiple precision number *X into a double precision |
| 122 | number *Y, normalized case (|x| >= 2**(-1022))). X has precision |
| 123 | P, which is positive. */ |
| 124 | static void |
| 125 | norm (const mp_no *x, double *y, int p) |
| 126 | { |
| 127 | # define R RADIXI |
| 128 | long i; |
| 129 | double c; |
| 130 | mantissa_t a, u, v, z[5]; |
| 131 | if (p < 5) |
| 132 | { |
| 133 | if (p == 1) |
| 134 | c = X[1]; |
| 135 | else if (p == 2) |
| 136 | c = X[1] + R * X[2]; |
| 137 | else if (p == 3) |
| 138 | c = X[1] + R * (X[2] + R * X[3]); |
| 139 | else /* p == 4. */ |
| 140 | c = (X[1] + R * X[2]) + R * R * (X[3] + R * X[4]); |
| 141 | } |
| 142 | else |
| 143 | { |
| 144 | for (a = 1, z[1] = X[1]; z[1] < TWO23; ) |
| 145 | { |
| 146 | a *= 2; |
| 147 | z[1] *= 2; |
| 148 | } |
| 149 | |
| 150 | for (i = 2; i < 5; i++) |
| 151 | { |
| 152 | mantissa_store_t d, r; |
| 153 | d = X[i] * (mantissa_store_t) a; |
| 154 | DIV_RADIX (d, r); |
| 155 | z[i] = r; |
| 156 | z[i - 1] += d; |
| 157 | } |
| 158 | |
| 159 | u = ALIGN_DOWN_TO (z[3], TWO19); |
| 160 | v = z[3] - u; |
| 161 | |
| 162 | if (v == TWO18) |
| 163 | { |
| 164 | if (z[4] == 0) |
| 165 | { |
| 166 | for (i = 5; i <= p; i++) |
| 167 | { |
| 168 | if (X[i] == 0) |
| 169 | continue; |
| 170 | else |
| 171 | { |
| 172 | z[3] += 1; |
| 173 | break; |
| 174 | } |
| 175 | } |
| 176 | } |
| 177 | else |
| 178 | z[3] += 1; |
| 179 | } |
| 180 | |
| 181 | c = (z[1] + R * (z[2] + R * z[3])) / a; |
| 182 | } |
| 183 | |
| 184 | c *= X[0]; |
| 185 | |
| 186 | for (i = 1; i < EX; i++) |
| 187 | c *= RADIX; |
| 188 | for (i = 1; i > EX; i--) |
| 189 | c *= RADIXI; |
| 190 | |
| 191 | *y = c; |
| 192 | # undef R |
| 193 | } |
| 194 | |
| 195 | /* Convert a multiple precision number *X into a double precision |
| 196 | number *Y, Denormal case (|x| < 2**(-1022))). */ |
| 197 | static void |
| 198 | denorm (const mp_no *x, double *y, int p) |
| 199 | { |
| 200 | long i, k; |
| 201 | long p2 = p; |
| 202 | double c; |
| 203 | mantissa_t u, z[5]; |
| 204 | |
| 205 | # define R RADIXI |
| 206 | if (EX < -44 || (EX == -44 && X[1] < TWO5)) |
| 207 | { |
| 208 | *y = 0; |
| 209 | return; |
| 210 | } |
| 211 | |
| 212 | if (p2 == 1) |
| 213 | { |
| 214 | if (EX == -42) |
| 215 | { |
| 216 | z[1] = X[1] + TWO10; |
| 217 | z[2] = 0; |
| 218 | z[3] = 0; |
| 219 | k = 3; |
| 220 | } |
| 221 | else if (EX == -43) |
| 222 | { |
| 223 | z[1] = TWO10; |
| 224 | z[2] = X[1]; |
| 225 | z[3] = 0; |
| 226 | k = 2; |
| 227 | } |
| 228 | else |
| 229 | { |
| 230 | z[1] = TWO10; |
| 231 | z[2] = 0; |
| 232 | z[3] = X[1]; |
| 233 | k = 1; |
| 234 | } |
| 235 | } |
| 236 | else if (p2 == 2) |
| 237 | { |
| 238 | if (EX == -42) |
| 239 | { |
| 240 | z[1] = X[1] + TWO10; |
| 241 | z[2] = X[2]; |
| 242 | z[3] = 0; |
| 243 | k = 3; |
| 244 | } |
| 245 | else if (EX == -43) |
| 246 | { |
| 247 | z[1] = TWO10; |
| 248 | z[2] = X[1]; |
| 249 | z[3] = X[2]; |
| 250 | k = 2; |
| 251 | } |
| 252 | else |
| 253 | { |
| 254 | z[1] = TWO10; |
| 255 | z[2] = 0; |
| 256 | z[3] = X[1]; |
| 257 | k = 1; |
| 258 | } |
| 259 | } |
| 260 | else |
| 261 | { |
| 262 | if (EX == -42) |
| 263 | { |
| 264 | z[1] = X[1] + TWO10; |
| 265 | z[2] = X[2]; |
| 266 | k = 3; |
| 267 | } |
| 268 | else if (EX == -43) |
| 269 | { |
| 270 | z[1] = TWO10; |
| 271 | z[2] = X[1]; |
| 272 | k = 2; |
| 273 | } |
| 274 | else |
| 275 | { |
| 276 | z[1] = TWO10; |
| 277 | z[2] = 0; |
| 278 | k = 1; |
| 279 | } |
| 280 | z[3] = X[k]; |
| 281 | } |
| 282 | |
| 283 | u = ALIGN_DOWN_TO (z[3], TWO5); |
| 284 | |
| 285 | if (u == z[3]) |
| 286 | { |
| 287 | for (i = k + 1; i <= p2; i++) |
| 288 | { |
| 289 | if (X[i] == 0) |
| 290 | continue; |
| 291 | else |
| 292 | { |
| 293 | z[3] += 1; |
| 294 | break; |
| 295 | } |
| 296 | } |
| 297 | } |
| 298 | |
| 299 | c = X[0] * ((z[1] + R * (z[2] + R * z[3])) - TWO10); |
| 300 | |
| 301 | *y = c * TWOM1032; |
| 302 | # undef R |
| 303 | } |
| 304 | |
| 305 | /* Convert multiple precision number *X into double precision number *Y. The |
| 306 | result is correctly rounded to the nearest/even. */ |
| 307 | void |
| 308 | __mp_dbl (const mp_no *x, double *y, int p) |
| 309 | { |
| 310 | if (X[0] == 0) |
| 311 | { |
| 312 | *y = 0; |
| 313 | return; |
| 314 | } |
| 315 | |
| 316 | if (__glibc_likely (EX > -42 || (EX == -42 && X[1] >= TWO10))) |
| 317 | norm (x, y, p); |
| 318 | else |
| 319 | denorm (x, y, p); |
| 320 | } |
| 321 | #endif |
| 322 | |
| 323 | /* Get the multiple precision equivalent of X into *Y. If the precision is too |
| 324 | small, the result is truncated. */ |
| 325 | void |
| 326 | SECTION |
| 327 | __dbl_mp (double x, mp_no *y, int p) |
| 328 | { |
| 329 | long i, n; |
| 330 | long p2 = p; |
| 331 | |
| 332 | /* Sign. */ |
| 333 | if (x == 0) |
| 334 | { |
| 335 | Y[0] = 0; |
| 336 | return; |
| 337 | } |
| 338 | else if (x > 0) |
| 339 | Y[0] = 1; |
| 340 | else |
| 341 | { |
| 342 | Y[0] = -1; |
| 343 | x = -x; |
| 344 | } |
| 345 | |
| 346 | /* Exponent. */ |
| 347 | for (EY = 1; x >= RADIX; EY += 1) |
| 348 | x *= RADIXI; |
| 349 | for (; x < 1; EY -= 1) |
| 350 | x *= RADIX; |
| 351 | |
| 352 | /* Digits. */ |
| 353 | n = MIN (p2, 4); |
| 354 | for (i = 1; i <= n; i++) |
| 355 | { |
| 356 | INTEGER_OF (x, Y[i]); |
| 357 | x *= RADIX; |
| 358 | } |
| 359 | for (; i <= p2; i++) |
| 360 | Y[i] = 0; |
| 361 | } |
| 362 | |
| 363 | /* Add magnitudes of *X and *Y assuming that abs (*X) >= abs (*Y) > 0. The |
| 364 | sign of the sum *Z is not changed. X and Y may overlap but not X and Z or |
| 365 | Y and Z. No guard digit is used. The result equals the exact sum, |
| 366 | truncated. */ |
| 367 | static void |
| 368 | SECTION |
| 369 | add_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p) |
| 370 | { |
| 371 | long i, j, k; |
| 372 | long p2 = p; |
| 373 | mantissa_t zk; |
| 374 | |
| 375 | EZ = EX; |
| 376 | |
| 377 | i = p2; |
| 378 | j = p2 + EY - EX; |
| 379 | k = p2 + 1; |
| 380 | |
| 381 | if (__glibc_unlikely (j < 1)) |
| 382 | { |
| 383 | __cpy (x, z, p); |
| 384 | return; |
| 385 | } |
| 386 | |
| 387 | zk = 0; |
| 388 | |
| 389 | for (; j > 0; i--, j--) |
| 390 | { |
| 391 | zk += X[i] + Y[j]; |
| 392 | if (zk >= RADIX) |
| 393 | { |
| 394 | Z[k--] = zk - RADIX; |
| 395 | zk = 1; |
| 396 | } |
| 397 | else |
| 398 | { |
| 399 | Z[k--] = zk; |
| 400 | zk = 0; |
| 401 | } |
| 402 | } |
| 403 | |
| 404 | for (; i > 0; i--) |
| 405 | { |
| 406 | zk += X[i]; |
| 407 | if (zk >= RADIX) |
| 408 | { |
| 409 | Z[k--] = zk - RADIX; |
| 410 | zk = 1; |
| 411 | } |
| 412 | else |
| 413 | { |
| 414 | Z[k--] = zk; |
| 415 | zk = 0; |
| 416 | } |
| 417 | } |
| 418 | |
| 419 | if (zk == 0) |
| 420 | { |
| 421 | for (i = 1; i <= p2; i++) |
| 422 | Z[i] = Z[i + 1]; |
| 423 | } |
| 424 | else |
| 425 | { |
| 426 | Z[1] = zk; |
| 427 | EZ += 1; |
| 428 | } |
| 429 | } |
| 430 | |
| 431 | /* Subtract the magnitudes of *X and *Y assuming that abs (*x) > abs (*y) > 0. |
| 432 | The sign of the difference *Z is not changed. X and Y may overlap but not X |
| 433 | and Z or Y and Z. One guard digit is used. The error is less than one |
| 434 | ULP. */ |
| 435 | static void |
| 436 | SECTION |
| 437 | sub_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p) |
| 438 | { |
| 439 | long i, j, k; |
| 440 | long p2 = p; |
| 441 | mantissa_t zk; |
| 442 | |
| 443 | EZ = EX; |
| 444 | i = p2; |
| 445 | j = p2 + EY - EX; |
| 446 | k = p2; |
| 447 | |
| 448 | /* Y is too small compared to X, copy X over to the result. */ |
| 449 | if (__glibc_unlikely (j < 1)) |
| 450 | { |
| 451 | __cpy (x, z, p); |
| 452 | return; |
| 453 | } |
| 454 | |
| 455 | /* The relevant least significant digit in Y is non-zero, so we factor it in |
| 456 | to enhance accuracy. */ |
| 457 | if (j < p2 && Y[j + 1] > 0) |
| 458 | { |
| 459 | Z[k + 1] = RADIX - Y[j + 1]; |
| 460 | zk = -1; |
| 461 | } |
| 462 | else |
| 463 | zk = Z[k + 1] = 0; |
| 464 | |
| 465 | /* Subtract and borrow. */ |
| 466 | for (; j > 0; i--, j--) |
| 467 | { |
| 468 | zk += (X[i] - Y[j]); |
| 469 | if (zk < 0) |
| 470 | { |
| 471 | Z[k--] = zk + RADIX; |
| 472 | zk = -1; |
| 473 | } |
| 474 | else |
| 475 | { |
| 476 | Z[k--] = zk; |
| 477 | zk = 0; |
| 478 | } |
| 479 | } |
| 480 | |
| 481 | /* We're done with digits from Y, so it's just digits in X. */ |
| 482 | for (; i > 0; i--) |
| 483 | { |
| 484 | zk += X[i]; |
| 485 | if (zk < 0) |
| 486 | { |
| 487 | Z[k--] = zk + RADIX; |
| 488 | zk = -1; |
| 489 | } |
| 490 | else |
| 491 | { |
| 492 | Z[k--] = zk; |
| 493 | zk = 0; |
| 494 | } |
| 495 | } |
| 496 | |
| 497 | /* Normalize. */ |
| 498 | for (i = 1; Z[i] == 0; i++) |
| 499 | ; |
| 500 | EZ = EZ - i + 1; |
| 501 | for (k = 1; i <= p2 + 1; ) |
| 502 | Z[k++] = Z[i++]; |
| 503 | for (; k <= p2; ) |
| 504 | Z[k++] = 0; |
| 505 | } |
| 506 | |
| 507 | /* Add *X and *Y and store the result in *Z. X and Y may overlap, but not X |
| 508 | and Z or Y and Z. One guard digit is used. The error is less than one |
| 509 | ULP. */ |
| 510 | void |
| 511 | SECTION |
| 512 | __add (const mp_no *x, const mp_no *y, mp_no *z, int p) |
| 513 | { |
| 514 | int n; |
| 515 | |
| 516 | if (X[0] == 0) |
| 517 | { |
| 518 | __cpy (y, z, p); |
| 519 | return; |
| 520 | } |
| 521 | else if (Y[0] == 0) |
| 522 | { |
| 523 | __cpy (x, z, p); |
| 524 | return; |
| 525 | } |
| 526 | |
| 527 | if (X[0] == Y[0]) |
| 528 | { |
| 529 | if (__acr (x, y, p) > 0) |
| 530 | { |
| 531 | add_magnitudes (x, y, z, p); |
| 532 | Z[0] = X[0]; |
| 533 | } |
| 534 | else |
| 535 | { |
| 536 | add_magnitudes (y, x, z, p); |
| 537 | Z[0] = Y[0]; |
| 538 | } |
| 539 | } |
| 540 | else |
| 541 | { |
| 542 | if ((n = __acr (x, y, p)) == 1) |
| 543 | { |
| 544 | sub_magnitudes (x, y, z, p); |
| 545 | Z[0] = X[0]; |
| 546 | } |
| 547 | else if (n == -1) |
| 548 | { |
| 549 | sub_magnitudes (y, x, z, p); |
| 550 | Z[0] = Y[0]; |
| 551 | } |
| 552 | else |
| 553 | Z[0] = 0; |
| 554 | } |
| 555 | } |
| 556 | |
| 557 | /* Subtract *Y from *X and return the result in *Z. X and Y may overlap but |
| 558 | not X and Z or Y and Z. One guard digit is used. The error is less than |
| 559 | one ULP. */ |
| 560 | void |
| 561 | SECTION |
| 562 | __sub (const mp_no *x, const mp_no *y, mp_no *z, int p) |
| 563 | { |
| 564 | int n; |
| 565 | |
| 566 | if (X[0] == 0) |
| 567 | { |
| 568 | __cpy (y, z, p); |
| 569 | Z[0] = -Z[0]; |
| 570 | return; |
| 571 | } |
| 572 | else if (Y[0] == 0) |
| 573 | { |
| 574 | __cpy (x, z, p); |
| 575 | return; |
| 576 | } |
| 577 | |
| 578 | if (X[0] != Y[0]) |
| 579 | { |
| 580 | if (__acr (x, y, p) > 0) |
| 581 | { |
| 582 | add_magnitudes (x, y, z, p); |
| 583 | Z[0] = X[0]; |
| 584 | } |
| 585 | else |
| 586 | { |
| 587 | add_magnitudes (y, x, z, p); |
| 588 | Z[0] = -Y[0]; |
| 589 | } |
| 590 | } |
| 591 | else |
| 592 | { |
| 593 | if ((n = __acr (x, y, p)) == 1) |
| 594 | { |
| 595 | sub_magnitudes (x, y, z, p); |
| 596 | Z[0] = X[0]; |
| 597 | } |
| 598 | else if (n == -1) |
| 599 | { |
| 600 | sub_magnitudes (y, x, z, p); |
| 601 | Z[0] = -Y[0]; |
| 602 | } |
| 603 | else |
| 604 | Z[0] = 0; |
| 605 | } |
| 606 | } |
| 607 | |
| 608 | #ifndef NO__MUL |
| 609 | /* Multiply *X and *Y and store result in *Z. X and Y may overlap but not X |
| 610 | and Z or Y and Z. For P in [1, 2, 3], the exact result is truncated to P |
| 611 | digits. In case P > 3 the error is bounded by 1.001 ULP. */ |
| 612 | void |
| 613 | SECTION |
| 614 | __mul (const mp_no *x, const mp_no *y, mp_no *z, int p) |
| 615 | { |
| 616 | long i, j, k, ip, ip2; |
| 617 | long p2 = p; |
| 618 | mantissa_store_t zk; |
| 619 | const mp_no *a; |
| 620 | mantissa_store_t *diag; |
| 621 | |
| 622 | /* Is z=0? */ |
| 623 | if (__glibc_unlikely (X[0] * Y[0] == 0)) |
| 624 | { |
| 625 | Z[0] = 0; |
| 626 | return; |
| 627 | } |
| 628 | |
| 629 | /* We need not iterate through all X's and Y's since it's pointless to |
| 630 | multiply zeroes. Here, both are zero... */ |
| 631 | for (ip2 = p2; ip2 > 0; ip2--) |
| 632 | if (X[ip2] != 0 || Y[ip2] != 0) |
| 633 | break; |
| 634 | |
| 635 | a = X[ip2] != 0 ? y : x; |
| 636 | |
| 637 | /* ... and here, at least one of them is still zero. */ |
| 638 | for (ip = ip2; ip > 0; ip--) |
| 639 | if (a->d[ip] != 0) |
| 640 | break; |
| 641 | |
| 642 | /* The product looks like this for p = 3 (as an example): |
| 643 | |
| 644 | |
| 645 | a1 a2 a3 |
| 646 | x b1 b2 b3 |
| 647 | ----------------------------- |
| 648 | a1*b3 a2*b3 a3*b3 |
| 649 | a1*b2 a2*b2 a3*b2 |
| 650 | a1*b1 a2*b1 a3*b1 |
| 651 | |
| 652 | So our K needs to ideally be P*2, but we're limiting ourselves to P + 3 |
| 653 | for P >= 3. We compute the above digits in two parts; the last P-1 |
| 654 | digits and then the first P digits. The last P-1 digits are a sum of |
| 655 | products of the input digits from P to P-k where K is 0 for the least |
| 656 | significant digit and increases as we go towards the left. The product |
| 657 | term is of the form X[k]*X[P-k] as can be seen in the above example. |
| 658 | |
| 659 | The first P digits are also a sum of products with the same product term, |
| 660 | except that the sum is from 1 to k. This is also evident from the above |
| 661 | example. |
| 662 | |
| 663 | Another thing that becomes evident is that only the most significant |
| 664 | ip+ip2 digits of the result are non-zero, where ip and ip2 are the |
| 665 | 'internal precision' of the input numbers, i.e. digits after ip and ip2 |
| 666 | are all 0. */ |
| 667 | |
| 668 | k = (__glibc_unlikely (p2 < 3)) ? p2 + p2 : p2 + 3; |
| 669 | |
| 670 | while (k > ip + ip2 + 1) |
| 671 | Z[k--] = 0; |
| 672 | |
| 673 | zk = 0; |
| 674 | |
| 675 | /* Precompute sums of diagonal elements so that we can directly use them |
| 676 | later. See the next comment to know we why need them. */ |
| 677 | diag = alloca (k * sizeof (mantissa_store_t)); |
| 678 | mantissa_store_t d = 0; |
| 679 | for (i = 1; i <= ip; i++) |
| 680 | { |
| 681 | d += X[i] * (mantissa_store_t) Y[i]; |
| 682 | diag[i] = d; |
| 683 | } |
| 684 | while (i < k) |
| 685 | diag[i++] = d; |
| 686 | |
| 687 | while (k > p2) |
| 688 | { |
| 689 | long lim = k / 2; |
| 690 | |
| 691 | if (k % 2 == 0) |
| 692 | /* We want to add this only once, but since we subtract it in the sum |
| 693 | of products above, we add twice. */ |
| 694 | zk += 2 * X[lim] * (mantissa_store_t) Y[lim]; |
| 695 | |
| 696 | for (i = k - p2, j = p2; i < j; i++, j--) |
| 697 | zk += (X[i] + X[j]) * (mantissa_store_t) (Y[i] + Y[j]); |
| 698 | |
| 699 | zk -= diag[k - 1]; |
| 700 | |
| 701 | DIV_RADIX (zk, Z[k]); |
| 702 | k--; |
| 703 | } |
| 704 | |
| 705 | /* The real deal. Mantissa digit Z[k] is the sum of all X[i] * Y[j] where i |
| 706 | goes from 1 -> k - 1 and j goes the same range in reverse. To reduce the |
| 707 | number of multiplications, we halve the range and if k is an even number, |
| 708 | add the diagonal element X[k/2]Y[k/2]. Through the half range, we compute |
| 709 | X[i] * Y[j] as (X[i] + X[j]) * (Y[i] + Y[j]) - X[i] * Y[i] - X[j] * Y[j]. |
| 710 | |
| 711 | This reduction tells us that we're summing two things, the first term |
| 712 | through the half range and the negative of the sum of the product of all |
| 713 | terms of X and Y in the full range. i.e. |
| 714 | |
| 715 | SUM(X[i] * Y[i]) for k terms. This is precalculated above for each k in |
| 716 | a single loop so that it completes in O(n) time and can hence be directly |
| 717 | used in the loop below. */ |
| 718 | while (k > 1) |
| 719 | { |
| 720 | long lim = k / 2; |
| 721 | |
| 722 | if (k % 2 == 0) |
| 723 | /* We want to add this only once, but since we subtract it in the sum |
| 724 | of products above, we add twice. */ |
| 725 | zk += 2 * X[lim] * (mantissa_store_t) Y[lim]; |
| 726 | |
| 727 | for (i = 1, j = k - 1; i < j; i++, j--) |
| 728 | zk += (X[i] + X[j]) * (mantissa_store_t) (Y[i] + Y[j]); |
| 729 | |
| 730 | zk -= diag[k - 1]; |
| 731 | |
| 732 | DIV_RADIX (zk, Z[k]); |
| 733 | k--; |
| 734 | } |
| 735 | Z[k] = zk; |
| 736 | |
| 737 | /* Get the exponent sum into an intermediate variable. This is a subtle |
| 738 | optimization, where given enough registers, all operations on the exponent |
| 739 | happen in registers and the result is written out only once into EZ. */ |
| 740 | int e = EX + EY; |
| 741 | |
| 742 | /* Is there a carry beyond the most significant digit? */ |
| 743 | if (__glibc_unlikely (Z[1] == 0)) |
| 744 | { |
| 745 | for (i = 1; i <= p2; i++) |
| 746 | Z[i] = Z[i + 1]; |
| 747 | e--; |
| 748 | } |
| 749 | |
| 750 | EZ = e; |
| 751 | Z[0] = X[0] * Y[0]; |
| 752 | } |
| 753 | #endif |
| 754 | |
| 755 | #ifndef NO__SQR |
| 756 | /* Square *X and store result in *Y. X and Y may not overlap. For P in |
| 757 | [1, 2, 3], the exact result is truncated to P digits. In case P > 3 the |
| 758 | error is bounded by 1.001 ULP. This is a faster special case of |
| 759 | multiplication. */ |
| 760 | void |
| 761 | SECTION |
| 762 | __sqr (const mp_no *x, mp_no *y, int p) |
| 763 | { |
| 764 | long i, j, k, ip; |
| 765 | mantissa_store_t yk; |
| 766 | |
| 767 | /* Is z=0? */ |
| 768 | if (__glibc_unlikely (X[0] == 0)) |
| 769 | { |
| 770 | Y[0] = 0; |
| 771 | return; |
| 772 | } |
| 773 | |
| 774 | /* We need not iterate through all X's since it's pointless to |
| 775 | multiply zeroes. */ |
| 776 | for (ip = p; ip > 0; ip--) |
| 777 | if (X[ip] != 0) |
| 778 | break; |
| 779 | |
| 780 | k = (__glibc_unlikely (p < 3)) ? p + p : p + 3; |
| 781 | |
| 782 | while (k > 2 * ip + 1) |
| 783 | Y[k--] = 0; |
| 784 | |
| 785 | yk = 0; |
| 786 | |
| 787 | while (k > p) |
| 788 | { |
| 789 | mantissa_store_t yk2 = 0; |
| 790 | long lim = k / 2; |
| 791 | |
| 792 | if (k % 2 == 0) |
| 793 | yk += X[lim] * (mantissa_store_t) X[lim]; |
| 794 | |
| 795 | /* In __mul, this loop (and the one within the next while loop) run |
| 796 | between a range to calculate the mantissa as follows: |
| 797 | |
| 798 | Z[k] = X[k] * Y[n] + X[k+1] * Y[n-1] ... + X[n-1] * Y[k+1] |
| 799 | + X[n] * Y[k] |
| 800 | |
| 801 | For X == Y, we can get away with summing halfway and doubling the |
| 802 | result. For cases where the range size is even, the mid-point needs |
| 803 | to be added separately (above). */ |
| 804 | for (i = k - p, j = p; i < j; i++, j--) |
| 805 | yk2 += X[i] * (mantissa_store_t) X[j]; |
| 806 | |
| 807 | yk += 2 * yk2; |
| 808 | |
| 809 | DIV_RADIX (yk, Y[k]); |
| 810 | k--; |
| 811 | } |
| 812 | |
| 813 | while (k > 1) |
| 814 | { |
| 815 | mantissa_store_t yk2 = 0; |
| 816 | long lim = k / 2; |
| 817 | |
| 818 | if (k % 2 == 0) |
| 819 | yk += X[lim] * (mantissa_store_t) X[lim]; |
| 820 | |
| 821 | /* Likewise for this loop. */ |
| 822 | for (i = 1, j = k - 1; i < j; i++, j--) |
| 823 | yk2 += X[i] * (mantissa_store_t) X[j]; |
| 824 | |
| 825 | yk += 2 * yk2; |
| 826 | |
| 827 | DIV_RADIX (yk, Y[k]); |
| 828 | k--; |
| 829 | } |
| 830 | Y[k] = yk; |
| 831 | |
| 832 | /* Squares are always positive. */ |
| 833 | Y[0] = 1; |
| 834 | |
| 835 | /* Get the exponent sum into an intermediate variable. This is a subtle |
| 836 | optimization, where given enough registers, all operations on the exponent |
| 837 | happen in registers and the result is written out only once into EZ. */ |
| 838 | int e = EX * 2; |
| 839 | |
| 840 | /* Is there a carry beyond the most significant digit? */ |
| 841 | if (__glibc_unlikely (Y[1] == 0)) |
| 842 | { |
| 843 | for (i = 1; i <= p; i++) |
| 844 | Y[i] = Y[i + 1]; |
| 845 | e--; |
| 846 | } |
| 847 | |
| 848 | EY = e; |
| 849 | } |
| 850 | #endif |
| 851 | |
| 852 | /* Invert *X and store in *Y. Relative error bound: |
| 853 | - For P = 2: 1.001 * R ^ (1 - P) |
| 854 | - For P = 3: 1.063 * R ^ (1 - P) |
| 855 | - For P > 3: 2.001 * R ^ (1 - P) |
| 856 | |
| 857 | *X = 0 is not permissible. */ |
| 858 | static void |
| 859 | SECTION |
| 860 | __inv (const mp_no *x, mp_no *y, int p) |
| 861 | { |
| 862 | long i; |
| 863 | double t; |
| 864 | mp_no z, w; |
| 865 | static const int np1[] = |
| 866 | { 0, 0, 0, 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
| 867 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 |
| 868 | }; |
| 869 | |
| 870 | __cpy (x, &z, p); |
| 871 | z.e = 0; |
| 872 | __mp_dbl (&z, &t, p); |
| 873 | t = 1 / t; |
| 874 | |
| 875 | /* t == 0 will never happen at this point, since 1/t can only be 0 if t is |
| 876 | infinity, but before the division t == mantissa of x (exponent is 0). We |
| 877 | can instruct the compiler to ignore this case. */ |
| 878 | if (t == 0) |
| 879 | __builtin_unreachable (); |
| 880 | |
| 881 | __dbl_mp (t, y, p); |
| 882 | EY -= EX; |
| 883 | |
| 884 | for (i = 0; i < np1[p]; i++) |
| 885 | { |
| 886 | __cpy (y, &w, p); |
| 887 | __mul (x, &w, y, p); |
| 888 | __sub (&__mptwo, y, &z, p); |
| 889 | __mul (&w, &z, y, p); |
| 890 | } |
| 891 | } |
| 892 | |
| 893 | /* Divide *X by *Y and store result in *Z. X and Y may overlap but not X and Z |
| 894 | or Y and Z. Relative error bound: |
| 895 | - For P = 2: 2.001 * R ^ (1 - P) |
| 896 | - For P = 3: 2.063 * R ^ (1 - P) |
| 897 | - For P > 3: 3.001 * R ^ (1 - P) |
| 898 | |
| 899 | *X = 0 is not permissible. */ |
| 900 | void |
| 901 | SECTION |
| 902 | __dvd (const mp_no *x, const mp_no *y, mp_no *z, int p) |
| 903 | { |
| 904 | mp_no w; |
| 905 | |
| 906 | if (X[0] == 0) |
| 907 | Z[0] = 0; |
| 908 | else |
| 909 | { |
| 910 | __inv (y, &w, p); |
| 911 | __mul (x, &w, z, p); |
| 912 | } |
| 913 | } |
| 914 | |