1/***************************************************************************
2 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
3 * Martin Renou *
4 * Copyright (c) QuantStack *
5 * Copyright (c) Serge Guelton *
6 * *
7 * Distributed under the terms of the BSD 3-Clause License. *
8 * *
9 * The full license is in the file LICENSE, distributed with this software. *
10 ****************************************************************************/
11
12#include <cmath>
13#include <cstdint>
14#include <cstring>
15
16namespace xsimd
17{
18 namespace detail
19 {
20
21 /* origin: boost/simd/arch/common/scalar/function/rem_pio2.hpp */
22 /*
23 * ====================================================
24 * copyright 2016 NumScale SAS
25 *
26 * Distributed under the Boost Software License, Version 1.0.
27 * (See copy at http://boost.org/LICENSE_1_0.txt)
28 * ====================================================
29 */
30#if defined(_MSC_VER)
31#define ONCE0 \
32 __pragma(warning(push)) \
33 __pragma(warning(disable : 4127)) while (0) \
34 __pragma(warning(pop)) /**/
35#else
36#define ONCE0 while (0)
37#endif
38
39 /*
40 * ====================================================
41 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
42 *
43 * Developed at SunPro, a Sun Microsystems, Inc. business.
44 * Permission to use, copy, modify, and distribute this
45 * software is freely granted, provided that this notice
46 * is preserved.
47 * ====================================================
48 */
49
50#if defined(__GNUC__) && defined(__BYTE_ORDER__)
51#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
52#define XSIMD_LITTLE_ENDIAN
53#endif
54#elif defined(_WIN32)
55// We can safely assume that Windows is always little endian
56#define XSIMD_LITTLE_ENDIAN
57#elif defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__)
58#define XSIMD_LITTLE_ENDIAN
59#endif
60
61#ifdef XSIMD_LITTLE_ENDIAN
62#define LOW_WORD_IDX 0
63#define HIGH_WORD_IDX sizeof(std::uint32_t)
64#else
65#define LOW_WORD_IDX sizeof(std::uint32_t)
66#define HIGH_WORD_IDX 0
67#endif
68
69#define GET_HIGH_WORD(i, d) \
70 do \
71 { \
72 double f = (d); \
73 std::memcpy(&(i), reinterpret_cast<char*>(&f) + HIGH_WORD_IDX, \
74 sizeof(std::uint32_t)); \
75 } \
76 ONCE0 \
77 /**/
78
79#define GET_LOW_WORD(i, d) \
80 do \
81 { \
82 double f = (d); \
83 std::memcpy(&(i), reinterpret_cast<char*>(&f) + LOW_WORD_IDX, \
84 sizeof(std::uint32_t)); \
85 } \
86 ONCE0 \
87 /**/
88
89#define SET_HIGH_WORD(d, v) \
90 do \
91 { \
92 double f = (d); \
93 std::uint32_t value = (v); \
94 std::memcpy(reinterpret_cast<char*>(&f) + HIGH_WORD_IDX, \
95 &value, sizeof(std::uint32_t)); \
96 (d) = f; \
97 } \
98 ONCE0 \
99 /**/
100
101#define SET_LOW_WORD(d, v) \
102 do \
103 { \
104 double f = (d); \
105 std::uint32_t value = (v); \
106 std::memcpy(reinterpret_cast<char*>(&f) + LOW_WORD_IDX, \
107 &value, sizeof(std::uint32_t)); \
108 (d) = f; \
109 } \
110 ONCE0 \
111 /**/
112
113 /*
114 * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
115 * double x[],y[]; int e0,nx,prec; int ipio2[];
116 *
117 * __kernel_rem_pio2 return the last three digits of N with
118 * y = x - N*pi/2
119 * so that |y| < pi/2.
120 *
121 * The method is to compute the integer (mod 8) and fraction parts of
122 * (2/pi)*x without doing the full multiplication. In general we
123 * skip the part of the product that are known to be a huge integer (
124 * more accurately, = 0 mod 8 ). Thus the number of operations are
125 * independent of the exponent of the input.
126 *
127 * (2/pi) is represented by an array of 24-bit integers in ipio2[].
128 *
129 * Input parameters:
130 * x[] The input value (must be positive) is broken into nx
131 * pieces of 24-bit integers in double precision format.
132 * x[i] will be the i-th 24 bit of x. The scaled exponent
133 * of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
134 * match x's up to 24 bits.
135 *
136 * Example of breaking a double positive z into x[0]+x[1]+x[2]:
137 * e0 = ilogb(z)-23
138 * z = scalbn(z,-e0)
139 * for i = 0,1,2
140 * x[i] = floor(z)
141 * z = (z-x[i])*2**24
142 *
143 *
144 * y[] ouput result in an array of double precision numbers.
145 * The dimension of y[] is:
146 * 24-bit precision 1
147 * 53-bit precision 2
148 * 64-bit precision 2
149 * 113-bit precision 3
150 * The actual value is the sum of them. Thus for 113-bit
151 * precison, one may have to do something like:
152 *
153 * long double t,w,r_head, r_tail;
154 * t = (long double)y[2] + (long double)y[1];
155 * w = (long double)y[0];
156 * r_head = t+w;
157 * r_tail = w - (r_head - t);
158 *
159 * e0 The exponent of x[0]
160 *
161 * nx dimension of x[]
162 *
163 * prec an integer indicating the precision:
164 * 0 24 bits (single)
165 * 1 53 bits (double)
166 * 2 64 bits (extended)
167 * 3 113 bits (quad)
168 *
169 * ipio2[]
170 * integer array, contains the (24*i)-th to (24*i+23)-th
171 * bit of 2/pi after binary point. The corresponding
172 * floating value is
173 *
174 * ipio2[i] * 2^(-24(i+1)).
175 *
176 * External function:
177 * double scalbn(), floor();
178 *
179 *
180 * Here is the description of some local variables:
181 *
182 * jk jk+1 is the initial number of terms of ipio2[] needed
183 * in the computation. The recommended value is 2,3,4,
184 * 6 for single, double, extended,and quad.
185 *
186 * jz local integer variable indicating the number of
187 * terms of ipio2[] used.
188 *
189 * jx nx - 1
190 *
191 * jv index for pointing to the suitable ipio2[] for the
192 * computation. In general, we want
193 * ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
194 * is an integer. Thus
195 * e0-3-24*jv >= 0 or (e0-3)/24 >= jv
196 * Hence jv = max(0,(e0-3)/24).
197 *
198 * jp jp+1 is the number of terms in PIo2[] needed, jp = jk.
199 *
200 * q[] double array with integral value, representing the
201 * 24-bits chunk of the product of x and 2/pi.
202 *
203 * q0 the corresponding exponent of q[0]. Note that the
204 * exponent for q[i] would be q0-24*i.
205 *
206 * PIo2[] double precision array, obtained by cutting pi/2
207 * into 24 bits chunks.
208 *
209 * f[] ipio2[] in floating point
210 *
211 * iq[] integer array by breaking up q[] in 24-bits chunk.
212 *
213 * fq[] final product of x*(2/pi) in fq[0],..,fq[jk]
214 *
215 * ih integer. If >0 it indicates q[] is >= 0.5, hence
216 * it also indicates the *sign* of the result.
217 *
218 */
219
220 inline int32_t __kernel_rem_pio2(double* x, double* y, int32_t e0, int32_t nx, int32_t prec, const int32_t* ipio2) noexcept
221 {
222 static const int32_t init_jk[] = { 2, 3, 4, 6 }; /* initial value for jk */
223
224 static const double PIo2[] = {
225 1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
226 7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
227 5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
228 3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
229 1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
230 1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
231 2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
232 2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
233 };
234
235 static const double
236 zero
237 = 0.0,
238 one = 1.0,
239 two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
240 twon24 = 5.96046447753906250000e-08; /* 0x3E700000, 0x00000000 */
241
242 int32_t jz, jx, jv, jp, jk, carry, n, iq[20], i, j, k, m, q0, ih;
243 double z, fw, f[20], fq[20], q[20];
244
245 /* initialize jk*/
246 jk = init_jk[prec];
247 jp = jk;
248
249 /* determine jx,jv,q0, note that 3>q0 */
250 jx = nx - 1;
251 jv = (e0 - 3) / 24;
252 if (jv < 0)
253 jv = 0;
254 q0 = e0 - 24 * (jv + 1);
255
256 /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
257 j = jv - jx;
258 m = jx + jk;
259 for (i = 0; i <= m; i++, j++)
260 f[i] = (j < 0) ? zero : (double)ipio2[j];
261
262 /* compute q[0],q[1],...q[jk] */
263 for (i = 0; i <= jk; i++)
264 {
265 for (j = 0, fw = 0.0; j <= jx; j++)
266 fw += x[j] * f[jx + i - j];
267 q[i] = fw;
268 }
269
270 jz = jk;
271
272 recompute:
273 /* distill q[] into iq[] reversingly */
274 for (i = 0, j = jz, z = q[jz]; j > 0; i++, j--)
275 {
276 fw = (double)((int32_t)(twon24 * z));
277 iq[i] = (int)(z - two24 * fw);
278 z = q[j - 1] + fw;
279 }
280
281 /* compute n */
282 z = std::scalbn(x: z, n: q0); /* actual value of z */
283 z -= 8.0 * std::floor(x: z * 0.125); /* trim off integer >= 8 */
284 n = (int32_t)z;
285 z -= (double)n;
286 ih = 0;
287 if (q0 > 0)
288 { /* need iq[jz-1] to determine n */
289 i = (iq[jz - 1] >> (24 - q0));
290 n += i;
291 iq[jz - 1] -= i << (24 - q0);
292 ih = iq[jz - 1] >> (23 - q0);
293 }
294 else if (q0 == 0)
295 ih = iq[jz - 1] >> 23;
296 else if (z >= 0.5)
297 ih = 2;
298
299 if (ih > 0)
300 { /* q > 0.5 */
301 n += 1;
302 carry = 0;
303 for (i = 0; i < jz; i++)
304 { /* compute 1-q */
305 j = iq[i];
306 if (carry == 0)
307 {
308 if (j != 0)
309 {
310 carry = 1;
311 iq[i] = 0x1000000 - j;
312 }
313 }
314 else
315 iq[i] = 0xffffff - j;
316 }
317 if (q0 > 0)
318 { /* rare case: chance is 1 in 12 */
319 switch (q0)
320 {
321 case 1:
322 iq[jz - 1] &= 0x7fffff;
323 break;
324 case 2:
325 iq[jz - 1] &= 0x3fffff;
326 break;
327 }
328 }
329 if (ih == 2)
330 {
331 z = one - z;
332 if (carry != 0)
333 z -= std::scalbn(x: one, n: q0);
334 }
335 }
336
337 /* check if recomputation is needed */
338 if (z == zero)
339 {
340 j = 0;
341 for (i = jz - 1; i >= jk; i--)
342 j |= iq[i];
343 if (j == 0)
344 { /* need recomputation */
345 for (k = 1; iq[jk - k] == 0; k++)
346 ; /* k = no. of terms needed */
347
348 for (i = jz + 1; i <= jz + k; i++)
349 { /* add q[jz+1] to q[jz+k] */
350 f[jx + i] = (double)ipio2[jv + i];
351 for (j = 0, fw = 0.0; j <= jx; j++)
352 fw += x[j] * f[jx + i - j];
353 q[i] = fw;
354 }
355 jz += k;
356 goto recompute;
357 }
358 }
359
360 /* chop off zero terms */
361 if (z == 0.0)
362 {
363 jz -= 1;
364 q0 -= 24;
365 while (iq[jz] == 0)
366 {
367 jz--;
368 q0 -= 24;
369 }
370 }
371 else
372 { /* break z into 24-bit if necessary */
373 z = std::scalbn(x: z, n: -q0);
374 if (z >= two24)
375 {
376 fw = (double)((int32_t)(twon24 * z));
377 iq[jz] = (int32_t)(z - two24 * fw);
378 jz += 1;
379 q0 += 24;
380 iq[jz] = (int32_t)fw;
381 }
382 else
383 iq[jz] = (int32_t)z;
384 }
385
386 /* convert integer "bit" chunk to floating-point value */
387 fw = scalbn(x: one, n: q0);
388 for (i = jz; i >= 0; i--)
389 {
390 q[i] = fw * (double)iq[i];
391 fw *= twon24;
392 }
393
394 /* compute PIo2[0,...,jp]*q[jz,...,0] */
395 for (i = jz; i >= 0; i--)
396 {
397 for (fw = 0.0, k = 0; k <= jp && k <= jz - i; k++)
398 fw += PIo2[k] * q[i + k];
399 fq[jz - i] = fw;
400 }
401
402 /* compress fq[] into y[] */
403 switch (prec)
404 {
405 case 0:
406 fw = 0.0;
407 for (i = jz; i >= 0; i--)
408 fw += fq[i];
409 y[0] = (ih == 0) ? fw : -fw;
410 break;
411 case 1:
412 case 2:
413 fw = 0.0;
414 for (i = jz; i >= 0; i--)
415 fw += fq[i];
416 y[0] = (ih == 0) ? fw : -fw;
417 fw = fq[0] - fw;
418 for (i = 1; i <= jz; i++)
419 fw += fq[i];
420 y[1] = (ih == 0) ? fw : -fw;
421 break;
422 case 3: /* painful */
423 for (i = jz; i > 0; i--)
424 {
425 fw = fq[i - 1] + fq[i];
426 fq[i] += fq[i - 1] - fw;
427 fq[i - 1] = fw;
428 }
429 for (i = jz; i > 1; i--)
430 {
431 fw = fq[i - 1] + fq[i];
432 fq[i] += fq[i - 1] - fw;
433 fq[i - 1] = fw;
434 }
435 for (fw = 0.0, i = jz; i >= 2; i--)
436 fw += fq[i];
437 if (ih == 0)
438 {
439 y[0] = fq[0];
440 y[1] = fq[1];
441 y[2] = fw;
442 }
443 else
444 {
445 y[0] = -fq[0];
446 y[1] = -fq[1];
447 y[2] = -fw;
448 }
449 }
450 return n & 7;
451 }
452
453 inline std::int32_t __ieee754_rem_pio2(double x, double* y) noexcept
454 {
455 static const std::int32_t two_over_pi[] = {
456 0xA2F983,
457 0x6E4E44,
458 0x1529FC,
459 0x2757D1,
460 0xF534DD,
461 0xC0DB62,
462 0x95993C,
463 0x439041,
464 0xFE5163,
465 0xABDEBB,
466 0xC561B7,
467 0x246E3A,
468 0x424DD2,
469 0xE00649,
470 0x2EEA09,
471 0xD1921C,
472 0xFE1DEB,
473 0x1CB129,
474 0xA73EE8,
475 0x8235F5,
476 0x2EBB44,
477 0x84E99C,
478 0x7026B4,
479 0x5F7E41,
480 0x3991D6,
481 0x398353,
482 0x39F49C,
483 0x845F8B,
484 0xBDF928,
485 0x3B1FF8,
486 0x97FFDE,
487 0x05980F,
488 0xEF2F11,
489 0x8B5A0A,
490 0x6D1F6D,
491 0x367ECF,
492 0x27CB09,
493 0xB74F46,
494 0x3F669E,
495 0x5FEA2D,
496 0x7527BA,
497 0xC7EBE5,
498 0xF17B3D,
499 0x0739F7,
500 0x8A5292,
501 0xEA6BFB,
502 0x5FB11F,
503 0x8D5D08,
504 0x560330,
505 0x46FC7B,
506 0x6BABF0,
507 0xCFBC20,
508 0x9AF436,
509 0x1DA9E3,
510 0x91615E,
511 0xE61B08,
512 0x659985,
513 0x5F14A0,
514 0x68408D,
515 0xFFD880,
516 0x4D7327,
517 0x310606,
518 0x1556CA,
519 0x73A8C9,
520 0x60E27B,
521 0xC08C6B,
522 };
523
524 static const std::int32_t npio2_hw[] = {
525 0x3FF921FB,
526 0x400921FB,
527 0x4012D97C,
528 0x401921FB,
529 0x401F6A7A,
530 0x4022D97C,
531 0x4025FDBB,
532 0x402921FB,
533 0x402C463A,
534 0x402F6A7A,
535 0x4031475C,
536 0x4032D97C,
537 0x40346B9C,
538 0x4035FDBB,
539 0x40378FDB,
540 0x403921FB,
541 0x403AB41B,
542 0x403C463A,
543 0x403DD85A,
544 0x403F6A7A,
545 0x40407E4C,
546 0x4041475C,
547 0x4042106C,
548 0x4042D97C,
549 0x4043A28C,
550 0x40446B9C,
551 0x404534AC,
552 0x4045FDBB,
553 0x4046C6CB,
554 0x40478FDB,
555 0x404858EB,
556 0x404921FB,
557 };
558
559 /*
560 * invpio2: 53 bits of 2/pi
561 * pio2_1: first 33 bit of pi/2
562 * pio2_1t: pi/2 - pio2_1
563 * pio2_2: second 33 bit of pi/2
564 * pio2_2t: pi/2 - (pio2_1+pio2_2)
565 * pio2_3: third 33 bit of pi/2
566 * pio2_3t: pi/2 - (pio2_1+pio2_2+pio2_3)
567 */
568
569 static const double
570 zero
571 = 0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
572 half = 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
573 two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
574 invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
575 pio2_1 = 1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
576 pio2_1t = 6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */
577 pio2_2 = 6.07710050630396597660e-11, /* 0x3DD0B461, 0x1A600000 */
578 pio2_2t = 2.02226624879595063154e-21, /* 0x3BA3198A, 0x2E037073 */
579 pio2_3 = 2.02226624871116645580e-21, /* 0x3BA3198A, 0x2E000000 */
580 pio2_3t = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */
581
582 double z = 0., w, t, r, fn;
583 double tx[3];
584 std::int32_t e0, i, j, nx, n, ix, hx;
585 std::uint32_t low;
586
587 GET_HIGH_WORD(hx, x); /* high word of x */
588 ix = hx & 0x7fffffff;
589 if (ix <= 0x3fe921fb) /* |x| ~<= pi/4 , no need for reduction */
590 {
591 y[0] = x;
592 y[1] = 0;
593 return 0;
594 }
595 if (ix < 0x4002d97c)
596 { /* |x| < 3pi/4, special case with n=+-1 */
597 if (hx > 0)
598 {
599 z = x - pio2_1;
600 if (ix != 0x3ff921fb)
601 { /* 33+53 bit pi is good enough */
602 y[0] = z - pio2_1t;
603 y[1] = (z - y[0]) - pio2_1t;
604 }
605 else
606 { /* near pi/2, use 33+33+53 bit pi */
607 z -= pio2_2;
608 y[0] = z - pio2_2t;
609 y[1] = (z - y[0]) - pio2_2t;
610 }
611 return 1;
612 }
613 else
614 { /* negative x */
615 z = x + pio2_1;
616 if (ix != 0x3ff921fb)
617 { /* 33+53 bit pi is good enough */
618 y[0] = z + pio2_1t;
619 y[1] = (z - y[0]) + pio2_1t;
620 }
621 else
622 { /* near pi/2, use 33+33+53 bit pi */
623 z += pio2_2;
624 y[0] = z + pio2_2t;
625 y[1] = (z - y[0]) + pio2_2t;
626 }
627
628 return -1;
629 }
630 }
631 if (ix <= 0x413921fb)
632 { /* |x| ~<= 2^19*(pi/2), medium_ size */
633 t = std::fabs(x: x);
634 n = (std::int32_t)(t * invpio2 + half);
635 fn = (double)n;
636 r = t - fn * pio2_1;
637 w = fn * pio2_1t; /* 1st round good to 85 bit */
638 if ((n < 32) && (n > 0) && (ix != npio2_hw[n - 1]))
639 {
640 y[0] = r - w; /* quick check no cancellation */
641 }
642 else
643 {
644 std::uint32_t high;
645 j = ix >> 20;
646 y[0] = r - w;
647 GET_HIGH_WORD(high, y[0]);
648 i = j - static_cast<int32_t>((high >> 20) & 0x7ff);
649 if (i > 16)
650 { /* 2nd iteration needed, good to 118 */
651 t = r;
652 w = fn * pio2_2;
653 r = t - w;
654 w = fn * pio2_2t - ((t - r) - w);
655 y[0] = r - w;
656 GET_HIGH_WORD(high, y[0]);
657 i = j - static_cast<int32_t>((high >> 20) & 0x7ff);
658 if (i > 49)
659 { /* 3rd iteration need, 151 bits acc */
660 t = r; /* will cover all possible cases */
661 w = fn * pio2_3;
662 r = t - w;
663 w = fn * pio2_3t - ((t - r) - w);
664 y[0] = r - w;
665 }
666 }
667 }
668 y[1] = (r - y[0]) - w;
669 if (hx < 0)
670 {
671 y[0] = -y[0];
672 y[1] = -y[1];
673 return -n;
674 }
675 else
676 return n;
677 }
678 /*
679 * all other (large) arguments
680 */
681 if (ix >= 0x7ff00000)
682 { /* x is inf or NaN */
683 y[0] = y[1] = x - x;
684 return 0;
685 }
686 /* set z = scalbn(|x|,ilogb(x)-23) */
687 GET_LOW_WORD(low, x);
688 SET_LOW_WORD(z, low);
689 e0 = (ix >> 20) - 1046; /* e0 = ilogb(z)-23; */
690 SET_HIGH_WORD(z, static_cast<uint32_t>(ix - (e0 << 20)));
691 for (i = 0; i < 2; i++)
692 {
693 tx[i] = (double)((std::int32_t)(z));
694 z = (z - tx[i]) * two24;
695 }
696 tx[2] = z;
697 nx = 3;
698 while (tx[nx - 1] == zero)
699 nx--; /* skip zero term */
700 n = __kernel_rem_pio2(x: tx, y, e0, nx, prec: 2, ipio2: two_over_pi);
701 if (hx < 0)
702 {
703 y[0] = -y[0];
704 y[1] = -y[1];
705 return -n;
706 }
707 return n;
708 }
709 }
710
711#undef XSIMD_LITTLE_ENDIAN
712#undef SET_LOW_WORD
713#undef SET_HIGH_WORD
714#undef GET_LOW_WORD
715#undef GET_HIGH_WORD
716#undef HIGH_WORD_IDX
717#undef LOW_WORD_IDX
718#undef ONCE0
719}
720