1/*
2r128.h: 128-bit (64.64) signed fixed-point arithmetic. Version 1.4.4
3
4COMPILATION
5-----------
6Drop this header file somewhere in your project and include it wherever it is
7needed. There is no separate .c file for this library. To get the code, in ONE
8file in your project, put:
9
10#define R128_IMPLEMENTATION
11
12before you include this file. You may also provide a definition for R128_ASSERT
13to force the library to use a custom assert macro.
14
15COMPILER/LIBRARY SUPPORT
16------------------------
17This library requires a C89 compiler with support for 64-bit integers. If your
18compiler does not support the long long data type, the R128_U64, etc. macros
19must be set appropriately. On x86 and x64 targets, Intel intrinsics are used
20for speed. If your compiler does not support these intrinsics, you can add
21#define R128_STDC_ONLY
22in your implementation file before including r128.h.
23
24The only C runtime library functionality used by this library is <assert.h>.
25This can be avoided by defining an R128_ASSERT macro in your implementation
26file. Since this library uses 64-bit arithmetic, this may implicitly add a
27runtime library dependency on 32-bit platforms.
28
29C++ SUPPORT
30-----------
31Operator overloads are supplied for C++ files that include this file. Since all
32C++ functions are declared inline (or static inline), the R128_IMPLEMENTATION
33file can be either C++ or C.
34
35LICENSE
36-------
37This is free and unencumbered software released into the public domain.
38
39Anyone is free to copy, modify, publish, use, compile, sell, or
40distribute this software, either in source code form or as a compiled
41binary, for any purpose, commercial or non-commercial, and by any
42means.
43
44In jurisdictions that recognize copyright laws, the author or authors
45of this software dedicate any and all copyright interest in the
46software to the public domain. We make this dedication for the benefit
47of the public at large and to the detriment of our heirs and
48successors. We intend this dedication to be an overt act of
49relinquishment in perpetuity of all present and future rights to this
50software under copyright law.
51
52THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
53EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
54MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
55IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
56OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
57ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
58OTHER DEALINGS IN THE SOFTWARE.
59*/
60
61#ifndef H_R128_H
62#define H_R128_H
63
64#include <stddef.h>
65
66// 64-bit integer support
67// If your compiler does not have stdint.h, add appropriate defines for these macros.
68#if defined(_MSC_VER) && (_MSC_VER < 1600)
69# define R128_S32 __int32
70# define R128_U32 unsigned __int32
71# define R128_S64 __int64
72# define R128_U64 unsigned __int64
73# define R128_LIT_S64(x) x##i64
74# define R128_LIT_U64(x) x##ui64
75#else
76# include <stdint.h>
77# define R128_S32 int32_t
78# define R128_U32 uint32_t
79# define R128_S64 long long
80# define R128_U64 unsigned long long
81# define R128_LIT_S64(x) x##ll
82# define R128_LIT_U64(x) x##ull
83#endif
84
85#ifdef __cplusplus
86extern "C" {
87#endif
88
89typedef struct R128 {
90 R128_U64 lo;
91 R128_U64 hi;
92
93#ifdef __cplusplus
94 R128();
95 R128(double);
96 R128(int);
97 R128(R128_S64);
98 R128(R128_U64 low, R128_U64 high);
99
100 operator double() const;
101 operator R128_S64() const;
102 operator int() const;
103 operator bool() const;
104
105 bool operator!() const;
106 R128 operator~() const;
107 R128 operator-() const;
108 R128 &operator|=(const R128 &rhs);
109 R128 &operator&=(const R128 &rhs);
110 R128 &operator^=(const R128 &rhs);
111 R128 &operator+=(const R128 &rhs);
112 R128 &operator-=(const R128 &rhs);
113 R128 &operator*=(const R128 &rhs);
114 R128 &operator/=(const R128 &rhs);
115 R128 &operator%=(const R128 &rhs);
116 R128 &operator<<=(int amount);
117 R128 &operator>>=(int amount);
118#endif //__cplusplus
119} R128;
120
121// Type conversion
122extern void r128FromInt(R128 *dst, R128_S64 v);
123extern void r128FromFloat(R128 *dst, double v);
124extern R128_S64 r128ToInt(const R128 *v);
125extern double r128ToFloat(const R128 *v);
126
127// Copy
128extern void r128Copy(R128 *dst, const R128 *src);
129
130// Negate
131extern void r128Neg(R128 *dst, const R128 *src);
132
133// Bitwise operations
134extern void r128Not(R128 *dst, const R128 *src); // ~a
135extern void r128Or(R128 *dst, const R128 *a, const R128 *b); // a | b
136extern void r128And(R128 *dst, const R128 *a, const R128 *b); // a & b
137extern void r128Xor(R128 *dst, const R128 *a, const R128 *b); // a ^ b
138extern void r128Shl(R128 *dst, const R128 *src, int amount); // shift left by amount mod 128
139extern void r128Shr(R128 *dst, const R128 *src, int amount); // shift right logical by amount mod 128
140extern void r128Sar(R128 *dst, const R128 *src, int amount); // shift right arithmetic by amount mod 128
141
142// Arithmetic
143extern void r128Add(R128 *dst, const R128 *a, const R128 *b); // a + b
144extern void r128Sub(R128 *dst, const R128 *a, const R128 *b); // a - b
145extern void r128Mul(R128 *dst, const R128 *a, const R128 *b); // a * b
146extern void r128Div(R128 *dst, const R128 *a, const R128 *b); // a / b
147extern void r128Mod(R128 *dst, const R128 *a, const R128 *b); // a - toInt(a / b) * b
148
149extern void r128Sqrt(R128 *dst, const R128 *v); // sqrt(v)
150extern void r128Rsqrt(R128 *dst, const R128 *v); // 1 / sqrt(v)
151
152// Comparison
153extern int r128Cmp(const R128 *a, const R128 *b); // sign of a-b
154extern void r128Min(R128 *dst, const R128 *a, const R128 *b);
155extern void r128Max(R128 *dst, const R128 *a, const R128 *b);
156extern void r128Floor(R128 *dst, const R128 *v);
157extern void r128Ceil(R128 *dst, const R128 *v);
158extern int r128IsNeg(const R128 *v); // quick check for < 0
159
160// String conversion
161//
162typedef enum R128ToStringSign {
163 R128ToStringSign_Default, // no sign character for positive values
164 R128ToStringSign_Space, // leading space for positive values
165 R128ToStringSign_Plus, // leading '+' for positive values
166} R128ToStringSign;
167
168// Formatting options for use with r128ToStringOpt. The "defaults" correspond
169// to a format string of "%f".
170//
171typedef struct R128ToStringFormat {
172 // sign character for positive values. Default is R128ToStringSign_Default.
173 R128ToStringSign sign;
174
175 // minimum number of characters to write. Default is 0.
176 int width;
177
178 // place to the right of the decimal at which rounding is performed. If negative,
179 // a maximum of 20 decimal places will be written, with no trailing zeroes.
180 // (20 places is sufficient to ensure that r128FromString will convert back to the
181 // original value.) Default is -1. NOTE: This is not the same default that the C
182 // standard library uses for %f.
183 int precision;
184
185 // If non-zero, pads the output string with leading zeroes if the final result is
186 // fewer than width characters. Otherwise, leading spaces are used. Default is 0.
187 int zeroPad;
188
189 // Always print a decimal point, even if the value is an integer. Default is 0.
190 int decimal;
191
192 // Left-align output if width specifier requires padding.
193 // Default is 0 (right align).
194 int leftAlign;
195} R128ToStringFormat;
196
197// r128ToStringOpt: convert R128 to a decimal string, with formatting.
198//
199// dst and dstSize: specify the buffer to write into. At most dstSize bytes will be written
200// (including null terminator). No additional rounding is performed if dstSize is not large
201// enough to hold the entire string.
202//
203// opt: an R128ToStringFormat struct (q.v.) with formatting options.
204//
205// Uses the R128_decimal global as the decimal point character.
206// Always writes a null terminator, even if the destination buffer is not large enough.
207//
208// Number of bytes that will be written (i.e. how big does dst need to be?):
209// If width is specified: width + 1 bytes.
210// If precision is specified: at most precision + 22 bytes.
211// If neither is specified: at most 42 bytes.
212//
213// Returns the number of bytes that would have been written if dst was sufficiently large,
214// not including the final null terminator.
215//
216extern int r128ToStringOpt(char *dst, size_t dstSize, const R128 *v, const R128ToStringFormat *opt);
217
218// r128ToStringf: convert R128 to a decimal string, with formatting.
219//
220// dst and dstSize: specify the buffer to write into. At most dstSize bytes will be written
221// (including null terminator).
222//
223// format: a printf-style format specifier, as one would use with floating point types.
224// e.g. "%+5.2f". (The leading % and trailing f are optional.)
225// NOTE: This is NOT a full replacement for sprintf. Any characters in the format string
226// that do not correspond to a format placeholder are ignored.
227//
228// Uses the R128_decimal global as the decimal point character.
229// Always writes a null terminator, even if the destination buffer is not large enough.
230//
231// Number of bytes that will be written (i.e. how big does dst need to be?):
232// If the precision field is specified: at most max(width, precision + 21) + 1 bytes
233// Otherwise: at most max(width, 41) + 1 bytes.
234//
235// Returns the number of bytes that would have been written if dst was sufficiently large,
236// not including the final null terminator.
237//
238extern int r128ToStringf(char *dst, size_t dstSize, const char *format, const R128 *v);
239
240// r128ToString: convert R128 to a decimal string, with default formatting.
241// Equivalent to r128ToStringf(dst, dstSize, "%f", v).
242//
243// Uses the R128_decimal global as the decimal point character.
244// Always writes a null terminator, even if the destination buffer is not large enough.
245//
246// Will write at most 42 bytes (including NUL) to dst.
247//
248// Returns the number of bytes that would have been written if dst was sufficiently large,
249// not including the final null terminator.
250//
251extern int r128ToString(char *dst, size_t dstSize, const R128 *v);
252
253// r128FromString: Convert string to R128.
254//
255// The string can be formatted either as a decimal number with optional sign
256// or as hexadecimal with a prefix of 0x or 0X.
257//
258// endptr, if not NULL, is set to the character following the last character
259// used in the conversion.
260//
261extern void r128FromString(R128 *dst, const char *s, char **endptr);
262
263// Constants
264extern const R128 R128_min; // minimum (most negative) value
265extern const R128 R128_max; // maximum (most positive) value
266extern const R128 R128_smallest; // smallest positive value
267extern const R128 R128_zero; // zero
268extern const R128 R128_one; // 1.0
269
270extern char R128_decimal; // decimal point character used by r128From/ToString. defaults to '.'
271
272#ifdef __cplusplus
273}
274
275#include <limits>
276namespace std {
277template<>
278struct numeric_limits<R128>
279{
280 static const bool is_specialized = true;
281
282 static R128 min() throw() { return R128_min; }
283 static R128 max() throw() { return R128_max; }
284
285 static const int digits = 127;
286 static const int digits10 = 38;
287 static const bool is_signed = true;
288 static const bool is_integer = false;
289 static const bool is_exact = false;
290 static const int radix = 2;
291 static R128 epsilon() throw() { return R128_smallest; }
292 static R128 round_error() throw() { return R128_one; }
293
294 static const int min_exponent = 0;
295 static const int min_exponent10 = 0;
296 static const int max_exponent = 0;
297 static const int max_exponent10 = 0;
298
299 static const bool has_infinity = false;
300 static const bool has_quiet_NaN = false;
301 static const bool has_signaling_NaN = false;
302 static const float_denorm_style has_denorm = denorm_absent;
303 static const bool has_denorm_loss = false;
304
305 static R128 infinity() throw() { return R128_zero; }
306 static R128 quiet_NaN() throw() { return R128_zero; }
307 static R128 signaling_NaN() throw() { return R128_zero; }
308 static R128 denorm_min() throw() { return R128_zero; }
309
310 static const bool is_iec559 = false;
311 static const bool is_bounded = true;
312 static const bool is_modulo = true;
313
314 static const bool traps = numeric_limits<R128_U64>::traps;
315 static const bool tinyness_before = false;
316 static const float_round_style round_style = round_toward_zero;
317};
318} //namespace std
319
320inline R128::R128() {}
321
322inline R128::R128(double v)
323{
324 r128FromFloat(this, v);
325}
326
327inline R128::R128(int v)
328{
329 r128FromInt(this, v);
330}
331
332inline R128::R128(R128_S64 v)
333{
334 r128FromInt(this, v);
335}
336
337inline R128::R128(R128_U64 low, R128_U64 high)
338{
339 lo = low;
340 hi = high;
341}
342
343inline R128::operator double() const
344{
345 return r128ToFloat(this);
346}
347
348inline R128::operator R128_S64() const
349{
350 return r128ToInt(this);
351}
352
353inline R128::operator int() const
354{
355 return (int) r128ToInt(this);
356}
357
358inline R128::operator bool() const
359{
360 return lo || hi;
361}
362
363inline bool R128::operator!() const
364{
365 return !lo && !hi;
366}
367
368inline R128 R128::operator~() const
369{
370 R128 r;
371 r128Not(&r, this);
372 return r;
373}
374
375inline R128 R128::operator-() const
376{
377 R128 r;
378 r128Neg(&r, this);
379 return r;
380}
381
382inline R128 &R128::operator|=(const R128 &rhs)
383{
384 r128Or(this, this, &rhs);
385 return *this;
386}
387
388inline R128 &R128::operator&=(const R128 &rhs)
389{
390 r128And(this, this, &rhs);
391 return *this;
392}
393
394inline R128 &R128::operator^=(const R128 &rhs)
395{
396 r128Xor(this, this, &rhs);
397 return *this;
398}
399
400inline R128 &R128::operator+=(const R128 &rhs)
401{
402 r128Add(this, this, &rhs);
403 return *this;
404}
405
406inline R128 &R128::operator-=(const R128 &rhs)
407{
408 r128Sub(this, this, &rhs);
409 return *this;
410}
411
412inline R128 &R128::operator*=(const R128 &rhs)
413{
414 r128Mul(this, this, &rhs);
415 return *this;
416}
417
418inline R128 &R128::operator/=(const R128 &rhs)
419{
420 r128Div(this, this, &rhs);
421 return *this;
422}
423
424inline R128 &R128::operator%=(const R128 &rhs)
425{
426 r128Mod(this, this, &rhs);
427 return *this;
428}
429
430inline R128 &R128::operator<<=(int amount)
431{
432 r128Shl(this, this, amount);
433 return *this;
434}
435
436inline R128 &R128::operator>>=(int amount)
437{
438 r128Sar(this, this, amount);
439 return *this;
440}
441
442static inline R128 operator|(const R128 &lhs, const R128 &rhs)
443{
444 R128 r(lhs);
445 return r |= rhs;
446}
447
448static inline R128 operator&(const R128 &lhs, const R128 &rhs)
449{
450 R128 r(lhs);
451 return r &= rhs;
452}
453
454static inline R128 operator^(const R128 &lhs, const R128 &rhs)
455{
456 R128 r(lhs);
457 return r ^= rhs;
458}
459
460static inline R128 operator+(const R128 &lhs, const R128 &rhs)
461{
462 R128 r(lhs);
463 return r += rhs;
464}
465
466static inline R128 operator-(const R128 &lhs, const R128 &rhs)
467{
468 R128 r(lhs);
469 return r -= rhs;
470}
471
472static inline R128 operator*(const R128 &lhs, const R128 &rhs)
473{
474 R128 r(lhs);
475 return r *= rhs;
476}
477
478static inline R128 operator/(const R128 &lhs, const R128 &rhs)
479{
480 R128 r(lhs);
481 return r /= rhs;
482}
483
484static inline R128 operator%(const R128 &lhs, const R128 &rhs)
485{
486 R128 r(lhs);
487 return r %= rhs;
488}
489
490static inline R128 operator<<(const R128 &lhs, int amount)
491{
492 R128 r(lhs);
493 return r <<= amount;
494}
495
496static inline R128 operator>>(const R128 &lhs, int amount)
497{
498 R128 r(lhs);
499 return r >>= amount;
500}
501
502static inline bool operator<(const R128 &lhs, const R128 &rhs)
503{
504 return r128Cmp(&lhs, &rhs) < 0;
505}
506
507static inline bool operator>(const R128 &lhs, const R128 &rhs)
508{
509 return r128Cmp(&lhs, &rhs) > 0;
510}
511
512static inline bool operator<=(const R128 &lhs, const R128 &rhs)
513{
514 return r128Cmp(&lhs, &rhs) <= 0;
515}
516
517static inline bool operator>=(const R128 &lhs, const R128 &rhs)
518{
519 return r128Cmp(&lhs, &rhs) >= 0;
520}
521
522static inline bool operator==(const R128 &lhs, const R128 &rhs)
523{
524 return lhs.lo == rhs.lo && lhs.hi == rhs.hi;
525}
526
527static inline bool operator!=(const R128 &lhs, const R128 &rhs)
528{
529 return lhs.lo != rhs.lo || lhs.hi != rhs.hi;
530}
531
532#endif //__cplusplus
533#endif //H_R128_H
534
535#ifdef R128_IMPLEMENTATION
536
537#ifdef R128_DEBUG_VIS
538# define R128_DEBUG_SET(x) r128ToString(R128_last, sizeof(R128_last), x)
539#else
540# define R128_DEBUG_SET(x)
541#endif
542
543#define R128_SET2(x, l, h) do { (x)->lo = (R128_U64)(l); (x)->hi = (R128_U64)(h); } while(0)
544#define R128_R0(x) ((R128_U32)(x)->lo)
545#define R128_R2(x) ((R128_U32)(x)->hi)
546#if defined(_M_IX86)
547// workaround: MSVC x86's handling of 64-bit values is not great
548# define R128_SET4(x, r0, r1, r2, r3) do { \
549 ((R128_U32*)&(x)->lo)[0] = (R128_U32)(r0); \
550 ((R128_U32*)&(x)->lo)[1] = (R128_U32)(r1); \
551 ((R128_U32*)&(x)->hi)[0] = (R128_U32)(r2); \
552 ((R128_U32*)&(x)->hi)[1] = (R128_U32)(r3); \
553 } while(0)
554# define R128_R1(x) (((R128_U32*)&(x)->lo)[1])
555# define R128_R3(x) (((R128_U32*)&(x)->hi)[1])
556#else
557# define R128_SET4(x, r0, r1, r2, r3) do { (x)->lo = (R128_U64)(r0) | ((R128_U64)(r1) << 32); \
558 (x)->hi = (R128_U64)(r2) | ((R128_U64)(r3) << 32); } while(0)
559# define R128_R1(x) ((R128_U32)((x)->lo >> 32))
560# define R128_R3(x) ((R128_U32)((x)->hi >> 32))
561#endif
562
563#if defined(_M_X64)
564# define R128_INTEL 1
565# define R128_64BIT 1
566# ifndef R128_STDC_ONLY
567# include <intrin.h>
568# endif
569#elif defined(__x86_64__)
570# define R128_INTEL 1
571# define R128_64BIT 1
572# ifndef R128_STDC_ONLY
573# include <x86intrin.h>
574# endif
575#elif defined(_M_IX86)
576# define R128_INTEL 1
577# ifndef R128_STDC_ONLY
578# include <intrin.h>
579# endif
580#elif defined(__i386__)
581# define R128_INTEL 1
582# ifndef R128_STDC_ONLY
583# include <x86intrin.h>
584# endif
585#elif defined(_M_ARM)
586# ifndef R128_STDC_ONLY
587# include <intrin.h>
588# endif
589#elif defined(_M_ARM64)
590# define R128_64BIT 1
591# ifndef R128_STDC_ONLY
592# include <intrin.h>
593# endif
594#elif defined(__aarch64__)
595# define R128_64BIT 1
596#endif
597
598#ifndef R128_INTEL
599# define R128_INTEL 0
600#endif
601
602#ifndef R128_64BIT
603# define R128_64BIT 0
604#endif
605
606#ifndef R128_ASSERT
607# include <assert.h>
608# define R128_ASSERT(x) assert(x)
609#endif
610
611#include <stdlib.h> // for NULL
612
613static const R128ToStringFormat R128__defaultFormat = {
614 R128ToStringSign_Default,
615 0,
616 -1,
617 0,
618 0,
619 0
620};
621
622const R128 R128_min = { 0, R128_LIT_U64(0x8000000000000000) };
623const R128 R128_max = { R128_LIT_U64(0xffffffffffffffff), R128_LIT_U64(0x7fffffffffffffff) };
624const R128 R128_smallest = { 1, 0 };
625const R128 R128_zero = { 0, 0 };
626const R128 R128_one = { 0, 1 };
627char R128_decimal = '.';
628#ifdef R128_DEBUG_VIS
629char R128_last[42];
630#endif
631
632static int r128__clz64(R128_U64 x)
633{
634#if defined(R128_STDC_ONLY)
635 R128_U64 n = 64, y;
636 y = x >> 32; if (y) { n -= 32; x = y; }
637 y = x >> 16; if (y) { n -= 16; x = y; }
638 y = x >> 8; if (y) { n -= 8; x = y; }
639 y = x >> 4; if (y) { n -= 4; x = y; }
640 y = x >> 2; if (y) { n -= 2; x = y; }
641 y = x >> 1; if (y) { n -= 1; x = y; }
642 return (int)(n - x);
643#elif defined(_M_X64) || defined(_M_ARM64)
644 unsigned long idx;
645 if (_BitScanReverse64(&idx, x)) {
646 return 63 - (int)idx;
647 } else {
648 return 64;
649 }
650#elif defined(_MSC_VER)
651 unsigned long idx;
652 if (_BitScanReverse(&idx, (R128_U32)(x >> 32))) {
653 return 31 - (int)idx;
654 } else if (_BitScanReverse(&idx, (R128_U32)x)) {
655 return 63 - (int)idx;
656 } else {
657 return 64;
658 }
659#else
660 return x ? __builtin_clzll(x) : 64;
661#endif
662}
663
664#if !R128_64BIT
665// 32*32->64
666static R128_U64 r128__umul64(R128_U32 a, R128_U32 b)
667{
668# if defined(_M_IX86) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
669 return __emulu(a, b);
670# elif defined(_M_ARM) && !defined(R128_STDC_ONLY)
671 return _arm_umull(a, b);
672# else
673 return a * (R128_U64)b;
674# endif
675}
676
677// 64/32->32
678static R128_U32 r128__udiv64(R128_U32 nlo, R128_U32 nhi, R128_U32 d, R128_U32 *rem)
679{
680# if defined(_M_IX86) && (_MSC_VER >= 1920) && !defined(R128_STDC_ONLY)
681 unsigned __int64 n = ((unsigned __int64)nhi << 32) | nlo;
682 return _udiv64(n, d, rem);
683# elif defined(_M_IX86) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
684 __asm {
685 mov eax, nlo
686 mov edx, nhi
687 div d
688 mov ecx, rem
689 mov dword ptr [ecx], edx
690 }
691# elif defined(__i386__) && !defined(R128_STDC_ONLY)
692 R128_U32 q, r;
693 __asm("divl %4"
694 : "=a"(q), "=d"(r)
695 : "a"(nlo), "d"(nhi), "X"(d));
696 *rem = r;
697 return q;
698# else
699 R128_U64 n64 = ((R128_U64)nhi << 32) | nlo;
700 *rem = (R128_U32)(n64 % d);
701 return (R128_U32)(n64 / d);
702# endif
703}
704#elif defined(R128_STDC_ONLY) || !R128_INTEL
705#define r128__umul64(a, b) ((a) * (R128_U64)(b))
706static R128_U32 r128__udiv64(R128_U32 nlo, R128_U32 nhi, R128_U32 d, R128_U32 *rem)
707{
708 R128_U64 n64 = ((R128_U64)nhi << 32) | nlo;
709 *rem = (R128_U32)(n64 % d);
710 return (R128_U32)(n64 / d);
711}
712#endif //!R128_64BIT
713
714static void r128__neg(R128 *dst, const R128 *src)
715{
716 R128_ASSERT(dst != NULL);
717 R128_ASSERT(src != NULL);
718
719#if R128_INTEL && !defined(R128_STDC_ONLY)
720 {
721 unsigned char carry = 0;
722# if R128_64BIT
723 carry = _addcarry_u64(carry, ~src->lo, 1, &dst->lo);
724 carry = _addcarry_u64(carry, ~src->hi, 0, &dst->hi);
725# else
726 R128_U32 r0, r1, r2, r3;
727 carry = _addcarry_u32(carry, ~R128_R0(src), 1, &r0);
728 carry = _addcarry_u32(carry, ~R128_R1(src), 0, &r1);
729 carry = _addcarry_u32(carry, ~R128_R2(src), 0, &r2);
730 carry = _addcarry_u32(carry, ~R128_R3(src), 0, &r3);
731 R128_SET4(dst, r0, r1, r2, r3);
732# endif //R128_64BIT
733 }
734#else
735 if (src->lo) {
736 dst->lo = ~src->lo + 1;
737 dst->hi = ~src->hi;
738 } else {
739 dst->lo = 0;
740 dst->hi = ~src->hi + 1;
741 }
742#endif //R128_INTEL
743}
744
745// 64*64->128
746static void r128__umul128(R128 *dst, R128_U64 a, R128_U64 b)
747{
748#if defined(_M_X64) && !defined(R128_STDC_ONLY)
749 dst->lo = _umul128(a, b, &dst->hi);
750#elif R128_64BIT && !defined(_MSC_VER) && !defined(R128_STDC_ONLY)
751 unsigned __int128 p0 = a * (unsigned __int128)b;
752 dst->hi = (R128_U64)(p0 >> 64);
753 dst->lo = (R128_U64)p0;
754#else
755 R128_U32 alo = (R128_U32)a;
756 R128_U32 ahi = (R128_U32)(a >> 32);
757 R128_U32 blo = (R128_U32)b;
758 R128_U32 bhi = (R128_U32)(b >> 32);
759 R128_U64 p0, p1, p2, p3;
760
761 p0 = r128__umul64(alo, blo);
762 p1 = r128__umul64(alo, bhi);
763 p2 = r128__umul64(ahi, blo);
764 p3 = r128__umul64(ahi, bhi);
765
766 {
767#if R128_INTEL && !defined(R128_STDC_ONLY)
768 R128_U32 r0, r1, r2, r3;
769 unsigned char carry;
770
771 r0 = (R128_U32)(p0);
772 r1 = (R128_U32)(p0 >> 32);
773 r2 = (R128_U32)(p1 >> 32);
774 r3 = (R128_U32)(p3 >> 32);
775
776 carry = _addcarry_u32(0, r1, (R128_U32)p1, &r1);
777 carry = _addcarry_u32(carry, r2, (R128_U32)(p2 >> 32), &r2);
778 _addcarry_u32(carry, r3, 0, &r3);
779 carry = _addcarry_u32(0, r1, (R128_U32)p2, &r1);
780 carry = _addcarry_u32(carry, r2, (R128_U32)p3, &r2);
781 _addcarry_u32(carry, r3, 0, &r3);
782
783 R128_SET4(dst, r0, r1, r2, r3);
784#else
785 R128_U64 carry, lo, hi;
786 carry = ((R128_U64)(R128_U32)p1 + (R128_U64)(R128_U32)p2 + (p0 >> 32)) >> 32;
787
788 lo = p0 + ((p1 + p2) << 32);
789 hi = p3 + ((R128_U32)(p1 >> 32) + (R128_U32)(p2 >> 32)) + carry;
790
791 R128_SET2(dst, lo, hi);
792#endif
793 }
794#endif
795}
796
797// 128/64->64
798#if defined(_M_X64) && (_MSC_VER < 1920) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
799// MSVC x64 provides neither inline assembly nor (pre-2019) a div intrinsic, so we do fake
800// "inline assembly" to avoid long division or outline assembly.
801#pragma code_seg(".text")
802__declspec(allocate(".text") align(16)) static const unsigned char r128__udiv128Code[] = {
803 0x48, 0x8B, 0xC1, //mov rax, rcx
804 0x49, 0xF7, 0xF0, //div rax, r8
805 0x49, 0x89, 0x11, //mov qword ptr [r9], rdx
806 0xC3 //ret
807};
808typedef R128_U64 (*r128__udiv128Proc)(R128_U64 nlo, R128_U64 nhi, R128_U64 d, R128_U64 *rem);
809static const r128__udiv128Proc r128__udiv128 = (r128__udiv128Proc)(void*)r128__udiv128Code;
810#else
811static R128_U64 r128__udiv128(R128_U64 nlo, R128_U64 nhi, R128_U64 d, R128_U64 *rem)
812{
813#if defined(_M_X64) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
814 return _udiv128(nhi, nlo, d, rem);
815#elif defined(__x86_64__) && !defined(R128_STDC_ONLY)
816 R128_U64 q, r;
817 __asm("divq %4"
818 : "=a"(q), "=d"(r)
819 : "a"(nlo), "d"(nhi), "X"(d));
820 *rem = r;
821 return q;
822#else
823 R128_U64 tmp;
824 R128_U32 d0, d1;
825 R128_U32 n3, n2, n1, n0;
826 R128_U32 q0, q1;
827 R128_U32 r;
828 int shift;
829
830 R128_ASSERT(d != 0); //division by zero
831 R128_ASSERT(nhi < d); //overflow
832
833 // normalize
834 shift = r128__clz64(d);
835
836 if (shift) {
837 R128 tmp128;
838 R128_SET2(&tmp128, nlo, nhi);
839 r128Shl(&tmp128, &tmp128, shift);
840 n3 = R128_R3(&tmp128);
841 n2 = R128_R2(&tmp128);
842 n1 = R128_R1(&tmp128);
843 n0 = R128_R0(&tmp128);
844 d <<= shift;
845 } else {
846 n3 = (R128_U32)(nhi >> 32);
847 n2 = (R128_U32)nhi;
848 n1 = (R128_U32)(nlo >> 32);
849 n0 = (R128_U32)nlo;
850 }
851
852 d1 = (R128_U32)(d >> 32);
853 d0 = (R128_U32)d;
854
855 // first digit
856 R128_ASSERT(n3 <= d1);
857 if (n3 < d1) {
858 q1 = r128__udiv64(n2, n3, d1, &r);
859 } else {
860 q1 = 0xffffffffu;
861 r = n2 + d1;
862 }
863refine1:
864 if (r128__umul64(q1, d0) > ((R128_U64)r << 32) + n1) {
865 --q1;
866 if (r < ~d1 + 1) {
867 r += d1;
868 goto refine1;
869 }
870 }
871
872 tmp = ((R128_U64)n2 << 32) + n1 - (r128__umul64(q1, d0) + (r128__umul64(q1, d1) << 32));
873 n2 = (R128_U32)(tmp >> 32);
874 n1 = (R128_U32)tmp;
875
876 // second digit
877 R128_ASSERT(n2 <= d1);
878 if (n2 < d1) {
879 q0 = r128__udiv64(n1, n2, d1, &r);
880 } else {
881 q0 = 0xffffffffu;
882 r = n1 + d1;
883 }
884refine0:
885 if (r128__umul64(q0, d0) > ((R128_U64)r << 32) + n0) {
886 --q0;
887 if (r < ~d1 + 1) {
888 r += d1;
889 goto refine0;
890 }
891 }
892
893 tmp = ((R128_U64)n1 << 32) + n0 - (r128__umul64(q0, d0) + (r128__umul64(q0, d1) << 32));
894 n1 = (R128_U32)(tmp >> 32);
895 n0 = (R128_U32)tmp;
896
897 *rem = (((R128_U64)n1 << 32) + n0) >> shift;
898 return ((R128_U64)q1 << 32) + q0;
899#endif
900}
901#endif
902
903static int r128__ucmp(const R128 *a, const R128 *b)
904{
905 if (a->hi != b->hi) {
906 if (a->hi > b->hi) {
907 return 1;
908 } else {
909 return -1;
910 }
911 } else {
912 if (a->lo == b->lo) {
913 return 0;
914 } else if (a->lo > b->lo) {
915 return 1;
916 } else {
917 return -1;
918 }
919 }
920}
921
922static void r128__umul(R128 *dst, const R128 *a, const R128 *b)
923{
924#if defined(_M_X64) && !defined(R128_STDC_ONLY)
925 R128_U64 t0, t1;
926 R128_U64 lo, hi = 0;
927 unsigned char carry;
928
929 t0 = _umul128(a->lo, b->lo, &t1);
930 carry = _addcarry_u64(0, t1, t0 >> 63, &lo);
931 _addcarry_u64(carry, hi, hi, &hi);
932
933 t0 = _umul128(a->lo, b->hi, &t1);
934 carry = _addcarry_u64(0, lo, t0, &lo);
935 _addcarry_u64(carry, hi, t1, &hi);
936
937 t0 = _umul128(a->hi, b->lo, &t1);
938 carry = _addcarry_u64(0, lo, t0, &lo);
939 _addcarry_u64(carry, hi, t1, &hi);
940
941 t0 = _umul128(a->hi, b->hi, &t1);
942 hi += t0;
943
944 R128_SET2(dst, lo, hi);
945#elif defined(__x86_64__) && !defined(R128_STDC_ONLY)
946 unsigned __int128 p0, p1, p2, p3;
947 p0 = a->lo * (unsigned __int128)b->lo;
948 p1 = a->lo * (unsigned __int128)b->hi;
949 p2 = a->hi * (unsigned __int128)b->lo;
950 p3 = a->hi * (unsigned __int128)b->hi;
951
952 p0 = (p3 << 64) + p2 + p1 + (p0 >> 64) + ((R128_U64)p0 >> 63);
953 dst->lo = (R128_U64)p0;
954 dst->hi = (R128_U64)(p0 >> 64);
955#else
956 R128 p0, p1, p2, p3, round;
957
958 r128__umul128(&p0, a->lo, b->lo);
959 round.hi = 0; round.lo = p0.lo >> 63;
960 p0.lo = p0.hi; p0.hi = 0; //r128Shr(&p0, &p0, 64);
961 r128Add(&p0, &p0, &round);
962
963 r128__umul128(&p1, a->hi, b->lo);
964 r128Add(&p0, &p0, &p1);
965
966 r128__umul128(&p2, a->lo, b->hi);
967 r128Add(&p0, &p0, &p2);
968
969 r128__umul128(&p3, a->hi, b->hi);
970 p3.hi = p3.lo; p3.lo = 0; //r128Shl(&p3, &p3, 64);
971 r128Add(&p0, &p0, &p3);
972
973 R128_SET2(dst, p0.lo, p0.hi);
974#endif
975}
976
977// Shift d left until the high bit is set, and shift n left by the same amount.
978// returns non-zero on overflow.
979static int r128__norm(R128 *n, R128 *d, R128_U64 *n2)
980{
981 R128_U64 d0, d1;
982 R128_U64 n0, n1;
983 int shift;
984
985 d1 = d->hi;
986 d0 = d->lo;
987 n1 = n->hi;
988 n0 = n->lo;
989
990 if (d1) {
991 shift = r128__clz64(d1);
992 if (shift) {
993 d1 = (d1 << shift) | (d0 >> (64 - shift));
994 d0 = d0 << shift;
995 *n2 = n1 >> (64 - shift);
996 n1 = (n1 << shift) | (n0 >> (64 - shift));
997 n0 = n0 << shift;
998 } else {
999 *n2 = 0;
1000 }
1001 } else {
1002 shift = r128__clz64(d0);
1003 if (r128__clz64(n1) <= shift) {
1004 return 1; // overflow
1005 }
1006
1007 if (shift) {
1008 d1 = d0 << shift;
1009 d0 = 0;
1010 *n2 = (n1 << shift) | (n0 >> (64 - shift));
1011 n1 = n0 << shift;
1012 n0 = 0;
1013 } else {
1014 d1 = d0;
1015 d0 = 0;
1016 *n2 = n1;
1017 n1 = n0;
1018 n0 = 0;
1019 }
1020 }
1021
1022 R128_SET2(n, n0, n1);
1023 R128_SET2(d, d0, d1);
1024 return 0;
1025}
1026
1027static void r128__udiv(R128 *quotient, const R128 *dividend, const R128 *divisor)
1028{
1029 R128 tmp;
1030 R128_U64 d0, d1;
1031 R128_U64 n1, n2, n3;
1032 R128 q;
1033
1034 R128_ASSERT(dividend != NULL);
1035 R128_ASSERT(divisor != NULL);
1036 R128_ASSERT(quotient != NULL);
1037 R128_ASSERT(divisor->hi != 0 || divisor->lo != 0); // divide by zero
1038
1039 // scale dividend and normalize
1040 {
1041 R128 n, d;
1042 R128_SET2(&n, dividend->lo, dividend->hi);
1043 R128_SET2(&d, divisor->lo, divisor->hi);
1044 if (r128__norm(&n, &d, &n3)) {
1045 R128_SET2(quotient, R128_max.lo, R128_max.hi);
1046 return;
1047 }
1048
1049 d1 = d.hi;
1050 d0 = d.lo;
1051 n2 = n.hi;
1052 n1 = n.lo;
1053 }
1054
1055 // first digit
1056 R128_ASSERT(n3 <= d1);
1057 {
1058 R128 t0, t1;
1059 t0.lo = n1;
1060 if (n3 < d1) {
1061 q.hi = r128__udiv128(n2, n3, d1, &t0.hi);
1062 } else {
1063 q.hi = R128_LIT_U64(0xffffffffffffffff);
1064 t0.hi = n2 + d1;
1065 }
1066
1067refine1:
1068 r128__umul128(&t1, q.hi, d0);
1069 if (r128__ucmp(&t1, &t0) > 0) {
1070 --q.hi;
1071 if (t0.hi < ~d1 + 1) {
1072 t0.hi += d1;
1073 goto refine1;
1074 }
1075 }
1076 }
1077
1078 {
1079 R128 t0, t1, t2;
1080 t0.hi = n2;
1081 t0.lo = n1;
1082
1083 r128__umul128(&t1, q.hi, d0);
1084 r128__umul128(&t2, q.hi, d1);
1085
1086 t2.hi = t2.lo; t2.lo = 0; //r128Shl(&t2, &t2, 64);
1087 r128Add(&tmp, &t1, &t2);
1088 r128Sub(&tmp, &t0, &tmp);
1089 }
1090 n2 = tmp.hi;
1091 n1 = tmp.lo;
1092
1093 // second digit
1094 R128_ASSERT(n2 <= d1);
1095 {
1096 R128 t0, t1;
1097 t0.lo = 0;
1098 if (n2 < d1) {
1099 q.lo = r128__udiv128(n1, n2, d1, &t0.hi);
1100 } else {
1101 q.lo = R128_LIT_U64(0xffffffffffffffff);
1102 t0.hi = n1 + d1;
1103 }
1104
1105 refine0:
1106 r128__umul128(&t1, q.lo, d0);
1107 if (r128__ucmp(&t1, &t0) > 0) {
1108 --q.lo;
1109 if (t0.hi < ~d1 + 1) {
1110 t0.hi += d1;
1111 goto refine0;
1112 }
1113 }
1114 }
1115
1116 R128_SET2(quotient, q.lo, q.hi);
1117}
1118
1119static R128_U64 r128__umod(R128 *n, R128 *d)
1120{
1121 R128_U64 d0, d1;
1122 R128_U64 n3, n2, n1;
1123 R128_U64 q;
1124
1125 R128_ASSERT(d != NULL);
1126 R128_ASSERT(n != NULL);
1127 R128_ASSERT(d->hi != 0 || d->lo != 0); // divide by zero
1128
1129 if (r128__norm(n, d, &n3)) {
1130 return R128_LIT_U64(0xffffffffffffffff);
1131 }
1132
1133 d1 = d->hi;
1134 d0 = d->lo;
1135 n2 = n->hi;
1136 n1 = n->lo;
1137
1138 R128_ASSERT(n3 < d1);
1139 {
1140 R128 t0, t1;
1141 t0.lo = n1;
1142 q = r128__udiv128(n2, n3, d1, &t0.hi);
1143
1144 refine1:
1145 r128__umul128(&t1, q, d0);
1146 if (r128__ucmp(&t1, &t0) > 0) {
1147 --q;
1148 if (t0.hi < ~d1 + 1) {
1149 t0.hi += d1;
1150 goto refine1;
1151 }
1152 }
1153 }
1154
1155 return q;
1156}
1157
1158static int r128__format(char *dst, size_t dstSize, const R128 *v, const R128ToStringFormat *format)
1159{
1160 char buf[128];
1161 R128 tmp;
1162 R128_U64 whole;
1163 char *cursor, *decimal, *dstp = dst;
1164 int sign = 0;
1165 int fullPrecision = 1;
1166 int width, precision;
1167 int padCnt, trail = 0;
1168
1169 R128_ASSERT(dst != NULL && dstSize > 0);
1170 R128_ASSERT(v != NULL);
1171 R128_ASSERT(format != NULL);
1172
1173 --dstSize;
1174
1175 R128_SET2(&tmp, v->lo, v->hi);
1176 if (r128IsNeg(&tmp)) {
1177 r128__neg(&tmp, &tmp);
1178 sign = 1;
1179 }
1180
1181 width = format->width;
1182 if (width < 0) {
1183 width = 0;
1184 }
1185
1186 precision = format->precision;
1187 if (precision < 0) {
1188 // print a maximum of 20 digits
1189 fullPrecision = 0;
1190 precision = 20;
1191 } else if (precision > sizeof(buf) - 21) {
1192 trail = precision - (sizeof(buf) - 21);
1193 precision -= trail;
1194 }
1195
1196 whole = tmp.hi;
1197 decimal = cursor = buf;
1198
1199 // fractional part first in case a carry into the whole part is required
1200 if (tmp.lo || format->decimal) {
1201 while (tmp.lo || (fullPrecision && precision)) {
1202 if ((int)(cursor - buf) == precision) {
1203 if ((R128_S64)tmp.lo < 0) {
1204 // round up, propagate carry backwards
1205 char *c;
1206 for (c = cursor - 1; c >= buf; --c) {
1207 char d = ++*c;
1208 if (d <= '9') {
1209 goto endfrac;
1210 } else {
1211 *c = '0';
1212 }
1213 }
1214
1215 // carry out into the whole part
1216 whole++;
1217 }
1218
1219 break;
1220 }
1221
1222 r128__umul128(&tmp, tmp.lo, 10);
1223 *cursor++ = (char)tmp.hi + '0';
1224 }
1225
1226 endfrac:
1227 if (format->decimal || precision) {
1228 decimal = cursor;
1229 *cursor++ = R128_decimal;
1230 }
1231 }
1232
1233 // whole part
1234 do {
1235 char digit = (char)(whole % 10);
1236 whole /= 10;
1237 *cursor++ = digit + '0';
1238 } while (whole);
1239
1240#define R128__WRITE(c) do { if (dstp < dst + dstSize) *dstp = c; ++dstp; } while(0)
1241
1242 padCnt = width - (int)(cursor - buf) - 1;
1243
1244 // left padding
1245 if (!format->leftAlign) {
1246 char padChar = format->zeroPad ? '0' : ' ';
1247 if (format->zeroPad) {
1248 if (sign) {
1249 R128__WRITE('-');
1250 } else if (format->sign == R128ToStringSign_Plus) {
1251 R128__WRITE('+');
1252 } else if (format->sign == R128ToStringSign_Space) {
1253 R128__WRITE(' ');
1254 } else {
1255 ++padCnt;
1256 }
1257 }
1258
1259 for (; padCnt > 0; --padCnt) {
1260 R128__WRITE(padChar);
1261 }
1262 }
1263
1264 if (format->leftAlign || !format->zeroPad) {
1265 if (sign) {
1266 R128__WRITE('-');
1267 } else if (format->sign == R128ToStringSign_Plus) {
1268 R128__WRITE('+');
1269 } else if (format->sign == R128ToStringSign_Space) {
1270 R128__WRITE(' ');
1271 } else {
1272 ++padCnt;
1273 }
1274 }
1275
1276 {
1277 char *i;
1278
1279 // reverse the whole part
1280 for (i = cursor - 1; i >= decimal; --i) {
1281 R128__WRITE(*i);
1282 }
1283
1284 // copy the fractional part
1285 for (i = buf; i < decimal; ++i) {
1286 R128__WRITE(*i);
1287 }
1288 }
1289
1290 // right padding
1291 if (format->leftAlign) {
1292 char padChar = format->zeroPad ? '0' : ' ';
1293 for (; padCnt > 0; --padCnt) {
1294 R128__WRITE(padChar);
1295 }
1296 }
1297
1298 // trailing zeroes for very large precision
1299 while (trail--) {
1300 R128__WRITE('0');
1301 }
1302
1303#undef R128__WRITE
1304
1305 if (dstp <= dst + dstSize) {
1306 *dstp = '\0';
1307 } else {
1308 dst[dstSize] = '\0';
1309 }
1310 return (int)(dstp - dst);
1311}
1312
1313void r128FromInt(R128 *dst, R128_S64 v)
1314{
1315 R128_ASSERT(dst != NULL);
1316 dst->lo = 0;
1317 dst->hi = (R128_U64)v;
1318 R128_DEBUG_SET(dst);
1319}
1320
1321void r128FromFloat(R128 *dst, double v)
1322{
1323 R128_ASSERT(dst != NULL);
1324
1325 if (v < -9223372036854775808.0) {
1326 r128Copy(dst, &R128_min);
1327 } else if (v >= 9223372036854775808.0) {
1328 r128Copy(dst, &R128_max);
1329 } else {
1330 R128 r;
1331 int sign = 0;
1332
1333 if (v < 0) {
1334 v = -v;
1335 sign = 1;
1336 }
1337
1338 r.hi = (R128_U64)(R128_S64)v;
1339 v -= (R128_S64)v;
1340 r.lo = (R128_U64)(v * 18446744073709551616.0);
1341
1342 if (sign) {
1343 r128__neg(&r, &r);
1344 }
1345
1346 r128Copy(dst, &r);
1347 }
1348}
1349
1350void r128FromString(R128 *dst, const char *s, char **endptr)
1351{
1352 R128_U64 lo = 0, hi = 0;
1353 R128_U64 base = 10;
1354
1355 int sign = 0;
1356
1357 R128_ASSERT(dst != NULL);
1358 R128_ASSERT(s != NULL);
1359
1360 R128_SET2(dst, 0, 0);
1361
1362 // consume whitespace
1363 for (;;) {
1364 if (*s == ' ' || *s == '\t' || *s == '\r' || *s == '\n' || *s == '\v') {
1365 ++s;
1366 } else {
1367 break;
1368 }
1369 }
1370
1371 // sign
1372 if (*s == '-') {
1373 sign = 1;
1374 ++s;
1375 } else if (*s == '+') {
1376 ++s;
1377 }
1378
1379 // parse base prefix
1380 if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) {
1381 base = 16;
1382 s += 2;
1383 }
1384
1385 // whole part
1386 for (;; ++s) {
1387 R128_U64 digit;
1388
1389 if ('0' <= *s && *s <= '9') {
1390 digit = *s - '0';
1391 } else if (base == 16 && 'a' <= *s && *s <= 'f') {
1392 digit = *s - 'a' + 10;
1393 } else if (base == 16 && 'A' <= *s && *s <= 'F') {
1394 digit = *s - 'A' + 10;
1395 } else {
1396 break;
1397 }
1398
1399 hi = hi * base + digit;
1400 }
1401
1402 // fractional part
1403 if (*s == R128_decimal) {
1404 const char *exp = ++s;
1405
1406 // find the last digit and work backwards
1407 for (;; ++s) {
1408 if ('0' <= *s && *s <= '9') {
1409 } else if (base == 16 && ('a' <= *s && *s <= 'f')) {
1410 } else if (base == 16 && ('A' <= *s && *s <= 'F')) {
1411 } else {
1412 break;
1413 }
1414 }
1415
1416 for (--s; s >= exp; --s) {
1417 R128_U64 digit, unused;
1418
1419 if ('0' <= *s && *s <= '9') {
1420 digit = *s - '0';
1421 } else if ('a' <= *s && *s <= 'f') {
1422 digit = *s - 'a' + 10;
1423 } else {
1424 digit = *s - 'A' + 10;
1425 }
1426
1427 lo = r128__udiv128(lo, digit, base, &unused);
1428 }
1429 }
1430
1431 R128_SET2(dst, lo, hi);
1432 if (sign) {
1433 r128__neg(dst, dst);
1434 }
1435
1436 if (endptr) {
1437 *endptr = (char *) s;
1438 }
1439}
1440
1441R128_S64 r128ToInt(const R128 *v)
1442{
1443 R128_ASSERT(v != NULL);
1444 return (R128_S64)v->hi;
1445}
1446
1447double r128ToFloat(const R128 *v)
1448{
1449 R128 tmp;
1450 int sign = 0;
1451 double d;
1452
1453 R128_ASSERT(v != NULL);
1454
1455 R128_SET2(&tmp, v->lo, v->hi);
1456 if (r128IsNeg(&tmp)) {
1457 r128__neg(&tmp, &tmp);
1458 sign = 1;
1459 }
1460
1461 d = tmp.hi + tmp.lo * (1 / 18446744073709551616.0);
1462 if (sign) {
1463 d = -d;
1464 }
1465
1466 return d;
1467}
1468
1469int r128ToStringOpt(char *dst, size_t dstSize, const R128 *v, const R128ToStringFormat *opt)
1470{
1471 return r128__format(dst, dstSize, v, opt);
1472}
1473
1474int r128ToStringf(char *dst, size_t dstSize, const char *format, const R128 *v)
1475{
1476 R128ToStringFormat opts;
1477
1478 R128_ASSERT(dst != NULL && dstSize);
1479 R128_ASSERT(format != NULL);
1480 R128_ASSERT(v != NULL);
1481
1482 opts.sign = R128__defaultFormat.sign;
1483 opts.precision = R128__defaultFormat.precision;
1484 opts.zeroPad = R128__defaultFormat.zeroPad;
1485 opts.decimal = R128__defaultFormat.decimal;
1486 opts.leftAlign = R128__defaultFormat.leftAlign;
1487
1488 if (*format == '%') {
1489 ++format;
1490 }
1491
1492 // flags field
1493 for (;; ++format) {
1494 if (*format == ' ' && opts.sign != R128ToStringSign_Plus) {
1495 opts.sign = R128ToStringSign_Space;
1496 } else if (*format == '+') {
1497 opts.sign = R128ToStringSign_Plus;
1498 } else if (*format == '0') {
1499 opts.zeroPad = 1;
1500 } else if (*format == '-') {
1501 opts.leftAlign = 1;
1502 } else if (*format == '#') {
1503 opts.decimal = 1;
1504 } else {
1505 break;
1506 }
1507 }
1508
1509 // width field
1510 opts.width = 0;
1511 for (;;) {
1512 if ('0' <= *format && *format <= '9') {
1513 opts.width = opts.width * 10 + *format++ - '0';
1514 } else {
1515 break;
1516 }
1517 }
1518
1519 // precision field
1520 if (*format == '.') {
1521 opts.precision = 0;
1522 ++format;
1523 for (;;) {
1524 if ('0' <= *format && *format <= '9') {
1525 opts.precision = opts.precision * 10 + *format++ - '0';
1526 } else {
1527 break;
1528 }
1529 }
1530 }
1531
1532 return r128__format(dst, dstSize, v, &opts);
1533}
1534
1535int r128ToString(char *dst, size_t dstSize, const R128 *v)
1536{
1537 return r128__format(dst, dstSize, v, &R128__defaultFormat);
1538}
1539
1540void r128Copy(R128 *dst, const R128 *src)
1541{
1542 R128_ASSERT(dst != NULL);
1543 R128_ASSERT(src != NULL);
1544 dst->lo = src->lo;
1545 dst->hi = src->hi;
1546 R128_DEBUG_SET(dst);
1547}
1548
1549void r128Neg(R128 *dst, const R128 *src)
1550{
1551 r128__neg(dst, src);
1552 R128_DEBUG_SET(dst);
1553}
1554
1555void r128Not(R128 *dst, const R128 *src)
1556{
1557 R128_ASSERT(dst != NULL);
1558 R128_ASSERT(src != NULL);
1559
1560 dst->lo = ~src->lo;
1561 dst->hi = ~src->hi;
1562 R128_DEBUG_SET(dst);
1563}
1564
1565void r128Or(R128 *dst, const R128 *a, const R128 *b)
1566{
1567 R128_ASSERT(dst != NULL);
1568 R128_ASSERT(a != NULL);
1569 R128_ASSERT(b != NULL);
1570
1571 dst->lo = a->lo | b->lo;
1572 dst->hi = a->hi | b->hi;
1573 R128_DEBUG_SET(dst);
1574}
1575
1576void r128And(R128 *dst, const R128 *a, const R128 *b)
1577{
1578 R128_ASSERT(dst != NULL);
1579 R128_ASSERT(a != NULL);
1580 R128_ASSERT(b != NULL);
1581
1582 dst->lo = a->lo & b->lo;
1583 dst->hi = a->hi & b->hi;
1584 R128_DEBUG_SET(dst);
1585}
1586
1587void r128Xor(R128 *dst, const R128 *a, const R128 *b)
1588{
1589 R128_ASSERT(dst != NULL);
1590 R128_ASSERT(a != NULL);
1591 R128_ASSERT(b != NULL);
1592
1593 dst->lo = a->lo ^ b->lo;
1594 dst->hi = a->hi ^ b->hi;
1595 R128_DEBUG_SET(dst);
1596}
1597
1598void r128Shl(R128 *dst, const R128 *src, int amount)
1599{
1600 R128_U64 r[4];
1601
1602 R128_ASSERT(dst != NULL);
1603 R128_ASSERT(src != NULL);
1604
1605#if defined(_M_IX86) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
1606 __asm {
1607 // load src
1608 mov edx, dword ptr[src]
1609 mov ecx, amount
1610
1611 mov edi, dword ptr[edx]
1612 mov esi, dword ptr[edx + 4]
1613 mov ebx, dword ptr[edx + 8]
1614 mov eax, dword ptr[edx + 12]
1615
1616 // shift mod 32
1617 shld eax, ebx, cl
1618 shld ebx, esi, cl
1619 shld esi, edi, cl
1620 shl edi, cl
1621
1622 // clear out low 12 bytes of stack
1623 xor edx, edx
1624 mov dword ptr[r], edx
1625 mov dword ptr[r + 4], edx
1626 mov dword ptr[r + 8], edx
1627
1628 // store shifted amount offset by count/32 bits
1629 shr ecx, 5
1630 and ecx, 3
1631 mov dword ptr[r + ecx * 4 + 0], edi
1632 mov dword ptr[r + ecx * 4 + 4], esi
1633 mov dword ptr[r + ecx * 4 + 8], ebx
1634 mov dword ptr[r + ecx * 4 + 12], eax
1635 }
1636#else
1637
1638 r[0] = src->lo;
1639 r[1] = src->hi;
1640
1641 amount &= 127;
1642 if (amount >= 64) {
1643 r[1] = r[0] << (amount - 64);
1644 r[0] = 0;
1645 } else if (amount) {
1646# ifdef _M_X64
1647 r[1] = __shiftleft128(r[0], r[1], (char) amount);
1648# else
1649 r[1] = (r[1] << amount) | (r[0] >> (64 - amount));
1650# endif
1651 r[0] = r[0] << amount;
1652 }
1653#endif //_M_IX86
1654
1655 dst->lo = r[0];
1656 dst->hi = r[1];
1657 R128_DEBUG_SET(dst);
1658}
1659
1660void r128Shr(R128 *dst, const R128 *src, int amount)
1661{
1662 R128_U64 r[4];
1663
1664 R128_ASSERT(dst != NULL);
1665 R128_ASSERT(src != NULL);
1666
1667#if defined(_M_IX86) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
1668 __asm {
1669 // load src
1670 mov edx, dword ptr[src]
1671 mov ecx, amount
1672
1673 mov edi, dword ptr[edx]
1674 mov esi, dword ptr[edx + 4]
1675 mov ebx, dword ptr[edx + 8]
1676 mov eax, dword ptr[edx + 12]
1677
1678 // shift mod 32
1679 shrd edi, esi, cl
1680 shrd esi, ebx, cl
1681 shrd ebx, eax, cl
1682 shr eax, cl
1683
1684 // clear out high 12 bytes of stack
1685 xor edx, edx
1686 mov dword ptr[r + 20], edx
1687 mov dword ptr[r + 24], edx
1688 mov dword ptr[r + 28], edx
1689
1690 // store shifted amount offset by -count/32 bits
1691 shr ecx, 5
1692 and ecx, 3
1693 neg ecx
1694 mov dword ptr[r + ecx * 4 + 16], edi
1695 mov dword ptr[r + ecx * 4 + 20], esi
1696 mov dword ptr[r + ecx * 4 + 24], ebx
1697 mov dword ptr[r + ecx * 4 + 28], eax
1698 }
1699#else
1700 r[2] = src->lo;
1701 r[3] = src->hi;
1702
1703 amount &= 127;
1704 if (amount >= 64) {
1705 r[2] = r[3] >> (amount - 64);
1706 r[3] = 0;
1707 } else if (amount) {
1708#ifdef _M_X64
1709 r[2] = __shiftright128(r[2], r[3], (char) amount);
1710#else
1711 r[2] = (r[2] >> amount) | (r[3] << (64 - amount));
1712#endif
1713 r[3] = r[3] >> amount;
1714 }
1715#endif
1716
1717 dst->lo = r[2];
1718 dst->hi = r[3];
1719 R128_DEBUG_SET(dst);
1720}
1721
1722void r128Sar(R128 *dst, const R128 *src, int amount)
1723{
1724 R128_U64 r[4];
1725
1726 R128_ASSERT(dst != NULL);
1727 R128_ASSERT(src != NULL);
1728
1729#if defined(_M_IX86) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
1730 __asm {
1731 // load src
1732 mov edx, dword ptr[src]
1733 mov ecx, amount
1734
1735 mov edi, dword ptr[edx]
1736 mov esi, dword ptr[edx + 4]
1737 mov ebx, dword ptr[edx + 8]
1738 mov eax, dword ptr[edx + 12]
1739
1740 // shift mod 32
1741 shrd edi, esi, cl
1742 shrd esi, ebx, cl
1743 shrd ebx, eax, cl
1744 sar eax, cl
1745
1746 // copy sign to high 12 bytes of stack
1747 cdq
1748 mov dword ptr[r + 20], edx
1749 mov dword ptr[r + 24], edx
1750 mov dword ptr[r + 28], edx
1751
1752 // store shifted amount offset by -count/32 bits
1753 shr ecx, 5
1754 and ecx, 3
1755 neg ecx
1756 mov dword ptr[r + ecx * 4 + 16], edi
1757 mov dword ptr[r + ecx * 4 + 20], esi
1758 mov dword ptr[r + ecx * 4 + 24], ebx
1759 mov dword ptr[r + ecx * 4 + 28], eax
1760 }
1761#else
1762 r[2] = src->lo;
1763 r[3] = src->hi;
1764
1765 amount &= 127;
1766 if (amount >= 64) {
1767 r[2] = (R128_U64)((R128_S64)r[3] >> (amount - 64));
1768 r[3] = (R128_U64)((R128_S64)r[3] >> 63);
1769 } else if (amount) {
1770 r[2] = (r[2] >> amount) | (R128_U64)((R128_S64)r[3] << (64 - amount));
1771 r[3] = (R128_U64)((R128_S64)r[3] >> amount);
1772 }
1773#endif
1774
1775 dst->lo = r[2];
1776 dst->hi = r[3];
1777 R128_DEBUG_SET(dst);
1778}
1779
1780void r128Add(R128 *dst, const R128 *a, const R128 *b)
1781{
1782 unsigned char carry = 0;
1783 R128_ASSERT(dst != NULL);
1784 R128_ASSERT(a != NULL);
1785 R128_ASSERT(b != NULL);
1786
1787#if R128_INTEL && !defined(R128_STDC_ONLY)
1788# if R128_64BIT
1789 carry = _addcarry_u64(carry, a->lo, b->lo, &dst->lo);
1790 carry = _addcarry_u64(carry, a->hi, b->hi, &dst->hi);
1791# else
1792 R128_U32 r0, r1, r2, r3;
1793 carry = _addcarry_u32(carry, R128_R0(a), R128_R0(b), &r0);
1794 carry = _addcarry_u32(carry, R128_R1(a), R128_R1(b), &r1);
1795 carry = _addcarry_u32(carry, R128_R2(a), R128_R2(b), &r2);
1796 carry = _addcarry_u32(carry, R128_R3(a), R128_R3(b), &r3);
1797 R128_SET4(dst, r0, r1, r2, r3);
1798# endif //R128_64BIT
1799#else
1800 {
1801 R128_U64 r = a->lo + b->lo;
1802 carry = r < a->lo;
1803 dst->lo = r;
1804 dst->hi = a->hi + b->hi + carry;
1805 }
1806#endif //R128_INTEL
1807
1808 R128_DEBUG_SET(dst);
1809}
1810
1811void r128Sub(R128 *dst, const R128 *a, const R128 *b)
1812{
1813 unsigned char borrow = 0;
1814 R128_ASSERT(dst != NULL);
1815 R128_ASSERT(a != NULL);
1816 R128_ASSERT(b != NULL);
1817
1818#if R128_INTEL && !defined(R128_STDC_ONLY)
1819# if R128_64BIT
1820 borrow = _subborrow_u64(borrow, a->lo, b->lo, &dst->lo);
1821 borrow = _subborrow_u64(borrow, a->hi, b->hi, &dst->hi);
1822# else
1823 R128_U32 r0, r1, r2, r3;
1824 borrow = _subborrow_u32(borrow, R128_R0(a), R128_R0(b), &r0);
1825 borrow = _subborrow_u32(borrow, R128_R1(a), R128_R1(b), &r1);
1826 borrow = _subborrow_u32(borrow, R128_R2(a), R128_R2(b), &r2);
1827 borrow = _subborrow_u32(borrow, R128_R3(a), R128_R3(b), &r3);
1828 R128_SET4(dst, r0, r1, r2, r3);
1829# endif //R128_64BIT
1830#else
1831 {
1832 R128_U64 r = a->lo - b->lo;
1833 borrow = r > a->lo;
1834 dst->lo = r;
1835 dst->hi = a->hi - b->hi - borrow;
1836 }
1837#endif //R128_INTEL
1838
1839 R128_DEBUG_SET(dst);
1840}
1841
1842void r128Mul(R128 *dst, const R128 *a, const R128 *b)
1843{
1844 int sign = 0;
1845 R128 ta, tb, tc;
1846
1847 R128_ASSERT(dst != NULL);
1848 R128_ASSERT(a != NULL);
1849 R128_ASSERT(b != NULL);
1850
1851 R128_SET2(&ta, a->lo, a->hi);
1852 R128_SET2(&tb, b->lo, b->hi);
1853
1854 if (r128IsNeg(&ta)) {
1855 r128__neg(&ta, &ta);
1856 sign = !sign;
1857 }
1858 if (r128IsNeg(&tb)) {
1859 r128__neg(&tb, &tb);
1860 sign = !sign;
1861 }
1862
1863 r128__umul(&tc, &ta, &tb);
1864 if (sign) {
1865 r128__neg(&tc, &tc);
1866 }
1867
1868 r128Copy(dst, &tc);
1869}
1870
1871void r128Div(R128 *dst, const R128 *a, const R128 *b)
1872{
1873 int sign = 0;
1874 R128 tn, td, tq;
1875
1876 R128_ASSERT(dst != NULL);
1877 R128_ASSERT(a != NULL);
1878 R128_ASSERT(b != NULL);
1879
1880 R128_SET2(&tn, a->lo, a->hi);
1881 R128_SET2(&td, b->lo, b->hi);
1882
1883 if (r128IsNeg(&tn)) {
1884 r128__neg(&tn, &tn);
1885 sign = !sign;
1886 }
1887
1888 if (td.lo == 0 && td.hi == 0) {
1889 // divide by zero
1890 if (sign) {
1891 r128Copy(dst, &R128_min);
1892 } else {
1893 r128Copy(dst, &R128_max);
1894 }
1895 return;
1896 } else if (r128IsNeg(&td)) {
1897 r128__neg(&td, &td);
1898 sign = !sign;
1899 }
1900
1901 r128__udiv(&tq, &tn, &td);
1902
1903 if (sign) {
1904 r128__neg(&tq, &tq);
1905 }
1906
1907 r128Copy(dst, &tq);
1908}
1909
1910void r128Mod(R128 *dst, const R128 *a, const R128 *b)
1911{
1912 int sign = 0;
1913 R128 tn, td, tq;
1914
1915 R128_ASSERT(dst != NULL);
1916 R128_ASSERT(a != NULL);
1917 R128_ASSERT(b != NULL);
1918
1919 R128_SET2(&tn, a->lo, a->hi);
1920 R128_SET2(&td, b->lo, b->hi);
1921
1922 if (r128IsNeg(&tn)) {
1923 r128__neg(&tn, &tn);
1924 sign = !sign;
1925 }
1926
1927 if (td.lo == 0 && td.hi == 0) {
1928 // divide by zero
1929 if (sign) {
1930 r128Copy(dst, &R128_min);
1931 } else {
1932 r128Copy(dst, &R128_max);
1933 }
1934 return;
1935 } else if (r128IsNeg(&td)) {
1936 r128__neg(&td, &td);
1937 sign = !sign;
1938 }
1939
1940 tq.hi = r128__umod(&tn, &td);
1941 tq.lo = 0;
1942
1943 if (sign) {
1944 tq.hi = ~tq.hi + 1;
1945 }
1946
1947 r128Mul(&tq, &tq, b);
1948 r128Sub(dst, a, &tq);
1949}
1950
1951void r128Rsqrt(R128 *dst, const R128 *v)
1952{
1953 static const R128 threeHalves = { R128_LIT_U64(0x8000000000000000), 1 };
1954 R128 x, est;
1955 int i;
1956
1957 if ((R128_S64)v->hi < 0) {
1958 r128Copy(dst, &R128_min);
1959 return;
1960 }
1961
1962 R128_SET2(&x, v->lo, v->hi);
1963
1964 // get initial estimate
1965 if (x.hi) {
1966 int shift = (64 + r128__clz64(x.hi)) >> 1;
1967 est.lo = R128_LIT_U64(1) << shift;
1968 est.hi = 0;
1969 } else if (x.lo) {
1970 int shift = r128__clz64(x.lo) >> 1;
1971 est.hi = R128_LIT_U64(1) << shift;
1972 est.lo = 0;
1973 } else {
1974 R128_SET2(dst, 0, 0);
1975 return;
1976 }
1977
1978 // x /= 2
1979 r128Shr(&x, &x, 1);
1980
1981 // Newton-Raphson iterate
1982 for (i = 0; i < 7; ++i) {
1983 R128 newEst;
1984
1985 // newEst = est * (threeHalves - (x / 2) * est * est);
1986 r128__umul(&newEst, &est, &est);
1987 r128__umul(&newEst, &newEst, &x);
1988 r128Sub(&newEst, &threeHalves, &newEst);
1989 r128__umul(&newEst, &est, &newEst);
1990
1991 if (newEst.lo == est.lo && newEst.hi == est.hi) {
1992 break;
1993 }
1994 R128_SET2(&est, newEst.lo, newEst.hi);
1995 }
1996
1997 r128Copy(dst, &est);
1998}
1999
2000void r128Sqrt(R128 *dst, const R128 *v)
2001{
2002 R128 x, est;
2003 int i;
2004
2005 if ((R128_S64)v->hi < 0) {
2006 r128Copy(dst, &R128_min);
2007 return;
2008 }
2009
2010 R128_SET2(&x, v->lo, v->hi);
2011
2012 // get initial estimate
2013 if (x.hi) {
2014 int shift = (63 - r128__clz64(x.hi)) >> 1;
2015 r128Shr(&est, &x, shift);
2016 } else if (x.lo) {
2017 int shift = (1 + r128__clz64(x.lo)) >> 1;
2018 r128Shl(&est, &x, shift);
2019 } else {
2020 R128_SET2(dst, 0, 0);
2021 return;
2022 }
2023
2024 // Newton-Raphson iterate
2025 for (i = 0; i < 7; ++i) {
2026 R128 newEst;
2027
2028 // newEst = (est + x / est) / 2
2029 r128__udiv(&newEst, &x, &est);
2030 r128Add(&newEst, &newEst, &est);
2031 r128Shr(&newEst, &newEst, 1);
2032
2033 if (newEst.lo == est.lo && newEst.hi == est.hi) {
2034 break;
2035 }
2036 R128_SET2(&est, newEst.lo, newEst.hi);
2037 }
2038
2039 r128Copy(dst, &est);
2040}
2041
2042int r128Cmp(const R128 *a, const R128 *b)
2043{
2044 R128_ASSERT(a != NULL);
2045 R128_ASSERT(b != NULL);
2046
2047 if (a->hi == b->hi) {
2048 if (a->lo == b->lo) {
2049 return 0;
2050 } else if (a->lo > b->lo) {
2051 return 1;
2052 } else {
2053 return -1;
2054 }
2055 } else if ((R128_S64)a->hi > (R128_S64)b->hi) {
2056 return 1;
2057 } else {
2058 return -1;
2059 }
2060}
2061
2062int r128IsNeg(const R128 *v)
2063{
2064 R128_ASSERT(v != NULL);
2065
2066 return (R128_S64)v->hi < 0;
2067}
2068
2069void r128Min(R128 *dst, const R128 *a, const R128 *b)
2070{
2071 R128_ASSERT(dst != NULL);
2072 R128_ASSERT(a != NULL);
2073 R128_ASSERT(b != NULL);
2074
2075 if (r128Cmp(a, b) < 0) {
2076 r128Copy(dst, a);
2077 } else {
2078 r128Copy(dst, b);
2079 }
2080}
2081
2082void r128Max(R128 *dst, const R128 *a, const R128 *b)
2083{
2084 R128_ASSERT(dst != NULL);
2085 R128_ASSERT(a != NULL);
2086 R128_ASSERT(b != NULL);
2087
2088 if (r128Cmp(a, b) > 0) {
2089 r128Copy(dst, a);
2090 } else {
2091 r128Copy(dst, b);
2092 }
2093}
2094
2095void r128Floor(R128 *dst, const R128 *v)
2096{
2097 R128_ASSERT(dst != NULL);
2098 R128_ASSERT(v != NULL);
2099
2100 if ((R128_S64)v->hi < 0) {
2101 dst->hi = v->hi - (v->lo != 0);
2102 } else {
2103 dst->hi = v->hi;
2104 }
2105 dst->lo = 0;
2106 R128_DEBUG_SET(dst);
2107}
2108
2109void r128Ceil(R128 *dst, const R128 *v)
2110{
2111 R128_ASSERT(dst != NULL);
2112 R128_ASSERT(v != NULL);
2113
2114 if ((R128_S64)v->hi > 0) {
2115 dst->hi = v->hi + (v->lo != 0);
2116 } else {
2117 dst->hi = v->hi;
2118 }
2119 dst->lo = 0;
2120 R128_DEBUG_SET(dst);
2121}
2122
2123#endif //R128_IMPLEMENTATION
2124