/* libdivide.h
   Copyright 2010 ridiculous_fish
*/
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"

#if defined(_WIN32) || defined(WIN32)
#define LIBDIVIDE_WINDOWS 1
#endif

#if defined(_MSC_VER)
#define LIBDIVIDE_VC 1
#endif

#ifdef __cplusplus
#include <cstdlib>
#include <cstdio>
#include <cassert>
#else
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#endif

#if ! LIBDIVIDE_HAS_STDINT_TYPES && (! LIBDIVIDE_VC || _MSC_VER >= 1600)
/* Only Visual C++ 2010 and later include stdint.h */
#include <stdint.h>
#define LIBDIVIDE_HAS_STDINT_TYPES 1
#endif

#if ! LIBDIVIDE_HAS_STDINT_TYPES
typedef __int32 int32_t;
typedef unsigned __int32 uint32_t;
typedef __int64 int64_t;
typedef unsigned __int64 uint64_t;
typedef __int8 int8_t;
typedef unsigned __int8 uint8_t;
#endif

#if LIBDIVIDE_USE_SSE2
    #include <emmintrin.h>
#endif

#if LIBDIVIDE_VC
    #include <intrin.h>
#endif

#ifndef __has_builtin
#define __has_builtin(x) 0 // Compatibility with non-clang compilers.
#endif

#ifdef __ICC
#define HAS_INT128_T 0
#else
#define HAS_INT128_T __LP64__
#endif

#if defined(__x86_64__) || defined(_WIN64) || defined(_M_X64)
#define LIBDIVIDE_IS_X86_64 1
#endif

#if defined(__i386__)
#define LIBDIVIDE_IS_i386 1
#endif

#if __GNUC__ || __clang__
#define LIBDIVIDE_GCC_STYLE_ASM 1
#endif


/* libdivide may use the pmuldq (vector signed 32x32->64 mult instruction) which is in SSE 4.1. However, signed multiplication can be emulated efficiently with unsigned multiplication, and SSE 4.1 is currently rare, so it is OK to not turn this on. */
#ifdef LIBDIVIDE_USE_SSE4_1
#include <smmintrin.h>
#endif

#ifdef __cplusplus
/* We place libdivide within the libdivide namespace, and that goes in an anonymous namespace so that the functions are only visible to files that #include this header and don't get external linkage. At least that's the theory. */
namespace {
namespace libdivide {
#endif
/* Explanation of the "more" field: bit 6 is the add indicator (LIBDIVIDE_ADD_MARKER). Bit 7 marks the shift path in the unsigned case; in the signed case bit 7 indicates a negative divisor (there the shift path is marked by bit 5 for s32, and by a magic number of 0 for s64). Bits 0-4 hold the shift value (bits 0-5 in the 64 bit case; in the 32 bit case, bit 5 is always 0). We use bit 7 as the "negative divisor indicator" so that we can use sign extension to efficiently go to a full-width -1.

u32: [0-4] shift value
     [5]   ignored
     [6]   add indicator
     [7]   shift path

s32: [0-4] shift value
     [5]   shift path
     [6]   add indicator
     [7]   indicates negative divisor

u64: [0-5] shift value
     [6]   add indicator
     [7]   shift path

s64: [0-5] shift value
     [6]   add indicator
     [7]   indicates negative divisor
     magic number of 0 indicates shift path (we ran out of bits!)
*/

enum {
    LIBDIVIDE_32_SHIFT_MASK = 0x1F,
    LIBDIVIDE_64_SHIFT_MASK = 0x3F,
    LIBDIVIDE_ADD_MARKER = 0x40,
    LIBDIVIDE_U32_SHIFT_PATH = 0x80,
    LIBDIVIDE_U64_SHIFT_PATH = 0x80,
    LIBDIVIDE_S32_SHIFT_PATH = 0x20,
    LIBDIVIDE_NEGATIVE_DIVISOR = 0x80
};
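
/* An illustrative decode (not part of the API): given a u32 divider whose
   "more" byte is 0x42,

       uint32_t shift     = 0x42 & LIBDIVIDE_32_SHIFT_MASK;  // == 2
       int use_add        = 0x42 & LIBDIVIDE_ADD_MARKER;     // nonzero: add path
       int use_shift_path = 0x42 & LIBDIVIDE_U32_SHIFT_PATH; // zero: mult path

   so that divider multiplies by the magic number, applies the add fixup, and
   then shifts right by 2. */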


struct libdivide_u32_t {
    uint32_t magic;
    uint8_t more;
};

struct libdivide_s32_t {
    int32_t magic;
    uint8_t more;
};

struct libdivide_u64_t {
    uint64_t magic;
    uint8_t more;
};

struct libdivide_s64_t {
    int64_t magic;
    uint8_t more;
};



#ifndef LIBDIVIDE_API
    #ifdef __cplusplus
        /* In C++, we don't want our public functions to be static, because they are used as template arguments, and static functions can't be. They get internal linkage by virtue of the anonymous namespace. In C, they should be static. */
        #define LIBDIVIDE_API
    #else
        #define LIBDIVIDE_API static
    #endif
#endif

#ifdef __APPLE__
typedef signed long Int64;
typedef unsigned long UInt64;
#endif

LIBDIVIDE_API struct libdivide_s32_t libdivide_s32_gen(int32_t y);
LIBDIVIDE_API struct libdivide_u32_t libdivide_u32_gen(uint32_t y);
LIBDIVIDE_API struct libdivide_s64_t libdivide_s64_gen(int64_t y);
LIBDIVIDE_API struct libdivide_u64_t libdivide_u64_gen(uint64_t y);
#if defined(__APPLE__) && defined(__cplusplus)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-function"
LIBDIVIDE_API struct libdivide_s64_t libdivide_s64_gen(Int64 y) { return libdivide_s64_gen(int64_t(y)); }
LIBDIVIDE_API struct libdivide_u64_t libdivide_u64_gen(UInt64 y) { return libdivide_u64_gen(uint64_t(y)); }
#pragma GCC diagnostic pop
#endif

LIBDIVIDE_API int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom);
LIBDIVIDE_API uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom);
LIBDIVIDE_API int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom);
LIBDIVIDE_API uint64_t libdivide_u64_do(uint64_t y, const struct libdivide_u64_t *denom);
#if defined(__APPLE__) && defined(__cplusplus)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-function"
LIBDIVIDE_API Int64 libdivide_s64_do(Int64 numer, const struct libdivide_s64_t *denom) { return Int64(libdivide_s64_do(int64_t(numer), denom)); }
LIBDIVIDE_API UInt64 libdivide_u64_do(UInt64 y, const struct libdivide_u64_t *denom) { return UInt64(libdivide_u64_do(uint64_t(y), denom)); }
#pragma GCC diagnostic pop
#endif

LIBDIVIDE_API int libdivide_u32_get_algorithm(const struct libdivide_u32_t *denom);
LIBDIVIDE_API uint32_t libdivide_u32_do_alg0(uint32_t numer, const struct libdivide_u32_t *denom);
LIBDIVIDE_API uint32_t libdivide_u32_do_alg1(uint32_t numer, const struct libdivide_u32_t *denom);
LIBDIVIDE_API uint32_t libdivide_u32_do_alg2(uint32_t numer, const struct libdivide_u32_t *denom);

LIBDIVIDE_API int libdivide_u64_get_algorithm(const struct libdivide_u64_t *denom);
LIBDIVIDE_API uint64_t libdivide_u64_do_alg0(uint64_t numer, const struct libdivide_u64_t *denom);
LIBDIVIDE_API uint64_t libdivide_u64_do_alg1(uint64_t numer, const struct libdivide_u64_t *denom);
LIBDIVIDE_API uint64_t libdivide_u64_do_alg2(uint64_t numer, const struct libdivide_u64_t *denom);
#if defined(__APPLE__) && defined(__cplusplus)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-function"
LIBDIVIDE_API UInt64 libdivide_u64_do_alg0(UInt64 numer, const struct libdivide_u64_t *denom) { return UInt64(libdivide_u64_do_alg0(uint64_t(numer), denom)); }
LIBDIVIDE_API UInt64 libdivide_u64_do_alg1(UInt64 numer, const struct libdivide_u64_t *denom) { return UInt64(libdivide_u64_do_alg1(uint64_t(numer), denom)); }
LIBDIVIDE_API UInt64 libdivide_u64_do_alg2(UInt64 numer, const struct libdivide_u64_t *denom) { return UInt64(libdivide_u64_do_alg2(uint64_t(numer), denom)); }
#pragma GCC diagnostic pop
#endif

LIBDIVIDE_API int libdivide_s32_get_algorithm(const struct libdivide_s32_t *denom);
LIBDIVIDE_API int32_t libdivide_s32_do_alg0(int32_t numer, const struct libdivide_s32_t *denom);
LIBDIVIDE_API int32_t libdivide_s32_do_alg1(int32_t numer, const struct libdivide_s32_t *denom);
LIBDIVIDE_API int32_t libdivide_s32_do_alg2(int32_t numer, const struct libdivide_s32_t *denom);
LIBDIVIDE_API int32_t libdivide_s32_do_alg3(int32_t numer, const struct libdivide_s32_t *denom);
LIBDIVIDE_API int32_t libdivide_s32_do_alg4(int32_t numer, const struct libdivide_s32_t *denom);

LIBDIVIDE_API int libdivide_s64_get_algorithm(const struct libdivide_s64_t *denom);
LIBDIVIDE_API int64_t libdivide_s64_do_alg0(int64_t numer, const struct libdivide_s64_t *denom);
LIBDIVIDE_API int64_t libdivide_s64_do_alg1(int64_t numer, const struct libdivide_s64_t *denom);
LIBDIVIDE_API int64_t libdivide_s64_do_alg2(int64_t numer, const struct libdivide_s64_t *denom);
LIBDIVIDE_API int64_t libdivide_s64_do_alg3(int64_t numer, const struct libdivide_s64_t *denom);
LIBDIVIDE_API int64_t libdivide_s64_do_alg4(int64_t numer, const struct libdivide_s64_t *denom);
#if defined(__APPLE__) && defined(__cplusplus)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-function"
LIBDIVIDE_API Int64 libdivide_s64_do_alg0(Int64 numer, const struct libdivide_s64_t *denom) { return Int64(libdivide_s64_do_alg0(int64_t(numer), denom)); }
LIBDIVIDE_API Int64 libdivide_s64_do_alg1(Int64 numer, const struct libdivide_s64_t *denom) { return Int64(libdivide_s64_do_alg1(int64_t(numer), denom)); }
LIBDIVIDE_API Int64 libdivide_s64_do_alg2(Int64 numer, const struct libdivide_s64_t *denom) { return Int64(libdivide_s64_do_alg2(int64_t(numer), denom)); }
LIBDIVIDE_API Int64 libdivide_s64_do_alg3(Int64 numer, const struct libdivide_s64_t *denom) { return Int64(libdivide_s64_do_alg3(int64_t(numer), denom)); }
LIBDIVIDE_API Int64 libdivide_s64_do_alg4(Int64 numer, const struct libdivide_s64_t *denom) { return Int64(libdivide_s64_do_alg4(int64_t(numer), denom)); }
#pragma GCC diagnostic pop
#endif


#if LIBDIVIDE_USE_SSE2
LIBDIVIDE_API __m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t * denom);
LIBDIVIDE_API __m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t * denom);
LIBDIVIDE_API __m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t * denom);
LIBDIVIDE_API __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t * denom);

LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg0(__m128i numers, const struct libdivide_u32_t * denom);
LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg1(__m128i numers, const struct libdivide_u32_t * denom);
LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg2(__m128i numers, const struct libdivide_u32_t * denom);

LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg0(__m128i numers, const struct libdivide_s32_t * denom);
LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg1(__m128i numers, const struct libdivide_s32_t * denom);
LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg2(__m128i numers, const struct libdivide_s32_t * denom);
LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg3(__m128i numers, const struct libdivide_s32_t * denom);
LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg4(__m128i numers, const struct libdivide_s32_t * denom);

LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg0(__m128i numers, const struct libdivide_u64_t * denom);
LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg1(__m128i numers, const struct libdivide_u64_t * denom);
LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg2(__m128i numers, const struct libdivide_u64_t * denom);

LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg0(__m128i numers, const struct libdivide_s64_t * denom);
LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg1(__m128i numers, const struct libdivide_s64_t * denom);
LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg2(__m128i numers, const struct libdivide_s64_t * denom);
LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg3(__m128i numers, const struct libdivide_s64_t * denom);
LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg4(__m128i numers, const struct libdivide_s64_t * denom);
#endif



//////// Internal Utility Functions

static inline uint32_t libdivide__mullhi_u32(uint32_t x, uint32_t y) {
    uint64_t xl = x, yl = y;
    uint64_t rl = xl * yl;
    return (uint32_t)(rl >> 32);
}

static uint64_t libdivide__mullhi_u64(uint64_t x, uint64_t y) {
#if HAS_INT128_T
    __uint128_t xl = x, yl = y;
    __uint128_t rl = xl * yl;
    return (uint64_t)(rl >> 64);
#else
    //full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
    const uint32_t mask = 0xFFFFFFFF;
    const uint32_t x0 = (uint32_t)(x & mask), x1 = (uint32_t)(x >> 32);
    const uint32_t y0 = (uint32_t)(y & mask), y1 = (uint32_t)(y >> 32);
    const uint32_t x0y0_hi = libdivide__mullhi_u32(x0, y0);
    const uint64_t x0y1 = x0 * (uint64_t)y1;
    const uint64_t x1y0 = x1 * (uint64_t)y0;
    const uint64_t x1y1 = x1 * (uint64_t)y1;

    uint64_t temp = x1y0 + x0y0_hi;
    uint64_t temp_lo = temp & mask, temp_hi = temp >> 32;
    return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32);
#endif
}
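
/* A quick sanity check of the decomposition above (illustrative only): for
   x = y = 1ULL << 32 we have x1 = y1 = 1 and x0 = y0 = 0, so every partial
   product except x1y1 is zero and the function returns x1y1 = 1, which is
   exactly the high 64 bits of 2**64. */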

static inline int64_t libdivide__mullhi_s64(int64_t x, int64_t y) {
#if HAS_INT128_T
    __int128_t xl = x, yl = y;
    __int128_t rl = xl * yl;
    return (int64_t)(rl >> 64);
#else
    //full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
    const uint32_t mask = 0xFFFFFFFF;
    const uint32_t x0 = (uint32_t)(x & mask), y0 = (uint32_t)(y & mask);
    const int32_t x1 = (int32_t)(x >> 32), y1 = (int32_t)(y >> 32);
    const uint32_t x0y0_hi = libdivide__mullhi_u32(x0, y0);
    const int64_t t = x1*(int64_t)y0 + x0y0_hi;
    const int64_t w1 = x0*(int64_t)y1 + (t & mask);
    return x1*(int64_t)y1 + (t >> 32) + (w1 >> 32);
#endif
}

#if LIBDIVIDE_USE_SSE2

static inline __m128i libdivide__u64_to_m128(uint64_t x) {
#if LIBDIVIDE_VC && ! _WIN64
    //32 bit Visual C++ doesn't seem to have a working version of these load/set intrinsics, so go through aligned memory instead
    __declspec(align(16)) uint64_t temp[2] = {x, x};
    return _mm_load_si128((const __m128i*)temp);
#elif defined(__ICC)
    uint64_t __attribute__((aligned(16))) temp[2] = {x,x};
    return _mm_load_si128((const __m128i*)temp);
#elif __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wc++11-narrowing" // narrowing from uint64_t (aka 'unsigned long') to 'long long'
    // clang does not provide this intrinsic either
    return (__m128i){x, x};
#pragma clang diagnostic pop
#else
    // everyone else gets it right
    return _mm_set1_epi64x(x);
#endif
}

static inline __m128i libdivide_get_FFFFFFFF00000000(void) {
    //returns the same as _mm_set1_epi64(0xFFFFFFFF00000000ULL) without touching memory
    __m128i result = _mm_set1_epi8(-1); //optimizes to pcmpeqd on OS X
    return _mm_slli_epi64(result, 32);
}

static inline __m128i libdivide_get_00000000FFFFFFFF(void) {
    //returns the same as _mm_set1_epi64(0x00000000FFFFFFFFULL) without touching memory
    __m128i result = _mm_set1_epi8(-1); //optimizes to pcmpeqd on OS X
    result = _mm_srli_epi64(result, 32);
    return result;
}

#if __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wuninitialized"
#endif
static inline __m128i libdivide_get_0000FFFF(void) {
    //returns the same as _mm_set1_epi32(0x0000FFFF) without touching memory
    __m128i result; //we don't care what its contents are
    result = _mm_cmpeq_epi8(result, result); //all 1s
    result = _mm_srli_epi32(result, 16);
    return result;
}
#if __clang__
#pragma clang diagnostic pop
#endif

/* gcc-8 on PowerPC forgot to define _MM_SHUFFLE in its xmmintrin.h; this is fixed on trunk: https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/xmmintrin.h#L61 */
#if defined(__PPC__)
#ifndef _MM_SHUFFLE
#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
#endif
#endif

static inline __m128i libdivide_s64_signbits(__m128i v) {
    //we want to compute v >> 63, that is, _mm_srai_epi64(v, 63). But there is no 64 bit shift right arithmetic instruction in SSE2. So we have to fake it by first duplicating the high 32 bit values, and then using a 32 bit shift. Another option would be to use _mm_srli_epi64(v, 63) and then subtract that from 0, but that approach appears to be substantially slower for unknown reasons
    __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
    __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31);
    return signBits;
}

/* Returns an __m128i whose low 32 bits are equal to amt and has zero elsewhere. */
static inline __m128i libdivide_u32_to_m128i(uint32_t amt) {
    return _mm_set_epi32(0, 0, 0, amt);
}

static inline __m128i libdivide_s64_shift_right_vector(__m128i v, int amt) {
    //implementation of _mm_sra_epi64. Here we have two 64 bit values which are shifted right logically to become (64 - amt) bit values, and are then sign extended back to 64 bits.
    const int b = 64 - amt;
    __m128i m = libdivide__u64_to_m128(1ULL << (b - 1));
    __m128i x = _mm_srl_epi64(v, libdivide_u32_to_m128i(amt));
    __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m); //result = x^m - m
    return result;
}
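
/* The x^m - m step is the standard "sign extend from b bits" trick: m holds
   the b-bit sign bit, XORing flips it, and the subtraction propagates it into
   the high bits. An illustrative 8-bit example (not part of the library),
   sign extending the 4-bit value x = 0xF (i.e. -1):

       m = 1 << 3;          // 0x08
       r = (0xF ^ m) - m;   // 0x07 - 0x08 = -1 == 0xFF in 8 bits

   which matches what an arithmetic shift would have produced. */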

/* Here, b is assumed to contain one 32 bit value repeated four times. If it did not, the function would not work. */
static inline __m128i libdivide__mullhi_u32_flat_vector(__m128i a, __m128i b) {
    __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32);
    __m128i a1X3X = _mm_srli_epi64(a, 32);
    __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), libdivide_get_FFFFFFFF00000000());
    return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); // = hi_product_0123
}


/* Here, y is assumed to contain one 64 bit value repeated twice. */
static inline __m128i libdivide_mullhi_u64_flat_vector(__m128i x, __m128i y) {
    //full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
    const __m128i mask = libdivide_get_00000000FFFFFFFF();
    const __m128i x0 = _mm_and_si128(x, mask), x1 = _mm_srli_epi64(x, 32); //x0 is low half of 2 64 bit values, x1 is high half in low slots
    const __m128i y0 = _mm_and_si128(y, mask), y1 = _mm_srli_epi64(y, 32);
    const __m128i x0y0_hi = _mm_srli_epi64(_mm_mul_epu32(x0, y0), 32); //x0 happens to have the low half of the two 64 bit values in 32 bit slots 0 and 2, so _mm_mul_epu32 computes their full product, and then we shift right by 32 to get just the high values
    const __m128i x0y1 = _mm_mul_epu32(x0, y1);
    const __m128i x1y0 = _mm_mul_epu32(x1, y0);
    const __m128i x1y1 = _mm_mul_epu32(x1, y1);

    const __m128i temp = _mm_add_epi64(x1y0, x0y0_hi);
    __m128i temp_lo = _mm_and_si128(temp, mask), temp_hi = _mm_srli_epi64(temp, 32);
    temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32);
    temp_hi = _mm_add_epi64(x1y1, temp_hi);

    return _mm_add_epi64(temp_lo, temp_hi);
}

/* y is one 64 bit value repeated twice */
static inline __m128i libdivide_mullhi_s64_flat_vector(__m128i x, __m128i y) {
    __m128i p = libdivide_mullhi_u64_flat_vector(x, y);
    __m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y);
    p = _mm_sub_epi64(p, t1);
    __m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x);
    p = _mm_sub_epi64(p, t2);
    return p;
}

#ifdef LIBDIVIDE_USE_SSE4_1

/* b is one 32 bit value repeated four times. */
static inline __m128i libdivide_mullhi_s32_flat_vector(__m128i a, __m128i b) {
    __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epi32(a, b), 32);
    __m128i a1X3X = _mm_srli_epi64(a, 32);
    __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epi32(a1X3X, b), libdivide_get_FFFFFFFF00000000());
    return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); // = hi_product_0123
}

#else

/* SSE2 does not have a signed multiplication instruction, but we can convert unsigned to signed pretty efficiently. Again, b is just a 32 bit value repeated four times. */
static inline __m128i libdivide_mullhi_s32_flat_vector(__m128i a, __m128i b) {
    __m128i p = libdivide__mullhi_u32_flat_vector(a, b);
    __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b); //t1 = (a >> 31) & b, arithmetic shift
    __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a);
    p = _mm_sub_epi32(p, t1);
    p = _mm_sub_epi32(p, t2);
    return p;
}
#endif
#endif

static inline int32_t libdivide__count_trailing_zeros32(uint32_t val) {
#if __GNUC__ || __has_builtin(__builtin_ctz)
    /* Fast way to count trailing zeros */
    return __builtin_ctz(val);
#elif LIBDIVIDE_VC
    unsigned long result;
    if (_BitScanForward(&result, val)) {
        return result;
    }
    return 0;
#else
    /* Dorky way to count trailing zeros. Note that this hangs for val = 0! */
    int32_t result = 0;
    val = (val ^ (val - 1)) >> 1; // Set v's trailing 0s to 1s and zero rest
    while (val) {
        val >>= 1;
        result++;
    }
    return result;
#endif
}

static inline int32_t libdivide__count_trailing_zeros64(uint64_t val) {
#if __LP64__ && (__GNUC__ || __has_builtin(__builtin_ctzll))
    /* Fast way to count trailing zeros. Note that we disable this in 32 bit because gcc does something horrible - it calls through to a dynamically bound function. */
    return __builtin_ctzll(val);
#elif LIBDIVIDE_VC && _WIN64
    unsigned long result;
    if (_BitScanForward64(&result, val)) {
        return result;
    }
    return 0;
#else
    /* Pretty good way to count trailing zeros. Note that this hangs for val = 0! */
    uint32_t lo = val & 0xFFFFFFFF;
    if (lo != 0) return libdivide__count_trailing_zeros32(lo);
    return 32 + libdivide__count_trailing_zeros32(val >> 32);
#endif
}
static inline int32_t libdivide__count_leading_zeros32(uint32_t val) {
#if __GNUC__ || __has_builtin(__builtin_clz)
    /* Fast way to count leading zeros */
    return __builtin_clz(val);
#elif LIBDIVIDE_VC
    unsigned long result;
    if (_BitScanReverse(&result, val)) {
        return 31 - result;
    }
    return 0;
#else
    /* Dorky way to count leading zeros. Note that this hangs for val = 0! */
    int32_t result = 0;
    while (! (val & (1U << 31))) {
        val <<= 1;
        result++;
    }
    return result;
#endif
}

static inline int32_t libdivide__count_leading_zeros64(uint64_t val) {
#if __GNUC__ || __has_builtin(__builtin_clzll)
    /* Fast way to count leading zeros */
    return __builtin_clzll(val);
#elif LIBDIVIDE_VC && _WIN64
    unsigned long result;
    if (_BitScanReverse64(&result, val)) {
        return 63 - result;
    }
    return 0;
#else
    /* Dorky way to count leading zeros. Note that this hangs for val = 0! */
    int32_t result = 0;
    while (! (val & (1ULL << 63))) {
        val <<= 1;
        result++;
    }
    return result;
#endif
}

//libdivide_64_div_32_to_32: divides a 64 bit uint {u1, u0} by a 32 bit uint {v}. The result must fit in 32 bits. Returns the quotient directly and the remainder in *r
#if (LIBDIVIDE_IS_i386 || LIBDIVIDE_IS_X86_64) && LIBDIVIDE_GCC_STYLE_ASM
static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
    uint32_t result;
    __asm__("divl %[v]"
            : "=a"(result), "=d"(*r)
            : [v] "r"(v), "a"(u0), "d"(u1)
            );
    return result;
}
#else
static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
    uint64_t n = (((uint64_t)u1) << 32) | u0;
    uint32_t result = (uint32_t)(n / v);
    *r = (uint32_t)(n - result * (uint64_t)v);
    return result;
}
#endif
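
/* An illustrative use of the contract above (not part of the API): dividing
   the 64 bit value 2**32 (u1 = 1, u0 = 0) by v = 5 yields a quotient that
   fits in 32 bits, as required:

       uint32_t r, q = libdivide_64_div_32_to_32(1, 0, 5, &r);
       // q == 858993459, r == 1, since 5 * 858993459 + 1 == 2**32

   If the true quotient does not fit in 32 bits the result is undefined. */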

#if LIBDIVIDE_IS_X86_64 && LIBDIVIDE_GCC_STYLE_ASM
static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) {
    //u0 -> rax
    //u1 -> rdx
    //divq
    uint64_t result;
    __asm__("divq %[v]"
            : "=a"(result), "=d"(*r)
            : [v] "r"(v), "a"(u0), "d"(u1)
            );
    return result;
}
#else

/* Code taken from Hacker's Delight, http://www.hackersdelight.org/HDcode/divlu.c. License permits inclusion here per http://www.hackersdelight.org/permissions.htm
 */
static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) {
    const uint64_t b = (1ULL << 32); // Number base (32 bits).
    uint64_t un1, un0,   // Norm. dividend LSD's.
        vn1, vn0,        // Norm. divisor digits.
        q1, q0,          // Quotient digits.
        un64, un21, un10,// Dividend digit pairs.
        rhat;            // A remainder.
    int s;               // Shift amount for norm.

    if (u1 >= v) {                // If overflow, set rem.
        if (r != NULL)            // to an impossible value,
            *r = (uint64_t)(-1);  // and return the largest
        return (uint64_t)(-1);    // possible quotient.
    }

    /* count leading zeros */
    s = libdivide__count_leading_zeros64(v); // 0 <= s <= 63.
    if (s > 0) {
        v = v << s;           // Normalize divisor.
        un64 = (u1 << s) | ((u0 >> (64 - s)) & (-s >> 31));
        un10 = u0 << s;       // Shift dividend left.
    } else {
        // Avoid undefined behavior.
        un64 = u1 | u0;
        un10 = u0;
    }

    vn1 = v >> 32;            // Break divisor up into
    vn0 = v & 0xFFFFFFFF;     // two 32-bit digits.

    un1 = un10 >> 32;         // Break right half of
    un0 = un10 & 0xFFFFFFFF;  // dividend into two digits.

    q1 = un64/vn1;            // Compute the first
    rhat = un64 - q1*vn1;     // quotient digit, q1.
again1:
    if (q1 >= b || q1*vn0 > b*rhat + un1) {
        q1 = q1 - 1;
        rhat = rhat + vn1;
        if (rhat < b) goto again1;
    }

    un21 = un64*b + un1 - q1*v;  // Multiply and subtract.

    q0 = un21/vn1;            // Compute the second
    rhat = un21 - q0*vn1;     // quotient digit, q0.
again2:
    if (q0 >= b || q0*vn0 > b*rhat + un0) {
        q0 = q0 - 1;
        rhat = rhat + vn1;
        if (rhat < b) goto again2;
    }

    if (r != NULL)            // If remainder is wanted,
        *r = (un21*b + un0 - q0*v) >> s; // return it.
    return q1*b + q0;
}
#endif

#if LIBDIVIDE_ASSERTIONS_ON
#define LIBDIVIDE_ASSERT(x) do { if (! (x)) { fprintf(stderr, "Assertion failure on line %ld: %s\n", (long)__LINE__, #x); exit(-1); } } while (0)
#else
#define LIBDIVIDE_ASSERT(x)
#endif

#ifndef LIBDIVIDE_HEADER_ONLY

////////// UINT32

struct libdivide_u32_t libdivide_u32_gen(uint32_t d) {
    struct libdivide_u32_t result;
    if ((d & (d - 1)) == 0) {
        result.magic = 0;
        result.more = libdivide__count_trailing_zeros32(d) | LIBDIVIDE_U32_SHIFT_PATH;
    }
    else {
        const uint32_t floor_log_2_d = 31 - libdivide__count_leading_zeros32(d);

        uint8_t more;
        uint32_t rem, proposed_m;
        proposed_m = libdivide_64_div_32_to_32(1U << floor_log_2_d, 0, d, &rem);

        LIBDIVIDE_ASSERT(rem > 0 && rem < d);
        const uint32_t e = d - rem;

        /* This power works if e < 2**floor_log_2_d. */
        if (e < (1U << floor_log_2_d)) {
            /* This power works */
            more = floor_log_2_d;
        }
        else {
            /* We have to use the general 33-bit algorithm. We need to compute (2**power) / d. However, we already have (2**(power-1))/d and its remainder. By doubling both, and then correcting the remainder, we can compute the larger division. */
            proposed_m += proposed_m; //don't care about overflow here - in fact, we expect it
            const uint32_t twice_rem = rem + rem;
            if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
        }
        result.magic = 1 + proposed_m;
        result.more = more;
        //result.more's shift should in general be ceil_log_2_d. But if we used the smaller power, we subtract one from the shift because we're using the smaller power. If we're using the larger power, we subtract one from the shift because it's taken care of by the add indicator. So floor_log_2_d happens to be correct in both cases.
    }
    return result;
}
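
/* A worked example of the general path (computed by hand, for illustration):
   d = 7 gives floor_log_2_d = 2, proposed_m = (4 << 32) / 7 = 2454267026 with
   rem = 2, and e = 5 >= 4, so we double to get magic = 613566757 (0x24924925)
   and more = 2 | LIBDIVIDE_ADD_MARKER = 0x42. The "do" routine below then
   computes, for numer = 21:

       q = mullhi(613566757, 21) = 3
       t = ((21 - 3) >> 1) + 3 = 12
       12 >> 2 = 3 == 21 / 7
*/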

uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_U32_SHIFT_PATH) {
        return numer >> (more & LIBDIVIDE_32_SHIFT_MASK);
    }
    else {
        uint32_t q = libdivide__mullhi_u32(denom->magic, numer);
        if (more & LIBDIVIDE_ADD_MARKER) {
            uint32_t t = ((numer - q) >> 1) + q;
            return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
        }
        else {
            return q >> more; //all upper bits are 0 - don't need to mask them off
        }
    }
}


int libdivide_u32_get_algorithm(const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_U32_SHIFT_PATH) return 0;
    else if (! (more & LIBDIVIDE_ADD_MARKER)) return 1;
    else return 2;
}

uint32_t libdivide_u32_do_alg0(uint32_t numer, const struct libdivide_u32_t *denom) {
    return numer >> (denom->more & LIBDIVIDE_32_SHIFT_MASK);
}

uint32_t libdivide_u32_do_alg1(uint32_t numer, const struct libdivide_u32_t *denom) {
    uint32_t q = libdivide__mullhi_u32(denom->magic, numer);
    return q >> denom->more;
}

uint32_t libdivide_u32_do_alg2(uint32_t numer, const struct libdivide_u32_t *denom) {
    // denom->add != 0
    uint32_t q = libdivide__mullhi_u32(denom->magic, numer);
    uint32_t t = ((numer - q) >> 1) + q;
    return t >> (denom->more & LIBDIVIDE_32_SHIFT_MASK);
}
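
/* A minimal scalar usage sketch (assuming only the functions declared above):

       uint32_t data[1024];                           // hypothetical input
       struct libdivide_u32_t d = libdivide_u32_gen(7);
       for (size_t i = 0; i < 1024; i++)
           data[i] = libdivide_u32_do(data[i], &d);   // data[i] /= 7

   or, if the divisor is reused enough to justify unswitching, dispatch once
   on libdivide_u32_get_algorithm(&d) and run a loop over the matching
   libdivide_u32_do_alg0/1/2 variant instead. */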


#if LIBDIVIDE_USE_SSE2
__m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_U32_SHIFT_PATH) {
        return _mm_srl_epi32(numers, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK));
    }
    else {
        __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            //uint32_t t = ((numer - q) >> 1) + q;
            //return t >> denom->shift;
            __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
            return _mm_srl_epi32(t, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK));
        }
        else {
            //q >> denom->shift
            return _mm_srl_epi32(q, libdivide_u32_to_m128i(more));
        }
    }
}

__m128i libdivide_u32_do_vector_alg0(__m128i numers, const struct libdivide_u32_t *denom) {
    return _mm_srl_epi32(numers, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK));
}

__m128i libdivide_u32_do_vector_alg1(__m128i numers, const struct libdivide_u32_t *denom) {
    __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic));
    return _mm_srl_epi32(q, libdivide_u32_to_m128i(denom->more));
}

__m128i libdivide_u32_do_vector_alg2(__m128i numers, const struct libdivide_u32_t *denom) {
    __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic));
    __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
    return _mm_srl_epi32(t, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK));
}

#endif
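
/* A vector usage sketch (illustrative; requires LIBDIVIDE_USE_SSE2 and a
   16-byte-aligned buffer whose length is a multiple of 4):

       struct libdivide_u32_t d = libdivide_u32_gen(7);
       for (size_t i = 0; i < count; i += 4) {
           __m128i n = _mm_load_si128((const __m128i *)&data[i]);
           _mm_store_si128((__m128i *)&data[i], libdivide_u32_do_vector(n, &d));
       }
*/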

/////////// UINT64

struct libdivide_u64_t libdivide_u64_gen(uint64_t d) {
    struct libdivide_u64_t result;
    if ((d & (d - 1)) == 0) {
        result.more = libdivide__count_trailing_zeros64(d) | LIBDIVIDE_U64_SHIFT_PATH;
        result.magic = 0;
    }
    else {
        const uint32_t floor_log_2_d = 63 - libdivide__count_leading_zeros64(d);

        uint64_t proposed_m, rem;
        uint8_t more;
        proposed_m = libdivide_128_div_64_to_64(1ULL << floor_log_2_d, 0, d, &rem); //== (1 << (64 + floor_log_2_d)) / d

        LIBDIVIDE_ASSERT(rem > 0 && rem < d);
        const uint64_t e = d - rem;

        /* This power works if e < 2**floor_log_2_d. */
        if (e < (1ULL << floor_log_2_d)) {
            /* This power works */
            more = floor_log_2_d;
        }
        else {
            /* We have to use the general 65-bit algorithm. We need to compute (2**power) / d. However, we already have (2**(power-1))/d and its remainder. By doubling both, and then correcting the remainder, we can compute the larger division. */
            proposed_m += proposed_m; //don't care about overflow here - in fact, we expect it
            const uint64_t twice_rem = rem + rem;
            if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
        }
        result.magic = 1 + proposed_m;
        result.more = more;
        //result.more's shift should in general be ceil_log_2_d. But if we used the smaller power, we subtract one from the shift because we're using the smaller power. If we're using the larger power, we subtract one from the shift because it's taken care of by the add indicator. So floor_log_2_d happens to be correct in both cases, which is why we do it outside of the if statement.
    }
    return result;
}

uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_U64_SHIFT_PATH) {
        return numer >> (more & LIBDIVIDE_64_SHIFT_MASK);
    }
    else {
        uint64_t q = libdivide__mullhi_u64(denom->magic, numer);
        if (more & LIBDIVIDE_ADD_MARKER) {
            uint64_t t = ((numer - q) >> 1) + q;
            return t >> (more & LIBDIVIDE_64_SHIFT_MASK);
        }
        else {
            return q >> more; //all upper bits are 0 - don't need to mask them off
        }
    }
}


int libdivide_u64_get_algorithm(const struct libdivide_u64_t *denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_U64_SHIFT_PATH) return 0;
    else if (! (more & LIBDIVIDE_ADD_MARKER)) return 1;
    else return 2;
}

uint64_t libdivide_u64_do_alg0(uint64_t numer, const struct libdivide_u64_t *denom) {
    return numer >> (denom->more & LIBDIVIDE_64_SHIFT_MASK);
}

uint64_t libdivide_u64_do_alg1(uint64_t numer, const struct libdivide_u64_t *denom) {
    uint64_t q = libdivide__mullhi_u64(denom->magic, numer);
    return q >> denom->more;
}

uint64_t libdivide_u64_do_alg2(uint64_t numer, const struct libdivide_u64_t *denom) {
    uint64_t q = libdivide__mullhi_u64(denom->magic, numer);
    uint64_t t = ((numer - q) >> 1) + q;
    return t >> (denom->more & LIBDIVIDE_64_SHIFT_MASK);
}

#if LIBDIVIDE_USE_SSE2
__m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t * denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_U64_SHIFT_PATH) {
        return _mm_srl_epi64(numers, libdivide_u32_to_m128i(more & LIBDIVIDE_64_SHIFT_MASK));
    }
    else {
        __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            //uint64_t t = ((numer - q) >> 1) + q;
            //return t >> denom->shift;
            __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
            return _mm_srl_epi64(t, libdivide_u32_to_m128i(more & LIBDIVIDE_64_SHIFT_MASK));
        }
        else {
            //q >> denom->shift
            return _mm_srl_epi64(q, libdivide_u32_to_m128i(more));
        }
    }
}

__m128i libdivide_u64_do_vector_alg0(__m128i numers, const struct libdivide_u64_t *denom) {
    return _mm_srl_epi64(numers, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_64_SHIFT_MASK));
}

__m128i libdivide_u64_do_vector_alg1(__m128i numers, const struct libdivide_u64_t *denom) {
    __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
    return _mm_srl_epi64(q, libdivide_u32_to_m128i(denom->more));
}

__m128i libdivide_u64_do_vector_alg2(__m128i numers, const struct libdivide_u64_t *denom) {
    __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
    __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
    return _mm_srl_epi64(t, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_64_SHIFT_MASK));
}


#endif

/////////// SINT32


static inline int32_t libdivide__mullhi_s32(int32_t x, int32_t y) {
    int64_t xl = x, yl = y;
    int64_t rl = xl * yl;
    return (int32_t)(rl >> 32); //needs to be arithmetic shift
}

struct libdivide_s32_t libdivide_s32_gen(int32_t d) {
    struct libdivide_s32_t result;

    /* If d is a power of 2, or the negative of a power of 2, we have to use a shift. This is especially important because the magic algorithm fails for -1. To check whether d is a power of 2 or its inverse, it suffices to check whether its absolute value has exactly one bit set. This works even for INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set and is a power of 2. */
    uint32_t absD = (uint32_t)(d < 0 ? -d : d); //gcc optimizes this to the fast abs trick
    if ((absD & (absD - 1)) == 0) { //check if exactly one bit is set, don't care if absD is 0 since that's divide by zero
        result.magic = 0;
        result.more = libdivide__count_trailing_zeros32(absD) | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0) | LIBDIVIDE_S32_SHIFT_PATH;
    }
    else {
        const uint32_t floor_log_2_d = 31 - libdivide__count_leading_zeros32(absD);
        LIBDIVIDE_ASSERT(floor_log_2_d >= 1);

        uint8_t more;
        //the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word is 0 and the high word is 2**(floor_log_2_d - 1)
        uint32_t rem, proposed_m;
        proposed_m = libdivide_64_div_32_to_32(1U << (floor_log_2_d - 1), 0, absD, &rem);
        const uint32_t e = absD - rem;

        /* We are going to start with a power of floor_log_2_d - 1. This works if e < 2**floor_log_2_d. */
        if (e < (1U << floor_log_2_d)) {
            /* This power works */
            more = floor_log_2_d - 1;
        }
        else {
            /* We need to go one higher. This should not make proposed_m overflow, but it will make it negative when interpreted as an int32_t. */
            proposed_m += proposed_m;
            const uint32_t twice_rem = rem + rem;
            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0); //use the general algorithm
        }
        proposed_m += 1;
        result.magic = (d < 0 ? -(int32_t)proposed_m : (int32_t)proposed_m);
        result.more = more;
    }
    return result;
}

int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_S32_SHIFT_PATH) {
        uint8_t shifter = more & LIBDIVIDE_32_SHIFT_MASK;
        int32_t q = numer + ((numer >> 31) & ((1 << shifter) - 1));
        q = q >> shifter;
        int32_t shiftMask = (int8_t)more >> 7; //must be arithmetic shift and then sign-extend
        q = (q ^ shiftMask) - shiftMask;
        return q;
    }
    else {
        int32_t q = libdivide__mullhi_s32(denom->magic, numer);
        if (more & LIBDIVIDE_ADD_MARKER) {
            int32_t sign = (int8_t)more >> 7; //must be arithmetic shift and then sign extend
            q += ((numer ^ sign) - sign);
        }
        q >>= more & LIBDIVIDE_32_SHIFT_MASK;
        q += (q < 0);
        return q;
    }
}
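
/* Why the rounding tweak in the shift path above is needed (illustrative):
   an arithmetic shift rounds toward negative infinity, while C99 division
   truncates toward zero. For numer = -7 and d = 4 (shifter = 2):

       tweak = (-7 >> 31) & 3;   // == 3
       q = (-7 + 3) >> 2;        // == -1, matching -7 / 4

   whereas -7 >> 2 alone would give -2. */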

int libdivide_s32_get_algorithm(const struct libdivide_s32_t *denom) {
    uint8_t more = denom->more;
    int positiveDivisor = ! (more & LIBDIVIDE_NEGATIVE_DIVISOR);
    if (more & LIBDIVIDE_S32_SHIFT_PATH) return (positiveDivisor ? 0 : 1);
    else if (more & LIBDIVIDE_ADD_MARKER) return (positiveDivisor ? 2 : 3);
    else return 4;
}

int32_t libdivide_s32_do_alg0(int32_t numer, const struct libdivide_s32_t *denom) {
    uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK;
    int32_t q = numer + ((numer >> 31) & ((1 << shifter) - 1));
    return q >> shifter;
}

int32_t libdivide_s32_do_alg1(int32_t numer, const struct libdivide_s32_t *denom) {
    uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK;
    int32_t q = numer + ((numer >> 31) & ((1 << shifter) - 1));
    return - (q >> shifter);
}

int32_t libdivide_s32_do_alg2(int32_t numer, const struct libdivide_s32_t *denom) {
    int32_t q = libdivide__mullhi_s32(denom->magic, numer);
    q += numer;
    q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK;
    q += (q < 0);
    return q;
}

int32_t libdivide_s32_do_alg3(int32_t numer, const struct libdivide_s32_t *denom) {
    int32_t q = libdivide__mullhi_s32(denom->magic, numer);
    q -= numer;
    q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK;
    q += (q < 0);
    return q;
}

int32_t libdivide_s32_do_alg4(int32_t numer, const struct libdivide_s32_t *denom) {
    int32_t q = libdivide__mullhi_s32(denom->magic, numer);
    q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK;
    q += (q < 0);
    return q;
}

#if LIBDIVIDE_USE_SSE2
__m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t * denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_S32_SHIFT_PATH) {
        uint32_t shifter = more & LIBDIVIDE_32_SHIFT_MASK;
        __m128i roundToZeroTweak = _mm_set1_epi32((1 << shifter) - 1); //could use _mm_srli_epi32 with an all -1 register
        __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); //q = numer + ((numer >> 31) & roundToZeroTweak);
        q = _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter)); // q = q >> shifter
        __m128i shiftMask = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); //set all bits of shift mask = to the sign bit of more
        q = _mm_sub_epi32(_mm_xor_si128(q, shiftMask), shiftMask); //q = (q ^ shiftMask) - shiftMask;
        return q;
    }
    else {
        __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            __m128i sign = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); //must be arithmetic shift
            q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign)); // q += ((numer ^ sign) - sign);
        }
        q = _mm_sra_epi32(q, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK)); //q >>= shift
        q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0)
        return q;
    }
}

__m128i libdivide_s32_do_vector_alg0(__m128i numers, const struct libdivide_s32_t *denom) {
    uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK;
    __m128i roundToZeroTweak = _mm_set1_epi32((1 << shifter) - 1);
    __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak));
    return _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter));
}

__m128i libdivide_s32_do_vector_alg1(__m128i numers, const struct libdivide_s32_t *denom) {
    uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK;
    __m128i roundToZeroTweak = _mm_set1_epi32((1 << shifter) - 1);
    __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak));
    return _mm_sub_epi32(_mm_setzero_si128(), _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter)));
}

__m128i libdivide_s32_do_vector_alg2(__m128i numers, const struct libdivide_s32_t *denom) {
    __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic));
    q = _mm_add_epi32(q, numers);
    q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK));
    q = _mm_add_epi32(q, _mm_srli_epi32(q, 31));
    return q;
}

__m128i libdivide_s32_do_vector_alg3(__m128i numers, const struct libdivide_s32_t *denom) {
    __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic));
    q = _mm_sub_epi32(q, numers);
    q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK));
    q = _mm_add_epi32(q, _mm_srli_epi32(q, 31));
    return q;
}

__m128i libdivide_s32_do_vector_alg4(__m128i numers, const struct libdivide_s32_t *denom) {
    __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic));
    q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more)); //q >>= shift
    q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0)
    return q;
}
#endif

///////////// SINT64


struct libdivide_s64_t libdivide_s64_gen(int64_t d) {
    struct libdivide_s64_t result;

    /* If d is a power of 2, or the negative of a power of 2, we have to use a shift. This is especially important because the magic algorithm fails for -1. To check whether d is a power of 2 or its inverse, it suffices to check whether its absolute value has exactly one bit set. This works even for INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set and is a power of 2. */
    const uint64_t absD = (uint64_t)(d < 0 ? -d : d); //gcc optimizes this to the fast abs trick
    if ((absD & (absD - 1)) == 0) { //check if exactly one bit is set, don't care if absD is 0 since that's divide by zero
        result.more = libdivide__count_trailing_zeros64(absD) | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0);
        result.magic = 0;
    }
    else {
        const uint32_t floor_log_2_d = 63 - libdivide__count_leading_zeros64(absD);

        //the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word is 0 and the high word is 2**(floor_log_2_d - 1)
        uint8_t more;
        uint64_t rem, proposed_m;
        proposed_m = libdivide_128_div_64_to_64(1ULL << (floor_log_2_d - 1), 0, absD, &rem);
        const uint64_t e = absD - rem;

        /* We are going to start with a power of floor_log_2_d - 1. This works if e < 2**floor_log_2_d. */
        if (e < (1ULL << floor_log_2_d)) {
            /* This power works */
            more = floor_log_2_d - 1;
        }
        else {
            /* We need to go one higher. This should not make proposed_m overflow, but it will make it negative when interpreted as an int64_t. */
            proposed_m += proposed_m;
            const uint64_t twice_rem = rem + rem;
            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0);
        }
        proposed_m += 1;
        result.more = more;
        result.magic = (d < 0 ? -(int64_t)proposed_m : (int64_t)proposed_m);
    }
    return result;
}

int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
    uint8_t more = denom->more;
    int64_t magic = denom->magic;
    if (magic == 0) { //shift path
        uint32_t shifter = more & LIBDIVIDE_64_SHIFT_MASK;
        int64_t q = numer + ((numer >> 63) & ((1LL << shifter) - 1));
        q = q >> shifter;
        int64_t shiftMask = (int8_t)more >> 7; //must be arithmetic shift and then sign-extend
        q = (q ^ shiftMask) - shiftMask;
        return q;
    }
    else {
        int64_t q = libdivide__mullhi_s64(magic, numer);
        if (more & LIBDIVIDE_ADD_MARKER) {
            int64_t sign = (int8_t)more >> 7; //must be arithmetic shift and then sign extend
            q += ((numer ^ sign) - sign);
        }
        q >>= more & LIBDIVIDE_64_SHIFT_MASK;
        q += (q < 0);
        return q;
    }
}


int libdivide_s64_get_algorithm(const struct libdivide_s64_t *denom) {
    uint8_t more = denom->more;
    int positiveDivisor = ! (more & LIBDIVIDE_NEGATIVE_DIVISOR);
    if (denom->magic == 0) return (positiveDivisor ? 0 : 1); //shift path
    else if (more & LIBDIVIDE_ADD_MARKER) return (positiveDivisor ? 2 : 3);
    else return 4;
}

int64_t libdivide_s64_do_alg0(int64_t numer, const struct libdivide_s64_t *denom) {
    uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK;
    int64_t q = numer + ((numer >> 63) & ((1LL << shifter) - 1));
    return q >> shifter;
}

int64_t libdivide_s64_do_alg1(int64_t numer, const struct libdivide_s64_t *denom) {
    //denom->shifter != -1 && denom->shiftMask != 0
    uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK;
    int64_t q = numer + ((numer >> 63) & ((1LL << shifter) - 1));
    return - (q >> shifter);
}

int64_t libdivide_s64_do_alg2(int64_t numer, const struct libdivide_s64_t *denom) {
    int64_t q = libdivide__mullhi_s64(denom->magic, numer);
    q += numer;
    q >>= denom->more & LIBDIVIDE_64_SHIFT_MASK;
    q += (q < 0);
    return q;
}

int64_t libdivide_s64_do_alg3(int64_t numer, const struct libdivide_s64_t *denom) {
    int64_t q = libdivide__mullhi_s64(denom->magic, numer);
    q -= numer;
    q >>= denom->more & LIBDIVIDE_64_SHIFT_MASK;
    q += (q < 0);
    return q;
}

int64_t libdivide_s64_do_alg4(int64_t numer, const struct libdivide_s64_t *denom) {
    int64_t q = libdivide__mullhi_s64(denom->magic, numer);
    q >>= denom->more;
    q += (q < 0);
    return q;
}


#if LIBDIVIDE_USE_SSE2
__m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t * denom) {
    uint8_t more = denom->more;
    int64_t magic = denom->magic;
    if (magic == 0) { //shift path
        uint32_t shifter = more & LIBDIVIDE_64_SHIFT_MASK;
        __m128i roundToZeroTweak = libdivide__u64_to_m128((1LL << shifter) - 1);
        __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); //q = numer + ((numer >> 63) & roundToZeroTweak);
        q = libdivide_s64_shift_right_vector(q, shifter); // q = q >> shifter
        __m128i shiftMask = _mm_set1_epi32((int32_t)((int8_t)more >> 7));
        q = _mm_sub_epi64(_mm_xor_si128(q, shiftMask), shiftMask); //q = (q ^ shiftMask) - shiftMask;
        return q;
    }
    else {
        __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            __m128i sign = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); //must be arithmetic shift
            q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign)); // q += ((numer ^ sign) - sign);
        }
        q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); //q >>= shift
        q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0)
        return q;
    }
}

__m128i libdivide_s64_do_vector_alg0(__m128i numers, const struct libdivide_s64_t *denom) {
    uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK;
    __m128i roundToZeroTweak = libdivide__u64_to_m128((1LL << shifter) - 1);
    __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak));
    q = libdivide_s64_shift_right_vector(q, shifter);
    return q;
}

__m128i libdivide_s64_do_vector_alg1(__m128i numers, const struct libdivide_s64_t *denom) {
    uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK;
    __m128i roundToZeroTweak = libdivide__u64_to_m128((1LL << shifter) - 1);
    __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak));
    q = libdivide_s64_shift_right_vector(q, shifter);
    return _mm_sub_epi64(_mm_setzero_si128(), q);
}

__m128i libdivide_s64_do_vector_alg2(__m128i numers, const struct libdivide_s64_t *denom) {
    __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
    q = _mm_add_epi64(q, numers);
    q = libdivide_s64_shift_right_vector(q, denom->more & LIBDIVIDE_64_SHIFT_MASK);
    q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0)
    return q;
}

__m128i libdivide_s64_do_vector_alg3(__m128i numers, const struct libdivide_s64_t *denom) {
    __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
    q = _mm_sub_epi64(q, numers);
    q = libdivide_s64_shift_right_vector(q, denom->more & LIBDIVIDE_64_SHIFT_MASK);
    q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0)
    return q;
}

__m128i libdivide_s64_do_vector_alg4(__m128i numers, const struct libdivide_s64_t *denom) {
    __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
    q = libdivide_s64_shift_right_vector(q, denom->more);
    q = _mm_add_epi64(q, _mm_srli_epi64(q, 63));
    return q;
}

#endif

/////////// C++ stuff

#ifdef __cplusplus

/* The C++ template design here is a total mess. This needs to be fixed by someone better at templates than I. The current design is:

- The base is a template divider_base that takes the integer type, the libdivide struct, a generating function, a get algorithm function, a do function, and either a do vector function or a dummy int.
- The base has storage for the libdivide struct. This is the only storage (so the C++ class should be no larger than the libdivide struct).

- Above that, there's divider_mid. This is an empty struct by default, but it is specialized against our four int types. divider_mid contains a template struct algo, that contains a typedef for a specialization of divider_base. struct algo is specialized to take an "algorithm number," where -1 means to use the general algorithm.

- Publicly we have class divider, which inherits from divider_mid::algo. This also takes an algorithm number, which defaults to -1 (the general algorithm).
- divider has an operator / which allows you to use a divider as the divisor in a quotient expression.

*/
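
/* A usage sketch consistent with the design notes above (the divider class
   and its operator / are declared further down in this header):

       libdivide::divider<uint32_t> d(7);       // general algorithm (-1)
       uint32_t q = n / d;                      // divides n by 7

       int algo = d.get_algorithm();            // for unswitched loops, e.g.
       libdivide::divider<uint32_t, 2> d2(7);   // algorithm-2 specialization
*/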
1218
1219namespace libdivide_internal {
1220
1221#if LIBDIVIDE_USE_SSE2
1222#define MAYBE_VECTOR(x) x
1223#define MAYBE_VECTOR_PARAM __m128i vector_func(__m128i, const DenomType *)
1224#else
1225#define MAYBE_VECTOR(x) 0
1226#define MAYBE_VECTOR_PARAM int vector_func
1227#endif
1228
1229 /* Some bogus unswitch functions for unsigned types so the same (presumably templated) code can work for both signed and unsigned. */
1230 uint32_t crash_u32(uint32_t, const libdivide_u32_t *) { abort(); }
1231 uint64_t crash_u64(uint64_t, const libdivide_u64_t *) { abort(); }
1232#ifdef __APPLE__
1233 UInt64 crash_u64(UInt64, const libdivide_u64_t *) { abort(); }
1234#endif
1235#if LIBDIVIDE_USE_SSE2
1236 __m128i crash_u32_vector(__m128i, const libdivide_u32_t *) { abort(); }
1237 __m128i crash_u64_vector(__m128i, const libdivide_u64_t *) { abort(); }
1238#endif
1239
1240 template<typename IntType, typename DenomType, DenomType gen_func(IntType), int get_algo(const DenomType *), IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM>
1241 class divider_base {
1242 public:
1243 DenomType denom;
1244 divider_base(IntType d) : denom(gen_func(d)) { }
1245 divider_base(const DenomType & d) : denom(d) { }
1246
1247 IntType perform_divide(IntType val) const { return do_func(val, &denom); }
1248#if LIBDIVIDE_USE_SSE2
1249 __m128i perform_divide_vector(__m128i val) const { return vector_func(val, &denom); }
1250#endif
1251
1252 int get_algorithm() const { return get_algo(&denom); }
1253 };
1254
1255
1256 template<class T> struct divider_mid { };
1257
1258 template<> struct divider_mid<uint32_t> {
1259 typedef uint32_t IntType;
1260 typedef struct libdivide_u32_t DenomType;
1261 template<IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM> struct denom {
1262 typedef divider_base<IntType, DenomType, libdivide_u32_gen, libdivide_u32_get_algorithm, do_func, vector_func> divider;
1263 };
1264
1265 template<int ALGO, int J = 0> struct algo { };
1266 template<int J> struct algo<-1, J> { typedef denom<libdivide_u32_do, MAYBE_VECTOR(libdivide_u32_do_vector)>::divider divider; };
1267 template<int J> struct algo<0, J> { typedef denom<libdivide_u32_do_alg0, MAYBE_VECTOR(libdivide_u32_do_vector_alg0)>::divider divider; };
1268 template<int J> struct algo<1, J> { typedef denom<libdivide_u32_do_alg1, MAYBE_VECTOR(libdivide_u32_do_vector_alg1)>::divider divider; };
1269 template<int J> struct algo<2, J> { typedef denom<libdivide_u32_do_alg2, MAYBE_VECTOR(libdivide_u32_do_vector_alg2)>::divider divider; };
1270
1271 /* Define two more bogus ones so that the same (templated, presumably) code can handle both signed and unsigned */
1272 template<int J> struct algo<3, J> { typedef denom<crash_u32, MAYBE_VECTOR(crash_u32_vector)>::divider divider; };
1273 template<int J> struct algo<4, J> { typedef denom<crash_u32, MAYBE_VECTOR(crash_u32_vector)>::divider divider; };
1274
1275 };
1276
1277 template<> struct divider_mid<int32_t> {
1278 typedef int32_t IntType;
1279 typedef struct libdivide_s32_t DenomType;
1280 template<IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM> struct denom {
1281 typedef divider_base<IntType, DenomType, libdivide_s32_gen, libdivide_s32_get_algorithm, do_func, vector_func> divider;
1282 };
1283
1284
1285 template<int ALGO, int J = 0> struct algo { };
1286 template<int J> struct algo<-1, J> { typedef denom<libdivide_s32_do, MAYBE_VECTOR(libdivide_s32_do_vector)>::divider divider; };
1287 template<int J> struct algo<0, J> { typedef denom<libdivide_s32_do_alg0, MAYBE_VECTOR(libdivide_s32_do_vector_alg0)>::divider divider; };
1288 template<int J> struct algo<1, J> { typedef denom<libdivide_s32_do_alg1, MAYBE_VECTOR(libdivide_s32_do_vector_alg1)>::divider divider; };
1289 template<int J> struct algo<2, J> { typedef denom<libdivide_s32_do_alg2, MAYBE_VECTOR(libdivide_s32_do_vector_alg2)>::divider divider; };
1290 template<int J> struct algo<3, J> { typedef denom<libdivide_s32_do_alg3, MAYBE_VECTOR(libdivide_s32_do_vector_alg3)>::divider divider; };
1291 template<int J> struct algo<4, J> { typedef denom<libdivide_s32_do_alg4, MAYBE_VECTOR(libdivide_s32_do_vector_alg4)>::divider divider; };
1292
1293 };
1294
#ifdef __APPLE__
    template<> struct divider_mid<Int64> {
        typedef Int64 IntType;
        typedef struct libdivide_s64_t DenomType;
        template<IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM> struct denom {
            typedef divider_base<IntType, DenomType, libdivide_s64_gen, libdivide_s64_get_algorithm, do_func, vector_func> divider;
        };

        template<int ALGO, int J = 0> struct algo { };
        template<int J> struct algo<-1, J> { typedef denom<libdivide_s64_do, MAYBE_VECTOR(libdivide_s64_do_vector)>::divider divider; };
        template<int J> struct algo<0, J> { typedef denom<libdivide_s64_do_alg0, MAYBE_VECTOR(libdivide_s64_do_vector_alg0)>::divider divider; };
        template<int J> struct algo<1, J> { typedef denom<libdivide_s64_do_alg1, MAYBE_VECTOR(libdivide_s64_do_vector_alg1)>::divider divider; };
        template<int J> struct algo<2, J> { typedef denom<libdivide_s64_do_alg2, MAYBE_VECTOR(libdivide_s64_do_vector_alg2)>::divider divider; };
        template<int J> struct algo<3, J> { typedef denom<libdivide_s64_do_alg3, MAYBE_VECTOR(libdivide_s64_do_vector_alg3)>::divider divider; };
        template<int J> struct algo<4, J> { typedef denom<libdivide_s64_do_alg4, MAYBE_VECTOR(libdivide_s64_do_vector_alg4)>::divider divider; };
    };

    template<> struct divider_mid<UInt64> {
        typedef UInt64 IntType;
        typedef struct libdivide_u64_t DenomType;
        template<IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM> struct denom {
            typedef divider_base<IntType, DenomType, libdivide_u64_gen, libdivide_u64_get_algorithm, do_func, vector_func> divider;
        };

        template<int ALGO, int J = 0> struct algo { };
        template<int J> struct algo<-1, J> { typedef denom<libdivide_u64_do, MAYBE_VECTOR(libdivide_u64_do_vector)>::divider divider; };
        template<int J> struct algo<0, J> { typedef denom<libdivide_u64_do_alg0, MAYBE_VECTOR(libdivide_u64_do_vector_alg0)>::divider divider; };
        template<int J> struct algo<1, J> { typedef denom<libdivide_u64_do_alg1, MAYBE_VECTOR(libdivide_u64_do_vector_alg1)>::divider divider; };
        template<int J> struct algo<2, J> { typedef denom<libdivide_u64_do_alg2, MAYBE_VECTOR(libdivide_u64_do_vector_alg2)>::divider divider; };

        /* Define two more bogus specializations: algorithms 3 and 4 exist only for
           the signed case, so route them to the crash functions here. This lets the
           same templated code handle both signed and unsigned types. */
        template<int J> struct algo<3, J> { typedef denom<crash_u64, MAYBE_VECTOR(crash_u64_vector)>::divider divider; };
        template<int J> struct algo<4, J> { typedef denom<crash_u64, MAYBE_VECTOR(crash_u64_vector)>::divider divider; };
    };
#endif

    template<> struct divider_mid<uint64_t> {
        typedef uint64_t IntType;
        typedef struct libdivide_u64_t DenomType;
        template<IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM> struct denom {
            typedef divider_base<IntType, DenomType, libdivide_u64_gen, libdivide_u64_get_algorithm, do_func, vector_func> divider;
        };

        template<int ALGO, int J = 0> struct algo { };
        template<int J> struct algo<-1, J> { typedef denom<libdivide_u64_do, MAYBE_VECTOR(libdivide_u64_do_vector)>::divider divider; };
        template<int J> struct algo<0, J> { typedef denom<libdivide_u64_do_alg0, MAYBE_VECTOR(libdivide_u64_do_vector_alg0)>::divider divider; };
        template<int J> struct algo<1, J> { typedef denom<libdivide_u64_do_alg1, MAYBE_VECTOR(libdivide_u64_do_vector_alg1)>::divider divider; };
        template<int J> struct algo<2, J> { typedef denom<libdivide_u64_do_alg2, MAYBE_VECTOR(libdivide_u64_do_vector_alg2)>::divider divider; };

        /* Define two more bogus specializations: algorithms 3 and 4 exist only for
           the signed case, so route them to the crash functions here. This lets the
           same templated code handle both signed and unsigned types. */
        template<int J> struct algo<3, J> { typedef denom<crash_u64, MAYBE_VECTOR(crash_u64_vector)>::divider divider; };
        template<int J> struct algo<4, J> { typedef denom<crash_u64, MAYBE_VECTOR(crash_u64_vector)>::divider divider; };
    };

    template<> struct divider_mid<int64_t> {
        typedef int64_t IntType;
        typedef struct libdivide_s64_t DenomType;
        template<IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM> struct denom {
            typedef divider_base<IntType, DenomType, libdivide_s64_gen, libdivide_s64_get_algorithm, do_func, vector_func> divider;
        };

        template<int ALGO, int J = 0> struct algo { };
        template<int J> struct algo<-1, J> { typedef denom<libdivide_s64_do, MAYBE_VECTOR(libdivide_s64_do_vector)>::divider divider; };
        template<int J> struct algo<0, J> { typedef denom<libdivide_s64_do_alg0, MAYBE_VECTOR(libdivide_s64_do_vector_alg0)>::divider divider; };
        template<int J> struct algo<1, J> { typedef denom<libdivide_s64_do_alg1, MAYBE_VECTOR(libdivide_s64_do_vector_alg1)>::divider divider; };
        template<int J> struct algo<2, J> { typedef denom<libdivide_s64_do_alg2, MAYBE_VECTOR(libdivide_s64_do_vector_alg2)>::divider divider; };
        template<int J> struct algo<3, J> { typedef denom<libdivide_s64_do_alg3, MAYBE_VECTOR(libdivide_s64_do_vector_alg3)>::divider divider; };
        template<int J> struct algo<4, J> { typedef denom<libdivide_s64_do_alg4, MAYBE_VECTOR(libdivide_s64_do_vector_alg4)>::divider divider; };
    };

} // close namespace libdivide_internal

template<typename T, int ALGO = -1>
class divider
{
    private:
    typename libdivide_internal::divider_mid<T>::template algo<ALGO>::divider sub;
    template<int NEW_ALGO, typename S> friend divider<S, NEW_ALGO> unswitch(const divider<S, -1> & d);
    divider(const typename libdivide_internal::divider_mid<T>::DenomType & denom) : sub(denom) { }

    public:

    /* Ordinary constructor that takes the divisor as a parameter. */
    divider(T n) : sub(n) { }

    /* Default constructor; divides by 1. */
    divider() : sub(1) { }

    /* Divides the parameter by the divisor, returning the quotient. */
    T perform_divide(T val) const { return sub.perform_divide(val); }

#if LIBDIVIDE_USE_SSE2
    /* Treats the vector as either two or four packed values (depending on the element size), and divides each of them by the divisor, returning the packed quotients. */
    __m128i perform_divide_vector(__m128i val) const { return sub.perform_divide_vector(val); }
#endif

    /* Returns the index of the algorithm in use, for passing to the unswitch function. */
    int get_algorithm() const { return sub.get_algorithm(); }

    /* Two dividers are equal iff their magic numbers and "more" fields match. */
    bool operator==(const divider<T, ALGO> & him) const { return sub.denom.magic == him.sub.denom.magic && sub.denom.more == him.sub.denom.more; }

    bool operator!=(const divider<T, ALGO> & him) const { return ! (*this == him); }
};
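
/* Example usage (an illustrative sketch, not part of the library):

    libdivide::divider<uint32_t> d(7);     // precomputes the magic number for dividing by 7
    uint32_t q = d.perform_divide(1000);   // q == 142, same as 1000 / 7
*/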

/* Returns a divider specialized for the given algorithm. NEW_ALGO must match the
   index reported by the divider's get_algorithm(). */
template<int NEW_ALGO, typename S>
divider<S, NEW_ALGO> unswitch(const divider<S, -1> & d) { return divider<S, NEW_ALGO>(d.sub.denom); }
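
/* Example of the unswitch pattern (an illustrative sketch; divide_all is a
   hypothetical helper, not part of the library). The algorithm index is queried
   once, and each loop runs with the algorithm baked into the divider's type, so
   the per-element dispatch disappears:

    void divide_all(uint32_t *data, size_t count, const libdivide::divider<uint32_t> &d) {
        using namespace libdivide;
        switch (d.get_algorithm()) {
            case 0: { divider<uint32_t, 0> dd = unswitch<0>(d); for (size_t i = 0; i < count; i++) data[i] = data[i] / dd; break; }
            case 1: { divider<uint32_t, 1> dd = unswitch<1>(d); for (size_t i = 0; i < count; i++) data[i] = data[i] / dd; break; }
            case 2: { divider<uint32_t, 2> dd = unswitch<2>(d); for (size_t i = 0; i < count; i++) data[i] = data[i] / dd; break; }
            default: for (size_t i = 0; i < count; i++) data[i] = data[i] / d; break;
        }
    }
*/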

/* Overload of the / operator for scalar division. */
template<typename int_type, int ALGO>
int_type operator/(int_type numer, const divider<int_type, ALGO> & denom) {
    return denom.perform_divide(numer);
}
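
/* Example (illustrative): with the overload above, existing division expressions
   keep their shape; only the divisor's type changes. Note that the numerator must
   already have the divider's integer type for template deduction to succeed:

    libdivide::divider<int64_t> d(-3);
    int64_t n = 9;
    int64_t q = n / d;   // q == -3, same as 9 / -3
*/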

#if LIBDIVIDE_USE_SSE2
/* Overload of the / operator for vector division. */
template<typename int_type, int ALGO>
__m128i operator/(__m128i numer, const divider<int_type, ALGO> & denom) {
    return denom.perform_divide_vector(numer);
}
#endif
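
/* Example (an illustrative sketch, assuming LIBDIVIDE_USE_SSE2 is defined):
   divides four packed 32-bit lanes at once:

    libdivide::divider<uint32_t> d(9);
    __m128i packed = _mm_set_epi32(36, 27, 18, 9);
    __m128i quot = packed / d;   // lanes {36, 27, 18, 9} become {4, 3, 2, 1}
*/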


#endif // __cplusplus

#endif // LIBDIVIDE_HEADER_ONLY
#ifdef __cplusplus
} // close namespace libdivide
} // close anonymous namespace
#endif

#pragma GCC diagnostic pop