/* libdivide.h
   Copyright 2010 ridiculous_fish
*/
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"

#if defined(_WIN32) || defined(WIN32)
#define LIBDIVIDE_WINDOWS 1
#endif

#if defined(_MSC_VER)
#define LIBDIVIDE_VC 1
#endif

#ifdef __cplusplus
#include <cstdlib>
#include <cstdio>
#include <cassert>
#else
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#endif

#if ! LIBDIVIDE_HAS_STDINT_TYPES && (! LIBDIVIDE_VC || _MSC_VER >= 1600)
/* Only Visual C++ 2010 and later include stdint.h */
#include <stdint.h>
#define LIBDIVIDE_HAS_STDINT_TYPES 1
#endif

#if ! LIBDIVIDE_HAS_STDINT_TYPES
typedef __int32 int32_t;
typedef unsigned __int32 uint32_t;
typedef __int64 int64_t;
typedef unsigned __int64 uint64_t;
typedef __int8 int8_t;
typedef unsigned __int8 uint8_t;
#endif

#if LIBDIVIDE_USE_SSE2
    #include <emmintrin.h>
#endif

#if LIBDIVIDE_VC
    #include <intrin.h>
#endif

#ifndef __has_builtin
#define __has_builtin(x) 0 // Compatibility with non-clang compilers.
#endif

#ifdef __ICC
#define HAS_INT128_T 0
#else
#define HAS_INT128_T __LP64__
#endif

#if defined(__x86_64__) || defined(_WIN64) || defined(_M_X64)
#define LIBDIVIDE_IS_X86_64 1
#endif

#if defined(__i386__)
#define LIBDIVIDE_IS_i386 1
#endif

#if __GNUC__ || __clang__
#define LIBDIVIDE_GCC_STYLE_ASM 1
#endif


/* libdivide may use the pmuldq (vector signed 32x32->64 mult instruction) which is in SSE 4.1. However, signed multiplication can be emulated efficiently with unsigned multiplication, and SSE 4.1 is currently rare, so it is OK to not turn this on. */
#ifdef LIBDIVIDE_USE_SSE4_1
#include <smmintrin.h>
#endif

#ifdef __cplusplus
/* We place libdivide within the libdivide namespace, and that goes in an anonymous namespace so that the functions are only visible to files that #include this header and don't get external linkage. At least that's the theory. */
namespace {
namespace libdivide {
#endif
/* Explanation of the "more" field: bit 6 is the add indicator (LIBDIVIDE_ADD_MARKER). Bit 7 marks the shift path in the unsigned case; in the signed case bit 7 indicates a negative divisor (there the shift path is marked by bit 5 for s32, and by a magic number of 0 for s64). Bits 0-4 hold the shift value (bits 0-5 in the 64 bit case; in the 32 bit case, bit 5 is always 0). We use bit 7 as the "negative divisor indicator" so that we can use sign extension to efficiently go to a full-width -1.

u32: [0-4] shift value
     [5]   ignored
     [6]   add indicator
     [7]   shift path

s32: [0-4] shift value
     [5]   shift path
     [6]   add indicator
     [7]   indicates negative divisor

u64: [0-5] shift value
     [6]   add indicator
     [7]   shift path

s64: [0-5] shift value
     [6]   add indicator
     [7]   indicates negative divisor
     magic number of 0 indicates shift path (we ran out of bits!)
*/

enum {
    LIBDIVIDE_32_SHIFT_MASK = 0x1F,
    LIBDIVIDE_64_SHIFT_MASK = 0x3F,
    LIBDIVIDE_ADD_MARKER = 0x40,
    LIBDIVIDE_U32_SHIFT_PATH = 0x80,
    LIBDIVIDE_U64_SHIFT_PATH = 0x80,
    LIBDIVIDE_S32_SHIFT_PATH = 0x20,
    LIBDIVIDE_NEGATIVE_DIVISOR = 0x80
};
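
/* An illustrative decode (not part of the API): given a u32 divider whose
   "more" byte is 0x42,

       uint32_t shift     = 0x42 & LIBDIVIDE_32_SHIFT_MASK;  // == 2
       int use_add        = 0x42 & LIBDIVIDE_ADD_MARKER;     // nonzero: add path
       int use_shift_path = 0x42 & LIBDIVIDE_U32_SHIFT_PATH; // zero: mult path

   so that divider multiplies by the magic number, applies the add fixup, and
   then shifts right by 2. */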


struct libdivide_u32_t {
    uint32_t magic;
    uint8_t more;
};

struct libdivide_s32_t {
    int32_t magic;
    uint8_t more;
};

struct libdivide_u64_t {
    uint64_t magic;
    uint8_t more;
};

struct libdivide_s64_t {
    int64_t magic;
    uint8_t more;
};



#ifndef LIBDIVIDE_API
    #ifdef __cplusplus
        /* In C++, we don't want our public functions to be static, because they are used as template arguments, and static functions can't be. They get internal linkage by virtue of the anonymous namespace. In C, they should be static. */
        #define LIBDIVIDE_API
    #else
        #define LIBDIVIDE_API static
    #endif
#endif

#ifdef __APPLE__
typedef signed long Int64;
typedef unsigned long UInt64;
#endif

LIBDIVIDE_API struct libdivide_s32_t libdivide_s32_gen(int32_t y);
LIBDIVIDE_API struct libdivide_u32_t libdivide_u32_gen(uint32_t y);
LIBDIVIDE_API struct libdivide_s64_t libdivide_s64_gen(int64_t y);
LIBDIVIDE_API struct libdivide_u64_t libdivide_u64_gen(uint64_t y);
#if defined(__APPLE__) && defined(__cplusplus)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-function"
LIBDIVIDE_API struct libdivide_s64_t libdivide_s64_gen(Int64 y) { return libdivide_s64_gen(int64_t(y)); }
LIBDIVIDE_API struct libdivide_u64_t libdivide_u64_gen(UInt64 y) { return libdivide_u64_gen(uint64_t(y)); }
#pragma GCC diagnostic pop
#endif

LIBDIVIDE_API int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom);
LIBDIVIDE_API uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom);
LIBDIVIDE_API int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom);
LIBDIVIDE_API uint64_t libdivide_u64_do(uint64_t y, const struct libdivide_u64_t *denom);
#if defined(__APPLE__) && defined(__cplusplus)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-function"
LIBDIVIDE_API Int64 libdivide_s64_do(Int64 numer, const struct libdivide_s64_t *denom) { return Int64(libdivide_s64_do(int64_t(numer), denom)); }
LIBDIVIDE_API UInt64 libdivide_u64_do(UInt64 y, const struct libdivide_u64_t *denom) { return UInt64(libdivide_u64_do(uint64_t(y), denom)); }
#pragma GCC diagnostic pop
#endif

LIBDIVIDE_API int libdivide_u32_get_algorithm(const struct libdivide_u32_t *denom);
LIBDIVIDE_API uint32_t libdivide_u32_do_alg0(uint32_t numer, const struct libdivide_u32_t *denom);
LIBDIVIDE_API uint32_t libdivide_u32_do_alg1(uint32_t numer, const struct libdivide_u32_t *denom);
LIBDIVIDE_API uint32_t libdivide_u32_do_alg2(uint32_t numer, const struct libdivide_u32_t *denom);

LIBDIVIDE_API int libdivide_u64_get_algorithm(const struct libdivide_u64_t *denom);
LIBDIVIDE_API uint64_t libdivide_u64_do_alg0(uint64_t numer, const struct libdivide_u64_t *denom);
LIBDIVIDE_API uint64_t libdivide_u64_do_alg1(uint64_t numer, const struct libdivide_u64_t *denom);
LIBDIVIDE_API uint64_t libdivide_u64_do_alg2(uint64_t numer, const struct libdivide_u64_t *denom);
#if defined(__APPLE__) && defined(__cplusplus)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-function"
LIBDIVIDE_API UInt64 libdivide_u64_do_alg0(UInt64 numer, const struct libdivide_u64_t *denom) { return UInt64(libdivide_u64_do_alg0(uint64_t(numer), denom)); }
LIBDIVIDE_API UInt64 libdivide_u64_do_alg1(UInt64 numer, const struct libdivide_u64_t *denom) { return UInt64(libdivide_u64_do_alg1(uint64_t(numer), denom)); }
LIBDIVIDE_API UInt64 libdivide_u64_do_alg2(UInt64 numer, const struct libdivide_u64_t *denom) { return UInt64(libdivide_u64_do_alg2(uint64_t(numer), denom)); }
#pragma GCC diagnostic pop
#endif

LIBDIVIDE_API int libdivide_s32_get_algorithm(const struct libdivide_s32_t *denom);
LIBDIVIDE_API int32_t libdivide_s32_do_alg0(int32_t numer, const struct libdivide_s32_t *denom);
LIBDIVIDE_API int32_t libdivide_s32_do_alg1(int32_t numer, const struct libdivide_s32_t *denom);
LIBDIVIDE_API int32_t libdivide_s32_do_alg2(int32_t numer, const struct libdivide_s32_t *denom);
LIBDIVIDE_API int32_t libdivide_s32_do_alg3(int32_t numer, const struct libdivide_s32_t *denom);
LIBDIVIDE_API int32_t libdivide_s32_do_alg4(int32_t numer, const struct libdivide_s32_t *denom);

LIBDIVIDE_API int libdivide_s64_get_algorithm(const struct libdivide_s64_t *denom);
LIBDIVIDE_API int64_t libdivide_s64_do_alg0(int64_t numer, const struct libdivide_s64_t *denom);
LIBDIVIDE_API int64_t libdivide_s64_do_alg1(int64_t numer, const struct libdivide_s64_t *denom);
LIBDIVIDE_API int64_t libdivide_s64_do_alg2(int64_t numer, const struct libdivide_s64_t *denom);
LIBDIVIDE_API int64_t libdivide_s64_do_alg3(int64_t numer, const struct libdivide_s64_t *denom);
LIBDIVIDE_API int64_t libdivide_s64_do_alg4(int64_t numer, const struct libdivide_s64_t *denom);
#if defined(__APPLE__) && defined(__cplusplus)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-function"
LIBDIVIDE_API Int64 libdivide_s64_do_alg0(Int64 numer, const struct libdivide_s64_t *denom) { return Int64(libdivide_s64_do_alg0(int64_t(numer), denom)); }
LIBDIVIDE_API Int64 libdivide_s64_do_alg1(Int64 numer, const struct libdivide_s64_t *denom) { return Int64(libdivide_s64_do_alg1(int64_t(numer), denom)); }
LIBDIVIDE_API Int64 libdivide_s64_do_alg2(Int64 numer, const struct libdivide_s64_t *denom) { return Int64(libdivide_s64_do_alg2(int64_t(numer), denom)); }
LIBDIVIDE_API Int64 libdivide_s64_do_alg3(Int64 numer, const struct libdivide_s64_t *denom) { return Int64(libdivide_s64_do_alg3(int64_t(numer), denom)); }
LIBDIVIDE_API Int64 libdivide_s64_do_alg4(Int64 numer, const struct libdivide_s64_t *denom) { return Int64(libdivide_s64_do_alg4(int64_t(numer), denom)); }
#pragma GCC diagnostic pop
#endif


#if LIBDIVIDE_USE_SSE2
LIBDIVIDE_API __m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t * denom);
LIBDIVIDE_API __m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t * denom);
LIBDIVIDE_API __m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t * denom);
LIBDIVIDE_API __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t * denom);

LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg0(__m128i numers, const struct libdivide_u32_t * denom);
LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg1(__m128i numers, const struct libdivide_u32_t * denom);
LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg2(__m128i numers, const struct libdivide_u32_t * denom);

LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg0(__m128i numers, const struct libdivide_s32_t * denom);
LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg1(__m128i numers, const struct libdivide_s32_t * denom);
LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg2(__m128i numers, const struct libdivide_s32_t * denom);
LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg3(__m128i numers, const struct libdivide_s32_t * denom);
LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg4(__m128i numers, const struct libdivide_s32_t * denom);

LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg0(__m128i numers, const struct libdivide_u64_t * denom);
LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg1(__m128i numers, const struct libdivide_u64_t * denom);
LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg2(__m128i numers, const struct libdivide_u64_t * denom);

LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg0(__m128i numers, const struct libdivide_s64_t * denom);
LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg1(__m128i numers, const struct libdivide_s64_t * denom);
LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg2(__m128i numers, const struct libdivide_s64_t * denom);
LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg3(__m128i numers, const struct libdivide_s64_t * denom);
LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg4(__m128i numers, const struct libdivide_s64_t * denom);
#endif



//////// Internal Utility Functions

static inline uint32_t libdivide__mullhi_u32(uint32_t x, uint32_t y) {
    uint64_t xl = x, yl = y;
    uint64_t rl = xl * yl;
    return (uint32_t)(rl >> 32);
}

static uint64_t libdivide__mullhi_u64(uint64_t x, uint64_t y) {
#if HAS_INT128_T
    __uint128_t xl = x, yl = y;
    __uint128_t rl = xl * yl;
    return (uint64_t)(rl >> 64);
#else
    //full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
    const uint32_t mask = 0xFFFFFFFF;
    const uint32_t x0 = (uint32_t)(x & mask), x1 = (uint32_t)(x >> 32);
    const uint32_t y0 = (uint32_t)(y & mask), y1 = (uint32_t)(y >> 32);
    const uint32_t x0y0_hi = libdivide__mullhi_u32(x0, y0);
    const uint64_t x0y1 = x0 * (uint64_t)y1;
    const uint64_t x1y0 = x1 * (uint64_t)y0;
    const uint64_t x1y1 = x1 * (uint64_t)y1;

    uint64_t temp = x1y0 + x0y0_hi;
    uint64_t temp_lo = temp & mask, temp_hi = temp >> 32;
    return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32);
#endif
}
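
/* A quick sanity check of the decomposition above (illustrative only): for
   x = y = 1ULL << 32 we have x1 = y1 = 1 and x0 = y0 = 0, so every partial
   product except x1y1 is zero and the function returns x1y1 = 1, which is
   exactly the high 64 bits of 2**64. */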

static inline int64_t libdivide__mullhi_s64(int64_t x, int64_t y) {
#if HAS_INT128_T
    __int128_t xl = x, yl = y;
    __int128_t rl = xl * yl;
    return (int64_t)(rl >> 64);
#else
    //full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
    const uint32_t mask = 0xFFFFFFFF;
    const uint32_t x0 = (uint32_t)(x & mask), y0 = (uint32_t)(y & mask);
    const int32_t x1 = (int32_t)(x >> 32), y1 = (int32_t)(y >> 32);
    const uint32_t x0y0_hi = libdivide__mullhi_u32(x0, y0);
    const int64_t t = x1*(int64_t)y0 + x0y0_hi;
    const int64_t w1 = x0*(int64_t)y1 + (t & mask);
    return x1*(int64_t)y1 + (t >> 32) + (w1 >> 32);
#endif
}

#if LIBDIVIDE_USE_SSE2

static inline __m128i libdivide__u64_to_m128(uint64_t x) {
#if LIBDIVIDE_VC && ! _WIN64
    //32 bit Visual C++ doesn't seem to have a working version of these load/set intrinsics, so go through aligned memory instead
    __declspec(align(16)) uint64_t temp[2] = {x, x};
    return _mm_load_si128((const __m128i*)temp);
#elif defined(__ICC)
    uint64_t __attribute__((aligned(16))) temp[2] = {x,x};
    return _mm_load_si128((const __m128i*)temp);
#elif __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wc++11-narrowing" // narrowing from uint64_t (aka 'unsigned long') to 'long long'
    // clang does not provide this intrinsic either
    return (__m128i){x, x};
#pragma clang diagnostic pop
#else
    // everyone else gets it right
    return _mm_set1_epi64x(x);
#endif
}

static inline __m128i libdivide_get_FFFFFFFF00000000(void) {
    //returns the same as _mm_set1_epi64(0xFFFFFFFF00000000ULL) without touching memory
    __m128i result = _mm_set1_epi8(-1); //optimizes to pcmpeqd on OS X
    return _mm_slli_epi64(result, 32);
}

static inline __m128i libdivide_get_00000000FFFFFFFF(void) {
    //returns the same as _mm_set1_epi64(0x00000000FFFFFFFFULL) without touching memory
    __m128i result = _mm_set1_epi8(-1); //optimizes to pcmpeqd on OS X
    result = _mm_srli_epi64(result, 32);
    return result;
}

#if __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wuninitialized"
#endif
static inline __m128i libdivide_get_0000FFFF(void) {
    //returns the same as _mm_set1_epi32(0x0000FFFF) without touching memory
    __m128i result; //we don't care what its contents are
    result = _mm_cmpeq_epi8(result, result); //all 1s
    result = _mm_srli_epi32(result, 16);
    return result;
}
#if __clang__
#pragma clang diagnostic pop
#endif

/* gcc-8 on PowerPC forgot to define _MM_SHUFFLE in its xmmintrin.h; this is fixed on trunk: https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/xmmintrin.h#L61 */
#if defined(__PPC__)
#ifndef _MM_SHUFFLE
#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
#endif
#endif

static inline __m128i libdivide_s64_signbits(__m128i v) {
    //we want to compute v >> 63, that is, _mm_srai_epi64(v, 63). But there is no 64 bit shift right arithmetic instruction in SSE2. So we have to fake it by first duplicating the high 32 bit values, and then using a 32 bit shift. Another option would be to use _mm_srli_epi64(v, 63) and then subtract that from 0, but that approach appears to be substantially slower for unknown reasons
    __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
    __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31);
    return signBits;
}

/* Returns an __m128i whose low 32 bits are equal to amt and has zero elsewhere. */
static inline __m128i libdivide_u32_to_m128i(uint32_t amt) {
    return _mm_set_epi32(0, 0, 0, amt);
}

static inline __m128i libdivide_s64_shift_right_vector(__m128i v, int amt) {
    //implementation of _mm_sra_epi64. Here we have two 64 bit values which are shifted right logically to become (64 - amt) bit values, and are then sign extended back to 64 bits.
    const int b = 64 - amt;
    __m128i m = libdivide__u64_to_m128(1ULL << (b - 1));
    __m128i x = _mm_srl_epi64(v, libdivide_u32_to_m128i(amt));
    __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m); //result = x^m - m
    return result;
}
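
/* The x^m - m step is the standard "sign extend from b bits" trick: m holds
   the b-bit sign bit, XORing flips it, and the subtraction propagates it into
   the high bits. An illustrative 8-bit example (not part of the library),
   sign extending the 4-bit value x = 0xF (i.e. -1):

       m = 1 << 3;          // 0x08
       r = (0xF ^ m) - m;   // 0x07 - 0x08 = -1 == 0xFF in 8 bits

   which matches what an arithmetic shift would have produced. */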

/* Here, b is assumed to contain one 32 bit value repeated four times. If it did not, the function would not work. */
static inline __m128i libdivide__mullhi_u32_flat_vector(__m128i a, __m128i b) {
    __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32);
    __m128i a1X3X = _mm_srli_epi64(a, 32);
    __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), libdivide_get_FFFFFFFF00000000());
    return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); // = hi_product_0123
}


/* Here, y is assumed to contain one 64 bit value repeated twice. */
static inline __m128i libdivide_mullhi_u64_flat_vector(__m128i x, __m128i y) {
    //full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
    const __m128i mask = libdivide_get_00000000FFFFFFFF();
    const __m128i x0 = _mm_and_si128(x, mask), x1 = _mm_srli_epi64(x, 32); //x0 is low half of 2 64 bit values, x1 is high half in low slots
    const __m128i y0 = _mm_and_si128(y, mask), y1 = _mm_srli_epi64(y, 32);
    const __m128i x0y0_hi = _mm_srli_epi64(_mm_mul_epu32(x0, y0), 32); //x0 happens to have the low half of the two 64 bit values in 32 bit slots 0 and 2, so _mm_mul_epu32 computes their full product, and then we shift right by 32 to get just the high values
    const __m128i x0y1 = _mm_mul_epu32(x0, y1);
    const __m128i x1y0 = _mm_mul_epu32(x1, y0);
    const __m128i x1y1 = _mm_mul_epu32(x1, y1);

    const __m128i temp = _mm_add_epi64(x1y0, x0y0_hi);
    __m128i temp_lo = _mm_and_si128(temp, mask), temp_hi = _mm_srli_epi64(temp, 32);
    temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32);
    temp_hi = _mm_add_epi64(x1y1, temp_hi);

    return _mm_add_epi64(temp_lo, temp_hi);
}

/* y is one 64 bit value repeated twice */
static inline __m128i libdivide_mullhi_s64_flat_vector(__m128i x, __m128i y) {
    __m128i p = libdivide_mullhi_u64_flat_vector(x, y);
    __m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y);
    p = _mm_sub_epi64(p, t1);
    __m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x);
    p = _mm_sub_epi64(p, t2);
    return p;
}

#ifdef LIBDIVIDE_USE_SSE4_1

/* b is one 32 bit value repeated four times. */
static inline __m128i libdivide_mullhi_s32_flat_vector(__m128i a, __m128i b) {
    __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epi32(a, b), 32);
    __m128i a1X3X = _mm_srli_epi64(a, 32);
    __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epi32(a1X3X, b), libdivide_get_FFFFFFFF00000000());
    return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); // = hi_product_0123
}

#else

/* SSE2 does not have a signed multiplication instruction, but we can convert unsigned to signed pretty efficiently. Again, b is just a 32 bit value repeated four times. */
static inline __m128i libdivide_mullhi_s32_flat_vector(__m128i a, __m128i b) {
    __m128i p = libdivide__mullhi_u32_flat_vector(a, b);
    __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b); //t1 = (a >> 31) & b, arithmetic shift
    __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a);
    p = _mm_sub_epi32(p, t1);
    p = _mm_sub_epi32(p, t2);
    return p;
}
#endif
#endif

static inline int32_t libdivide__count_trailing_zeros32(uint32_t val) {
#if __GNUC__ || __has_builtin(__builtin_ctz)
    /* Fast way to count trailing zeros */
    return __builtin_ctz(val);
#elif LIBDIVIDE_VC
    unsigned long result;
    if (_BitScanForward(&result, val)) {
        return result;
    }
    return 0;
#else
    /* Dorky way to count trailing zeros. Note that this hangs for val = 0! */
    int32_t result = 0;
    val = (val ^ (val - 1)) >> 1; // Set v's trailing 0s to 1s and zero rest
    while (val) {
        val >>= 1;
        result++;
    }
    return result;
#endif
}

static inline int32_t libdivide__count_trailing_zeros64(uint64_t val) {
#if __LP64__ && (__GNUC__ || __has_builtin(__builtin_ctzll))
    /* Fast way to count trailing zeros. Note that we disable this in 32 bit because gcc does something horrible - it calls through to a dynamically bound function. */
    return __builtin_ctzll(val);
#elif LIBDIVIDE_VC && _WIN64
    unsigned long result;
    if (_BitScanForward64(&result, val)) {
        return result;
    }
    return 0;
#else
    /* Pretty good way to count trailing zeros. Note that this hangs for val = 0! */
    uint32_t lo = val & 0xFFFFFFFF;
    if (lo != 0) return libdivide__count_trailing_zeros32(lo);
    return 32 + libdivide__count_trailing_zeros32(val >> 32);
#endif
}
static inline int32_t libdivide__count_leading_zeros32(uint32_t val) {
#if __GNUC__ || __has_builtin(__builtin_clz)
    /* Fast way to count leading zeros */
    return __builtin_clz(val);
#elif LIBDIVIDE_VC
    unsigned long result;
    if (_BitScanReverse(&result, val)) {
        return 31 - result;
    }
    return 0;
#else
    /* Dorky way to count leading zeros. Note that this hangs for val = 0! */
    int32_t result = 0;
    while (! (val & (1U << 31))) {
        val <<= 1;
        result++;
    }
    return result;
#endif
}

static inline int32_t libdivide__count_leading_zeros64(uint64_t val) {
#if __GNUC__ || __has_builtin(__builtin_clzll)
    /* Fast way to count leading zeros */
    return __builtin_clzll(val);
#elif LIBDIVIDE_VC && _WIN64
    unsigned long result;
    if (_BitScanReverse64(&result, val)) {
        return 63 - result;
    }
    return 0;
#else
    /* Dorky way to count leading zeros. Note that this hangs for val = 0! */
    int32_t result = 0;
    while (! (val & (1ULL << 63))) {
        val <<= 1;
        result++;
    }
    return result;
#endif
}

//libdivide_64_div_32_to_32: divides a 64 bit uint {u1, u0} by a 32 bit uint {v}. The result must fit in 32 bits. Returns the quotient directly and the remainder in *r
#if (LIBDIVIDE_IS_i386 || LIBDIVIDE_IS_X86_64) && LIBDIVIDE_GCC_STYLE_ASM
static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
    uint32_t result;
    __asm__("divl %[v]"
            : "=a"(result), "=d"(*r)
            : [v] "r"(v), "a"(u0), "d"(u1)
            );
    return result;
}
#else
static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
    uint64_t n = (((uint64_t)u1) << 32) | u0;
    uint32_t result = (uint32_t)(n / v);
    *r = (uint32_t)(n - result * (uint64_t)v);
    return result;
}
#endif
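
/* An illustrative use of the contract above (not part of the API): dividing
   the 64 bit value 2**32 (u1 = 1, u0 = 0) by v = 5 yields a quotient that
   fits in 32 bits, as required:

       uint32_t r, q = libdivide_64_div_32_to_32(1, 0, 5, &r);
       // q == 858993459, r == 1, since 5 * 858993459 + 1 == 2**32

   If the true quotient does not fit in 32 bits the result is undefined. */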

#if LIBDIVIDE_IS_X86_64 && LIBDIVIDE_GCC_STYLE_ASM
static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) {
    //u0 -> rax
    //u1 -> rdx
    //divq
    uint64_t result;
    __asm__("divq %[v]"
            : "=a"(result), "=d"(*r)
            : [v] "r"(v), "a"(u0), "d"(u1)
            );
    return result;
}
#else

/* Code taken from Hacker's Delight, http://www.hackersdelight.org/HDcode/divlu.c. License permits inclusion here per http://www.hackersdelight.org/permissions.htm
 */
static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) {
    const uint64_t b = (1ULL << 32); // Number base (32 bits).
    uint64_t un1, un0,   // Norm. dividend LSD's.
        vn1, vn0,        // Norm. divisor digits.
        q1, q0,          // Quotient digits.
        un64, un21, un10,// Dividend digit pairs.
        rhat;            // A remainder.
    int s;               // Shift amount for norm.

    if (u1 >= v) {                // If overflow, set rem.
        if (r != NULL)            // to an impossible value,
            *r = (uint64_t)(-1);  // and return the largest
        return (uint64_t)(-1);    // possible quotient.
    }

    /* count leading zeros */
    s = libdivide__count_leading_zeros64(v); // 0 <= s <= 63.
    if (s > 0) {
        v = v << s;           // Normalize divisor.
        un64 = (u1 << s) | ((u0 >> (64 - s)) & (-s >> 31));
        un10 = u0 << s;       // Shift dividend left.
    } else {
        // Avoid undefined behavior.
        un64 = u1 | u0;
        un10 = u0;
    }

    vn1 = v >> 32;            // Break divisor up into
    vn0 = v & 0xFFFFFFFF;     // two 32-bit digits.

    un1 = un10 >> 32;         // Break right half of
    un0 = un10 & 0xFFFFFFFF;  // dividend into two digits.

    q1 = un64/vn1;            // Compute the first
    rhat = un64 - q1*vn1;     // quotient digit, q1.
again1:
    if (q1 >= b || q1*vn0 > b*rhat + un1) {
        q1 = q1 - 1;
        rhat = rhat + vn1;
        if (rhat < b) goto again1;
    }

    un21 = un64*b + un1 - q1*v;  // Multiply and subtract.

    q0 = un21/vn1;            // Compute the second
    rhat = un21 - q0*vn1;     // quotient digit, q0.
again2:
    if (q0 >= b || q0*vn0 > b*rhat + un0) {
        q0 = q0 - 1;
        rhat = rhat + vn1;
        if (rhat < b) goto again2;
    }

    if (r != NULL)            // If remainder is wanted,
        *r = (un21*b + un0 - q0*v) >> s; // return it.
    return q1*b + q0;
}
#endif

#if LIBDIVIDE_ASSERTIONS_ON
#define LIBDIVIDE_ASSERT(x) do { if (! (x)) { fprintf(stderr, "Assertion failure on line %ld: %s\n", (long)__LINE__, #x); exit(-1); } } while (0)
#else
#define LIBDIVIDE_ASSERT(x)
#endif

#ifndef LIBDIVIDE_HEADER_ONLY

////////// UINT32

struct libdivide_u32_t libdivide_u32_gen(uint32_t d) {
    struct libdivide_u32_t result;
    if ((d & (d - 1)) == 0) {
        result.magic = 0;
        result.more = libdivide__count_trailing_zeros32(d) | LIBDIVIDE_U32_SHIFT_PATH;
    }
    else {
        const uint32_t floor_log_2_d = 31 - libdivide__count_leading_zeros32(d);

        uint8_t more;
        uint32_t rem, proposed_m;
        proposed_m = libdivide_64_div_32_to_32(1U << floor_log_2_d, 0, d, &rem);

        LIBDIVIDE_ASSERT(rem > 0 && rem < d);
        const uint32_t e = d - rem;

        /* This power works if e < 2**floor_log_2_d. */
        if (e < (1U << floor_log_2_d)) {
            /* This power works */
            more = floor_log_2_d;
        }
        else {
            /* We have to use the general 33-bit algorithm. We need to compute (2**power) / d. However, we already have (2**(power-1))/d and its remainder. By doubling both, and then correcting the remainder, we can compute the larger division. */
            proposed_m += proposed_m; //don't care about overflow here - in fact, we expect it
            const uint32_t twice_rem = rem + rem;
            if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
        }
        result.magic = 1 + proposed_m;
        result.more = more;
        //result.more's shift should in general be ceil_log_2_d. But if we used the smaller power, we subtract one from the shift because we're using the smaller power. If we're using the larger power, we subtract one from the shift because it's taken care of by the add indicator. So floor_log_2_d happens to be correct in both cases.
    }
    return result;
}
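
/* A worked example of the general path (computed by hand, for illustration):
   d = 7 gives floor_log_2_d = 2, proposed_m = (4 << 32) / 7 = 2454267026 with
   rem = 2, and e = 5 >= 4, so we double to get magic = 613566757 (0x24924925)
   and more = 2 | LIBDIVIDE_ADD_MARKER = 0x42. The "do" routine below then
   computes, for numer = 21:

       q = mullhi(613566757, 21) = 3
       t = ((21 - 3) >> 1) + 3 = 12
       12 >> 2 = 3 == 21 / 7
*/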

uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_U32_SHIFT_PATH) {
        return numer >> (more & LIBDIVIDE_32_SHIFT_MASK);
    }
    else {
        uint32_t q = libdivide__mullhi_u32(denom->magic, numer);
        if (more & LIBDIVIDE_ADD_MARKER) {
            uint32_t t = ((numer - q) >> 1) + q;
            return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
        }
        else {
            return q >> more; //all upper bits are 0 - don't need to mask them off
        }
    }
}


int libdivide_u32_get_algorithm(const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_U32_SHIFT_PATH) return 0;
    else if (! (more & LIBDIVIDE_ADD_MARKER)) return 1;
    else return 2;
}

uint32_t libdivide_u32_do_alg0(uint32_t numer, const struct libdivide_u32_t *denom) {
    return numer >> (denom->more & LIBDIVIDE_32_SHIFT_MASK);
}

uint32_t libdivide_u32_do_alg1(uint32_t numer, const struct libdivide_u32_t *denom) {
    uint32_t q = libdivide__mullhi_u32(denom->magic, numer);
    return q >> denom->more;
}

uint32_t libdivide_u32_do_alg2(uint32_t numer, const struct libdivide_u32_t *denom) {
    // denom->add != 0
    uint32_t q = libdivide__mullhi_u32(denom->magic, numer);
    uint32_t t = ((numer - q) >> 1) + q;
    return t >> (denom->more & LIBDIVIDE_32_SHIFT_MASK);
}
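
/* A minimal scalar usage sketch (assuming only the functions declared above):

       uint32_t data[1024];                           // hypothetical input
       struct libdivide_u32_t d = libdivide_u32_gen(7);
       for (size_t i = 0; i < 1024; i++)
           data[i] = libdivide_u32_do(data[i], &d);   // data[i] /= 7

   or, if the divisor is reused enough to justify unswitching, dispatch once
   on libdivide_u32_get_algorithm(&d) and run a loop over the matching
   libdivide_u32_do_alg0/1/2 variant instead. */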


#if LIBDIVIDE_USE_SSE2
__m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_U32_SHIFT_PATH) {
        return _mm_srl_epi32(numers, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK));
    }
    else {
        __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            //uint32_t t = ((numer - q) >> 1) + q;
            //return t >> denom->shift;
            __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
            return _mm_srl_epi32(t, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK));
        }
        else {
            //q >> denom->shift
            return _mm_srl_epi32(q, libdivide_u32_to_m128i(more));
        }
    }
}

__m128i libdivide_u32_do_vector_alg0(__m128i numers, const struct libdivide_u32_t *denom) {
    return _mm_srl_epi32(numers, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK));
}

__m128i libdivide_u32_do_vector_alg1(__m128i numers, const struct libdivide_u32_t *denom) {
    __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic));
    return _mm_srl_epi32(q, libdivide_u32_to_m128i(denom->more));
}

__m128i libdivide_u32_do_vector_alg2(__m128i numers, const struct libdivide_u32_t *denom) {
    __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic));
    __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
    return _mm_srl_epi32(t, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK));
}

#endif
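
/* A vector usage sketch (illustrative; requires LIBDIVIDE_USE_SSE2 and a
   16-byte-aligned buffer whose length is a multiple of 4):

       struct libdivide_u32_t d = libdivide_u32_gen(7);
       for (size_t i = 0; i < count; i += 4) {
           __m128i n = _mm_load_si128((const __m128i *)&data[i]);
           _mm_store_si128((__m128i *)&data[i], libdivide_u32_do_vector(n, &d));
       }
*/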

/////////// UINT64

struct libdivide_u64_t libdivide_u64_gen(uint64_t d) {
    struct libdivide_u64_t result;
    if ((d & (d - 1)) == 0) {
        result.more = libdivide__count_trailing_zeros64(d) | LIBDIVIDE_U64_SHIFT_PATH;
        result.magic = 0;
    }
    else {
        const uint32_t floor_log_2_d = 63 - libdivide__count_leading_zeros64(d);

        uint64_t proposed_m, rem;
        uint8_t more;
        proposed_m = libdivide_128_div_64_to_64(1ULL << floor_log_2_d, 0, d, &rem); //== (1 << (64 + floor_log_2_d)) / d

        LIBDIVIDE_ASSERT(rem > 0 && rem < d);
        const uint64_t e = d - rem;

        /* This power works if e < 2**floor_log_2_d. */
        if (e < (1ULL << floor_log_2_d)) {
            /* This power works */
            more = floor_log_2_d;
        }
        else {
            /* We have to use the general 65-bit algorithm. We need to compute (2**power) / d. However, we already have (2**(power-1))/d and its remainder. By doubling both, and then correcting the remainder, we can compute the larger division. */
            proposed_m += proposed_m; //don't care about overflow here - in fact, we expect it
            const uint64_t twice_rem = rem + rem;
            if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
        }
        result.magic = 1 + proposed_m;
        result.more = more;
        //result.more's shift should in general be ceil_log_2_d. But if we used the smaller power, we subtract one from the shift because we're using the smaller power. If we're using the larger power, we subtract one from the shift because it's taken care of by the add indicator. So floor_log_2_d happens to be correct in both cases, which is why we do it outside of the if statement.
    }
    return result;
}

uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_U64_SHIFT_PATH) {
        return numer >> (more & LIBDIVIDE_64_SHIFT_MASK);
    }
    else {
        uint64_t q = libdivide__mullhi_u64(denom->magic, numer);
        if (more & LIBDIVIDE_ADD_MARKER) {
            uint64_t t = ((numer - q) >> 1) + q;
            return t >> (more & LIBDIVIDE_64_SHIFT_MASK);
        }
        else {
            return q >> more; //all upper bits are 0 - don't need to mask them off
        }
    }
}


int libdivide_u64_get_algorithm(const struct libdivide_u64_t *denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_U64_SHIFT_PATH) return 0;
    else if (! (more & LIBDIVIDE_ADD_MARKER)) return 1;
    else return 2;
}

uint64_t libdivide_u64_do_alg0(uint64_t numer, const struct libdivide_u64_t *denom) {
    return numer >> (denom->more & LIBDIVIDE_64_SHIFT_MASK);
}

uint64_t libdivide_u64_do_alg1(uint64_t numer, const struct libdivide_u64_t *denom) {
    uint64_t q = libdivide__mullhi_u64(denom->magic, numer);
    return q >> denom->more;
}

uint64_t libdivide_u64_do_alg2(uint64_t numer, const struct libdivide_u64_t *denom) {
    uint64_t q = libdivide__mullhi_u64(denom->magic, numer);
    uint64_t t = ((numer - q) >> 1) + q;
    return t >> (denom->more & LIBDIVIDE_64_SHIFT_MASK);
}

#if LIBDIVIDE_USE_SSE2
__m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t * denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_U64_SHIFT_PATH) {
        return _mm_srl_epi64(numers, libdivide_u32_to_m128i(more & LIBDIVIDE_64_SHIFT_MASK));
    }
    else {
        __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            //uint64_t t = ((numer - q) >> 1) + q;
            //return t >> denom->shift;
            __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
            return _mm_srl_epi64(t, libdivide_u32_to_m128i(more & LIBDIVIDE_64_SHIFT_MASK));
        }
        else {
            //q >> denom->shift
            return _mm_srl_epi64(q, libdivide_u32_to_m128i(more));
        }
    }
}

__m128i libdivide_u64_do_vector_alg0(__m128i numers, const struct libdivide_u64_t *denom) {
    return _mm_srl_epi64(numers, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_64_SHIFT_MASK));
}

__m128i libdivide_u64_do_vector_alg1(__m128i numers, const struct libdivide_u64_t *denom) {
    __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
    return _mm_srl_epi64(q, libdivide_u32_to_m128i(denom->more));
}

__m128i libdivide_u64_do_vector_alg2(__m128i numers, const struct libdivide_u64_t *denom) {
    __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
    __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
    return _mm_srl_epi64(t, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_64_SHIFT_MASK));
}


#endif

/////////// SINT32


static inline int32_t libdivide__mullhi_s32(int32_t x, int32_t y) {
    int64_t xl = x, yl = y;
    int64_t rl = xl * yl;
    return (int32_t)(rl >> 32); //needs to be arithmetic shift
}

struct libdivide_s32_t libdivide_s32_gen(int32_t d) {
    struct libdivide_s32_t result;

    /* If d is a power of 2, or the negative of a power of 2, we have to use a shift. This is especially important because the magic algorithm fails for -1. To check whether d is a power of 2 or its inverse, it suffices to check whether its absolute value has exactly one bit set. This works even for INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set and is a power of 2. */
    uint32_t absD = (uint32_t)(d < 0 ? -d : d); //gcc optimizes this to the fast abs trick
    if ((absD & (absD - 1)) == 0) { //check if exactly one bit is set, don't care if absD is 0 since that's divide by zero
        result.magic = 0;
        result.more = libdivide__count_trailing_zeros32(absD) | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0) | LIBDIVIDE_S32_SHIFT_PATH;
    }
    else {
        const uint32_t floor_log_2_d = 31 - libdivide__count_leading_zeros32(absD);
        LIBDIVIDE_ASSERT(floor_log_2_d >= 1);

        uint8_t more;
        //the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word is 0 and the high word is 2**(floor_log_2_d - 1)
        uint32_t rem, proposed_m;
        proposed_m = libdivide_64_div_32_to_32(1U << (floor_log_2_d - 1), 0, absD, &rem);
        const uint32_t e = absD - rem;

        /* We are going to start with a power of floor_log_2_d - 1. This works if e < 2**floor_log_2_d. */
        if (e < (1U << floor_log_2_d)) {
            /* This power works */
            more = floor_log_2_d - 1;
        }
        else {
            /* We need to go one higher. This should not make proposed_m overflow, but it will make it negative when interpreted as an int32_t. */
            proposed_m += proposed_m;
            const uint32_t twice_rem = rem + rem;
            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0); //use the general algorithm
        }
        proposed_m += 1;
        result.magic = (d < 0 ? -(int32_t)proposed_m : (int32_t)proposed_m);
        result.more = more;
    }
    return result;
}

int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_S32_SHIFT_PATH) {
        uint8_t shifter = more & LIBDIVIDE_32_SHIFT_MASK;
        int32_t q = numer + ((numer >> 31) & ((1 << shifter) - 1));
        q = q >> shifter;
        int32_t shiftMask = (int8_t)more >> 7; //must be arithmetic shift and then sign-extend
        q = (q ^ shiftMask) - shiftMask;
        return q;
    }
    else {
        int32_t q = libdivide__mullhi_s32(denom->magic, numer);
        if (more & LIBDIVIDE_ADD_MARKER) {
            int32_t sign = (int8_t)more >> 7; //must be arithmetic shift and then sign extend
            q += ((numer ^ sign) - sign);
        }
        q >>= more & LIBDIVIDE_32_SHIFT_MASK;
        q += (q < 0);
        return q;
    }
}
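
/* Why the rounding tweak in the shift path above is needed (illustrative):
   an arithmetic shift rounds toward negative infinity, while C99 division
   truncates toward zero. For numer = -7 and d = 4 (shifter = 2):

       tweak = (-7 >> 31) & 3;   // == 3
       q = (-7 + 3) >> 2;        // == -1, matching -7 / 4

   whereas -7 >> 2 alone would give -2. */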

int libdivide_s32_get_algorithm(const struct libdivide_s32_t *denom) {
    uint8_t more = denom->more;
    int positiveDivisor = ! (more & LIBDIVIDE_NEGATIVE_DIVISOR);
    if (more & LIBDIVIDE_S32_SHIFT_PATH) return (positiveDivisor ? 0 : 1);
    else if (more & LIBDIVIDE_ADD_MARKER) return (positiveDivisor ? 2 : 3);
    else return 4;
}

int32_t libdivide_s32_do_alg0(int32_t numer, const struct libdivide_s32_t *denom) {
    uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK;
    int32_t q = numer + ((numer >> 31) & ((1 << shifter) - 1));
    return q >> shifter;
}

int32_t libdivide_s32_do_alg1(int32_t numer, const struct libdivide_s32_t *denom) {
    uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK;
    int32_t q = numer + ((numer >> 31) & ((1 << shifter) - 1));
    return - (q >> shifter);
}

int32_t libdivide_s32_do_alg2(int32_t numer, const struct libdivide_s32_t *denom) {
    int32_t q = libdivide__mullhi_s32(denom->magic, numer);
    q += numer;
    q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK;
    q += (q < 0);
    return q;
}

int32_t libdivide_s32_do_alg3(int32_t numer, const struct libdivide_s32_t *denom) {
    int32_t q = libdivide__mullhi_s32(denom->magic, numer);
    q -= numer;
    q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK;
    q += (q < 0);
    return q;
}

int32_t libdivide_s32_do_alg4(int32_t numer, const struct libdivide_s32_t *denom) {
    int32_t q = libdivide__mullhi_s32(denom->magic, numer);
    q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK;
    q += (q < 0);
    return q;
}

#if LIBDIVIDE_USE_SSE2
__m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t * denom) {
    uint8_t more = denom->more;
    if (more & LIBDIVIDE_S32_SHIFT_PATH) {
        uint32_t shifter = more & LIBDIVIDE_32_SHIFT_MASK;
        __m128i roundToZeroTweak = _mm_set1_epi32((1 << shifter) - 1); //could use _mm_srli_epi32 with an all -1 register
        __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); //q = numer + ((numer >> 31) & roundToZeroTweak);
        q = _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter)); // q = q >> shifter
        __m128i shiftMask = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); //set all bits of shift mask = to the sign bit of more
        q = _mm_sub_epi32(_mm_xor_si128(q, shiftMask), shiftMask); //q = (q ^ shiftMask) - shiftMask;
        return q;
    }
    else {
        __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            __m128i sign = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); //must be arithmetic shift
            q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign)); // q += ((numer ^ sign) - sign);
        }
        q = _mm_sra_epi32(q, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK)); //q >>= shift
        q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0)
        return q;
    }
}

__m128i libdivide_s32_do_vector_alg0(__m128i numers, const struct libdivide_s32_t *denom) {
    uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK;
    __m128i roundToZeroTweak = _mm_set1_epi32((1 << shifter) - 1);
    __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak));
    return _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter));
}

__m128i libdivide_s32_do_vector_alg1(__m128i numers, const struct libdivide_s32_t *denom) {
    uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK;
    __m128i roundToZeroTweak = _mm_set1_epi32((1 << shifter) - 1);
    __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak));
    return _mm_sub_epi32(_mm_setzero_si128(), _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter)));
}

__m128i libdivide_s32_do_vector_alg2(__m128i numers, const struct libdivide_s32_t *denom) {
    __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic));
    q = _mm_add_epi32(q, numers);
    q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK));
    q = _mm_add_epi32(q, _mm_srli_epi32(q, 31));
    return q;
}

__m128i libdivide_s32_do_vector_alg3(__m128i numers, const struct libdivide_s32_t *denom) {
    __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic));
    q = _mm_sub_epi32(q, numers);
    q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK));
    q = _mm_add_epi32(q, _mm_srli_epi32(q, 31));
    return q;
}

__m128i libdivide_s32_do_vector_alg4(__m128i numers, const struct libdivide_s32_t *denom) {
    __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic));
    q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more)); //q >>= shift
    q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0)
    return q;
}
#endif

///////////// SINT64


struct libdivide_s64_t libdivide_s64_gen(int64_t d) {
    struct libdivide_s64_t result;

    /* If d is a power of 2, or the negative of a power of 2, we have to use a shift. This is especially important because the magic algorithm fails for -1. To check whether d is a power of 2 or its inverse, it suffices to check whether its absolute value has exactly one bit set. This works even for INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set and is a power of 2. */
    const uint64_t absD = (uint64_t)(d < 0 ? -d : d); //gcc optimizes this to the fast abs trick
    if ((absD & (absD - 1)) == 0) { //check if exactly one bit is set, don't care if absD is 0 since that's divide by zero
        result.more = libdivide__count_trailing_zeros64(absD) | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0);
        result.magic = 0;
    }
    else {
        const uint32_t floor_log_2_d = 63 - libdivide__count_leading_zeros64(absD);

        //the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word is 0 and the high word is 2**(floor_log_2_d - 1)
        uint8_t more;
        uint64_t rem, proposed_m;
        proposed_m = libdivide_128_div_64_to_64(1ULL << (floor_log_2_d - 1), 0, absD, &rem);
        const uint64_t e = absD - rem;

        /* We are going to start with a power of floor_log_2_d - 1. This works if e < 2**floor_log_2_d. */
        if (e < (1ULL << floor_log_2_d)) {
            /* This power works */
            more = floor_log_2_d - 1;
        }
        else {
            /* We need to go one higher. This should not make proposed_m overflow, but it will make it negative when interpreted as an int64_t. */
            proposed_m += proposed_m;
            const uint64_t twice_rem = rem + rem;
            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0);
        }
        proposed_m += 1;
        result.more = more;
        result.magic = (d < 0 ? -(int64_t)proposed_m : (int64_t)proposed_m);
    }
    return result;
}

int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
    uint8_t more = denom->more;
    int64_t magic = denom->magic;
    if (magic == 0) { //shift path
        uint32_t shifter = more & LIBDIVIDE_64_SHIFT_MASK;
        int64_t q = numer + ((numer >> 63) & ((1LL << shifter) - 1));
        q = q >> shifter;
        int64_t shiftMask = (int8_t)more >> 7; //must be arithmetic shift and then sign-extend
        q = (q ^ shiftMask) - shiftMask;
        return q;
    }
    else {
        int64_t q = libdivide__mullhi_s64(magic, numer);
        if (more & LIBDIVIDE_ADD_MARKER) {
            int64_t sign = (int8_t)more >> 7; //must be arithmetic shift and then sign extend
            q += ((numer ^ sign) - sign);
        }
        q >>= more & LIBDIVIDE_64_SHIFT_MASK;
        q += (q < 0);
        return q;
    }
}


int libdivide_s64_get_algorithm(const struct libdivide_s64_t *denom) {
    uint8_t more = denom->more;
    int positiveDivisor = ! (more & LIBDIVIDE_NEGATIVE_DIVISOR);
    if (denom->magic == 0) return (positiveDivisor ? 0 : 1); //shift path
    else if (more & LIBDIVIDE_ADD_MARKER) return (positiveDivisor ? 2 : 3);
    else return 4;
}

int64_t libdivide_s64_do_alg0(int64_t numer, const struct libdivide_s64_t *denom) {
    uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK;
    int64_t q = numer + ((numer >> 63) & ((1LL << shifter) - 1));
    return q >> shifter;
}

int64_t libdivide_s64_do_alg1(int64_t numer, const struct libdivide_s64_t *denom) {
    //denom->shifter != -1 && denom->shiftMask != 0
    uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK;
    int64_t q = numer + ((numer >> 63) & ((1LL << shifter) - 1));
    return - (q >> shifter);
}

int64_t libdivide_s64_do_alg2(int64_t numer, const struct libdivide_s64_t *denom) {
    int64_t q = libdivide__mullhi_s64(denom->magic, numer);
    q += numer;
    q >>= denom->more & LIBDIVIDE_64_SHIFT_MASK;
    q += (q < 0);
    return q;
}

int64_t libdivide_s64_do_alg3(int64_t numer, const struct libdivide_s64_t *denom) {
    int64_t q = libdivide__mullhi_s64(denom->magic, numer);
    q -= numer;
    q >>= denom->more & LIBDIVIDE_64_SHIFT_MASK;
    q += (q < 0);
    return q;
}

int64_t libdivide_s64_do_alg4(int64_t numer, const struct libdivide_s64_t *denom) {
    int64_t q = libdivide__mullhi_s64(denom->magic, numer);
    q >>= denom->more;
    q += (q < 0);
    return q;
}


#if LIBDIVIDE_USE_SSE2
__m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t * denom) {
    uint8_t more = denom->more;
    int64_t magic = denom->magic;
    if (magic == 0) { //shift path
        uint32_t shifter = more & LIBDIVIDE_64_SHIFT_MASK;
        __m128i roundToZeroTweak = libdivide__u64_to_m128((1LL << shifter) - 1);
        __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); //q = numer + ((numer >> 63) & roundToZeroTweak);
        q = libdivide_s64_shift_right_vector(q, shifter); // q = q >> shifter
        __m128i shiftMask = _mm_set1_epi32((int32_t)((int8_t)more >> 7));
        q = _mm_sub_epi64(_mm_xor_si128(q, shiftMask), shiftMask); //q = (q ^ shiftMask) - shiftMask;
        return q;
    }
    else {
        __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            __m128i sign = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); //must be arithmetic shift
            q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign)); // q += ((numer ^ sign) - sign);
        }
        q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); //q >>= shift
        q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0)
        return q;
    }
}

__m128i libdivide_s64_do_vector_alg0(__m128i numers, const struct libdivide_s64_t *denom) {
    uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK;
    __m128i roundToZeroTweak = libdivide__u64_to_m128((1LL << shifter) - 1);
    __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak));
    q = libdivide_s64_shift_right_vector(q, shifter);
    return q;
}

__m128i libdivide_s64_do_vector_alg1(__m128i numers, const struct libdivide_s64_t *denom) {
    uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK;
    __m128i roundToZeroTweak = libdivide__u64_to_m128((1LL << shifter) - 1);
    __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak));
    q = libdivide_s64_shift_right_vector(q, shifter);
    return _mm_sub_epi64(_mm_setzero_si128(), q);
}

__m128i libdivide_s64_do_vector_alg2(__m128i numers, const struct libdivide_s64_t *denom) {
    __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
    q = _mm_add_epi64(q, numers);
    q = libdivide_s64_shift_right_vector(q, denom->more & LIBDIVIDE_64_SHIFT_MASK);
    q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0)
    return q;
}

__m128i libdivide_s64_do_vector_alg3(__m128i numers, const struct libdivide_s64_t *denom) {
    __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
    q = _mm_sub_epi64(q, numers);
    q = libdivide_s64_shift_right_vector(q, denom->more & LIBDIVIDE_64_SHIFT_MASK);
    q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0)
    return q;
}

__m128i libdivide_s64_do_vector_alg4(__m128i numers, const struct libdivide_s64_t *denom) {
    __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
    q = libdivide_s64_shift_right_vector(q, denom->more);
    q = _mm_add_epi64(q, _mm_srli_epi64(q, 63));
    return q;
}

#endif

/////////// C++ stuff

#ifdef __cplusplus

/* The C++ template design here is a total mess. This needs to be fixed by someone better at templates than I. The current design is:

- The base is a template divider_base that takes the integer type, the libdivide struct, a generating function, a get algorithm function, a do function, and either a do vector function or a dummy int.
- The base has storage for the libdivide struct. This is the only storage (so the C++ class should be no larger than the libdivide struct).

- Above that, there's divider_mid. This is an empty struct by default, but it is specialized against our four int types. divider_mid contains a template struct algo, that contains a typedef for a specialization of divider_base. struct algo is specialized to take an "algorithm number," where -1 means to use the general algorithm.

- Publicly we have class divider, which inherits from divider_mid::algo. This also takes an algorithm number, which defaults to -1 (the general algorithm).
- divider has an operator / which allows you to use a divider as the divisor in a quotient expression.

*/
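
/* A usage sketch consistent with the design notes above (the divider class
   and its operator / are declared further down in this header):

       libdivide::divider<uint32_t> d(7);       // general algorithm (-1)
       uint32_t q = n / d;                      // divides n by 7

       int algo = d.get_algorithm();            // for unswitched loops, e.g.
       libdivide::divider<uint32_t, 2> d2(7);   // algorithm-2 specialization
*/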
1218
1219namespace libdivide_internal {
1220
1221#if LIBDIVIDE_USE_SSE2
1222#define MAYBE_VECTOR(x) x
1223#define MAYBE_VECTOR_PARAM __m128i vector_func(__m128i, const DenomType *)
1224#else
1225#define MAYBE_VECTOR(x) 0
1226#define MAYBE_VECTOR_PARAM int vector_func
1227#endif
1228
1229 /* Some bogus unswitch functions for unsigned types so the same (presumably templated) code can work for both signed and unsigned. */
1230 uint32_t crash_u32(uint32_t, const libdivide_u32_t *) { abort(); }
1231 uint64_t crash_u64(uint64_t, const libdivide_u64_t *) { abort(); }
1232#ifdef __APPLE__
1233 UInt64 crash_u64(UInt64, const libdivide_u64_t *) { abort(); }
1234#endif
1235#if LIBDIVIDE_USE_SSE2
1236 __m128i crash_u32_vector(__m128i, const libdivide_u32_t *) { abort(); }
1237 __m128i crash_u64_vector(__m128i, const libdivide_u64_t *) { abort(); }
1238#endif
1239
1240 template<typename IntType, typename DenomType, DenomType gen_func(IntType), int get_algo(const DenomType *), IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM>
1241 class divider_base {
1242 public:
1243 DenomType denom;
1244 divider_base(IntType d) : denom(gen_func(d)) { }
1245 divider_base(const DenomType & d) : denom(d) { }
1246
1247 IntType perform_divide(IntType val) const { return do_func(val, &denom); }
1248#if LIBDIVIDE_USE_SSE2
1249 __m128i perform_divide_vector(__m128i val) const { return vector_func(val, &denom); }
1250#endif
1251
1252 int get_algorithm() const { return get_algo(&denom); }
1253 };
1254
1255
1256 template<class T> struct divider_mid { };
1257
1258 template<> struct divider_mid<uint32_t> {
1259 typedef uint32_t IntType;
1260 typedef struct libdivide_u32_t DenomType;
1261 template<IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM> struct denom {
1262 typedef divider_base<IntType, DenomType, libdivide_u32_gen, libdivide_u32_get_algorithm, do_func, vector_func> divider;
1263 };
1264
1265 template<int ALGO, int J = 0> struct algo { };
1266 template<int J> struct algo<-1, J> { typedef denom<libdivide_u32_do, MAYBE_VECTOR(libdivide_u32_do_vector)>::divider divider; };
1267 template<int J> struct algo<0, J> { typedef denom<libdivide_u32_do_alg0, MAYBE_VECTOR(libdivide_u32_do_vector_alg0)>::divider divider; };
1268 template<int J> struct algo<1, J> { typedef denom<libdivide_u32_do_alg1, MAYBE_VECTOR(libdivide_u32_do_vector_alg1)>::divider divider; };
1269 template<int J> struct algo<2, J> { typedef denom<libdivide_u32_do_alg2, MAYBE_VECTOR(libdivide_u32_do_vector_alg2)>::divider divider; };
1270
1271 /* Define two more bogus ones so that the same (templated, presumably) code can handle both signed and unsigned */
1272 template<int J> struct algo<3, J> { typedef denom<crash_u32, MAYBE_VECTOR(crash_u32_vector)>::divider divider; };
1273 template<int J> struct algo<4, J> { typedef denom<crash_u32, MAYBE_VECTOR(crash_u32_vector)>::divider divider; };
1274
1275 };
1276
1277 template<> struct divider_mid<int32_t> {
1278 typedef int32_t IntType;
1279 typedef struct libdivide_s32_t DenomType;
1280 template<IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM> struct denom {
1281 typedef divider_base<IntType, DenomType, libdivide_s32_gen, libdivide_s32_get_algorithm, do_func, vector_func> divider;
1282 };
1283
1284
1285 template<int ALGO, int J = 0> struct algo { };
1286 template<int J> struct algo<-1, J> { typedef denom<libdivide_s32_do, MAYBE_VECTOR(libdivide_s32_do_vector)>::divider divider; };
1287 template<int J> struct algo<0, J> { typedef denom<libdivide_s32_do_alg0, MAYBE_VECTOR(libdivide_s32_do_vector_alg0)>::divider divider; };
1288 template<int J> struct algo<1, J> { typedef denom<libdivide_s32_do_alg1, MAYBE_VECTOR(libdivide_s32_do_vector_alg1)>::divider divider; };
1289 template<int J> struct algo<2, J> { typedef denom<libdivide_s32_do_alg2, MAYBE_VECTOR(libdivide_s32_do_vector_alg2)>::divider divider; };
1290 template<int J> struct algo<3, J> { typedef denom<libdivide_s32_do_alg3, MAYBE_VECTOR(libdivide_s32_do_vector_alg3)>::divider divider; };
1291 template<int J> struct algo<4, J> { typedef denom<libdivide_s32_do_alg4, MAYBE_VECTOR(libdivide_s32_do_vector_alg4)>::divider divider; };
1292
1293 };
1294
#ifdef __APPLE__
    template<> struct divider_mid<Int64> {
        typedef Int64 IntType;
        typedef struct libdivide_s64_t DenomType;
        template<IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM> struct denom {
            typedef divider_base<IntType, DenomType, libdivide_s64_gen, libdivide_s64_get_algorithm, do_func, vector_func> divider;
        };

        template<int ALGO, int J = 0> struct algo { };
        template<int J> struct algo<-1, J> { typedef denom<libdivide_s64_do, MAYBE_VECTOR(libdivide_s64_do_vector)>::divider divider; };
        template<int J> struct algo<0, J> { typedef denom<libdivide_s64_do_alg0, MAYBE_VECTOR(libdivide_s64_do_vector_alg0)>::divider divider; };
        template<int J> struct algo<1, J> { typedef denom<libdivide_s64_do_alg1, MAYBE_VECTOR(libdivide_s64_do_vector_alg1)>::divider divider; };
        template<int J> struct algo<2, J> { typedef denom<libdivide_s64_do_alg2, MAYBE_VECTOR(libdivide_s64_do_vector_alg2)>::divider divider; };
        template<int J> struct algo<3, J> { typedef denom<libdivide_s64_do_alg3, MAYBE_VECTOR(libdivide_s64_do_vector_alg3)>::divider divider; };
        template<int J> struct algo<4, J> { typedef denom<libdivide_s64_do_alg4, MAYBE_VECTOR(libdivide_s64_do_vector_alg4)>::divider divider; };
    };

    template<> struct divider_mid<UInt64> {
        typedef UInt64 IntType;
        typedef struct libdivide_u64_t DenomType;
        template<IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM> struct denom {
            typedef divider_base<IntType, DenomType, libdivide_u64_gen, libdivide_u64_get_algorithm, do_func, vector_func> divider;
        };

        template<int ALGO, int J = 0> struct algo { };
        template<int J> struct algo<-1, J> { typedef denom<libdivide_u64_do, MAYBE_VECTOR(libdivide_u64_do_vector)>::divider divider; };
        template<int J> struct algo<0, J> { typedef denom<libdivide_u64_do_alg0, MAYBE_VECTOR(libdivide_u64_do_vector_alg0)>::divider divider; };
        template<int J> struct algo<1, J> { typedef denom<libdivide_u64_do_alg1, MAYBE_VECTOR(libdivide_u64_do_vector_alg1)>::divider divider; };
        template<int J> struct algo<2, J> { typedef denom<libdivide_u64_do_alg2, MAYBE_VECTOR(libdivide_u64_do_vector_alg2)>::divider divider; };

        /* Define two more bogus specializations: algorithms 3 and 4 exist only for
           the signed case, so route them to the crash functions here. This lets the
           same templated code handle both signed and unsigned types. */
        template<int J> struct algo<3, J> { typedef denom<crash_u64, MAYBE_VECTOR(crash_u64_vector)>::divider divider; };
        template<int J> struct algo<4, J> { typedef denom<crash_u64, MAYBE_VECTOR(crash_u64_vector)>::divider divider; };
    };
#endif

    template<> struct divider_mid<uint64_t> {
        typedef uint64_t IntType;
        typedef struct libdivide_u64_t DenomType;
        template<IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM> struct denom {
            typedef divider_base<IntType, DenomType, libdivide_u64_gen, libdivide_u64_get_algorithm, do_func, vector_func> divider;
        };

        template<int ALGO, int J = 0> struct algo { };
        template<int J> struct algo<-1, J> { typedef denom<libdivide_u64_do, MAYBE_VECTOR(libdivide_u64_do_vector)>::divider divider; };
        template<int J> struct algo<0, J> { typedef denom<libdivide_u64_do_alg0, MAYBE_VECTOR(libdivide_u64_do_vector_alg0)>::divider divider; };
        template<int J> struct algo<1, J> { typedef denom<libdivide_u64_do_alg1, MAYBE_VECTOR(libdivide_u64_do_vector_alg1)>::divider divider; };
        template<int J> struct algo<2, J> { typedef denom<libdivide_u64_do_alg2, MAYBE_VECTOR(libdivide_u64_do_vector_alg2)>::divider divider; };

        /* Define two more bogus specializations: algorithms 3 and 4 exist only for
           the signed case, so route them to the crash functions here. This lets the
           same templated code handle both signed and unsigned types. */
        template<int J> struct algo<3, J> { typedef denom<crash_u64, MAYBE_VECTOR(crash_u64_vector)>::divider divider; };
        template<int J> struct algo<4, J> { typedef denom<crash_u64, MAYBE_VECTOR(crash_u64_vector)>::divider divider; };
    };

    template<> struct divider_mid<int64_t> {
        typedef int64_t IntType;
        typedef struct libdivide_s64_t DenomType;
        template<IntType do_func(IntType, const DenomType *), MAYBE_VECTOR_PARAM> struct denom {
            typedef divider_base<IntType, DenomType, libdivide_s64_gen, libdivide_s64_get_algorithm, do_func, vector_func> divider;
        };

        template<int ALGO, int J = 0> struct algo { };
        template<int J> struct algo<-1, J> { typedef denom<libdivide_s64_do, MAYBE_VECTOR(libdivide_s64_do_vector)>::divider divider; };
        template<int J> struct algo<0, J> { typedef denom<libdivide_s64_do_alg0, MAYBE_VECTOR(libdivide_s64_do_vector_alg0)>::divider divider; };
        template<int J> struct algo<1, J> { typedef denom<libdivide_s64_do_alg1, MAYBE_VECTOR(libdivide_s64_do_vector_alg1)>::divider divider; };
        template<int J> struct algo<2, J> { typedef denom<libdivide_s64_do_alg2, MAYBE_VECTOR(libdivide_s64_do_vector_alg2)>::divider divider; };
        template<int J> struct algo<3, J> { typedef denom<libdivide_s64_do_alg3, MAYBE_VECTOR(libdivide_s64_do_vector_alg3)>::divider divider; };
        template<int J> struct algo<4, J> { typedef denom<libdivide_s64_do_alg4, MAYBE_VECTOR(libdivide_s64_do_vector_alg4)>::divider divider; };
    };

} // close namespace libdivide_internal

template<typename T, int ALGO = -1>
class divider
{
    private:
    typename libdivide_internal::divider_mid<T>::template algo<ALGO>::divider sub;
    template<int NEW_ALGO, typename S> friend divider<S, NEW_ALGO> unswitch(const divider<S, -1> & d);
    divider(const typename libdivide_internal::divider_mid<T>::DenomType & denom) : sub(denom) { }

    public:

    /* Ordinary constructor that takes the divisor as a parameter. */
    divider(T n) : sub(n) { }

    /* Default constructor; divides by 1. */
    divider() : sub(1) { }

    /* Divides the parameter by the divisor, returning the quotient. */
    T perform_divide(T val) const { return sub.perform_divide(val); }

#if LIBDIVIDE_USE_SSE2
    /* Treats the vector as either two or four packed values (depending on the element size), and divides each of them by the divisor, returning the packed quotients. */
    __m128i perform_divide_vector(__m128i val) const { return sub.perform_divide_vector(val); }
#endif

    /* Returns the index of the algorithm in use, for passing to the unswitch function. */
    int get_algorithm() const { return sub.get_algorithm(); }

    /* Two dividers are equal iff their magic numbers and "more" fields match. */
    bool operator==(const divider<T, ALGO> & him) const { return sub.denom.magic == him.sub.denom.magic && sub.denom.more == him.sub.denom.more; }

    bool operator!=(const divider<T, ALGO> & him) const { return ! (*this == him); }
};
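
/* Example usage (an illustrative sketch, not part of the library):

    libdivide::divider<uint32_t> d(7);     // precomputes the magic number for dividing by 7
    uint32_t q = d.perform_divide(1000);   // q == 142, same as 1000 / 7
*/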

/* Returns a divider specialized for the given algorithm. NEW_ALGO must match the
   index reported by the divider's get_algorithm(). */
template<int NEW_ALGO, typename S>
divider<S, NEW_ALGO> unswitch(const divider<S, -1> & d) { return divider<S, NEW_ALGO>(d.sub.denom); }
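
/* Example of the unswitch pattern (an illustrative sketch; divide_all is a
   hypothetical helper, not part of the library). The algorithm index is queried
   once, and each loop runs with the algorithm baked into the divider's type, so
   the per-element dispatch disappears:

    void divide_all(uint32_t *data, size_t count, const libdivide::divider<uint32_t> &d) {
        using namespace libdivide;
        switch (d.get_algorithm()) {
            case 0: { divider<uint32_t, 0> dd = unswitch<0>(d); for (size_t i = 0; i < count; i++) data[i] = data[i] / dd; break; }
            case 1: { divider<uint32_t, 1> dd = unswitch<1>(d); for (size_t i = 0; i < count; i++) data[i] = data[i] / dd; break; }
            case 2: { divider<uint32_t, 2> dd = unswitch<2>(d); for (size_t i = 0; i < count; i++) data[i] = data[i] / dd; break; }
            default: for (size_t i = 0; i < count; i++) data[i] = data[i] / d; break;
        }
    }
*/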

/* Overload of the / operator for scalar division. */
template<typename int_type, int ALGO>
int_type operator/(int_type numer, const divider<int_type, ALGO> & denom) {
    return denom.perform_divide(numer);
}
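
/* Example (illustrative): with the overload above, existing division expressions
   keep their shape; only the divisor's type changes. Note that the numerator must
   already have the divider's integer type for template deduction to succeed:

    libdivide::divider<int64_t> d(-3);
    int64_t n = 9;
    int64_t q = n / d;   // q == -3, same as 9 / -3
*/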

#if LIBDIVIDE_USE_SSE2
/* Overload of the / operator for vector division. */
template<typename int_type, int ALGO>
__m128i operator/(__m128i numer, const divider<int_type, ALGO> & denom) {
    return denom.perform_divide_vector(numer);
}
#endif
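
/* Example (an illustrative sketch, assuming LIBDIVIDE_USE_SSE2 is defined):
   divides four packed 32-bit lanes at once:

    libdivide::divider<uint32_t> d(9);
    __m128i packed = _mm_set_epi32(36, 27, 18, 9);
    __m128i quot = packed / d;   // lanes {36, 27, 18, 9} become {4, 3, 2, 1}
*/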


#endif // __cplusplus

#endif // LIBDIVIDE_HEADER_ONLY
#ifdef __cplusplus
} // close namespace libdivide
} // close anonymous namespace
#endif

#pragma GCC diagnostic pop