transcendental.h source code [Godot/thirdparty/embree/common/math/transcendental.h]

1	// Copyright 2009-2021 Intel Corporation
2	// SPDX-License-Identifier: Apache-2.0
3
4	#pragma once
5
6	// Transcendental functions from "ispc": https://github.com/ispc/ispc/
7	// Most of the transcendental implementations in ispc code come from
8	// Solomon Boulos's "syrah": https://github.com/boulos/syrah/
9
10	#include "../simd/simd.h"
11
12	namespace embree
13	{
14
15	namespace fastapprox
16	{
17
18	template <typename T>
19	__forceinline T sin(const T &v)
20	{
21	static const float piOverTwoVec = `1.57079637050628662109375`;
22	static const float twoOverPiVec = `0.636619746685028076171875`;
23	auto scaled = v * twoOverPiVec;
24	auto kReal = floor(scaled);
25	auto k = toInt(kReal);
26
27	// Reduced range version of x
28	auto x = v - kReal * piOverTwoVec;
29	auto kMod4 = k & `3`;
30	auto sinUseCos = (kMod4 == `1`) \| (kMod4 == `3`);
31	auto flipSign = (kMod4 > `1`);
32
33	// These coefficients are from sollya with fpminimax(sin(x)/x, [\|0, 2,
34	// 4, 6, 8, 10\|], [\|single...\|], [0;Pi/2]);
35	static const float sinC2 = -`0.16666667163372039794921875`;
36	static const float sinC4 = +`8.333347737789154052734375e-3`;
37	static const float sinC6 = -`1.9842604524455964565277099609375e-4`;
38	static const float sinC8 = +`2.760012648650445044040679931640625e-6`;
39	static const float sinC10 = -`2.50293279435709337121807038784027099609375e-8`;
40
41	static const float cosC2 = -`0.5`;
42	static const float cosC4 = +`4.166664183139801025390625e-2`;
43	static const float cosC6 = -`1.388833043165504932403564453125e-3`;
44	static const float cosC8 = +`2.47562347794882953166961669921875e-5`;
45	static const float cosC10 = -`2.59630184018533327616751194000244140625e-7`;
46
47	auto outside = select(sinUseCos, `1.`, x);
48	auto c2 = select(sinUseCos, T(cosC2), T(sinC2));
49	auto c4 = select(sinUseCos, T(cosC4), T(sinC4));
50	auto c6 = select(sinUseCos, T(cosC6), T(sinC6));
51	auto c8 = select(sinUseCos, T(cosC8), T(sinC8));
52	auto c10 = select(sinUseCos, T(cosC10), T(sinC10));
53
54	auto x2 = x * x;
55	auto formula = x2 * c10 + c8;
56	formula = x2 * formula + c6;
57	formula = x2 * formula + c4;
58	formula = x2 * formula + c2;
59	formula = x2 * formula + `1.`;
60	formula *= outside;
61
62	formula = select(flipSign, -formula, formula);
63	return formula;
64	}
65
66	template <typename T>
67	__forceinline T cos(const T &v)
68	{
69	static const float piOverTwoVec = `1.57079637050628662109375`;
70	static const float twoOverPiVec = `0.636619746685028076171875`;
71	auto scaled = v * twoOverPiVec;
72	auto kReal = floor(scaled);
73	auto k = toInt(kReal);
74
75	// Reduced range version of x
76	auto x = v - kReal * piOverTwoVec;
77
78	auto kMod4 = k & `3`;
79	auto cosUseCos = (kMod4 == `0`) \| (kMod4 == `2`);
80	auto flipSign = (kMod4 == `1`) \| (kMod4 == `2`);
81
82	const float sinC2 = -`0.16666667163372039794921875`;
83	const float sinC4 = +`8.333347737789154052734375e-3`;
84	const float sinC6 = -`1.9842604524455964565277099609375e-4`;
85	const float sinC8 = +`2.760012648650445044040679931640625e-6`;
86	const float sinC10 = -`2.50293279435709337121807038784027099609375e-8`;
87
88	const float cosC2 = -`0.5`;
89	const float cosC4 = +`4.166664183139801025390625e-2`;
90	const float cosC6 = -`1.388833043165504932403564453125e-3`;
91	const float cosC8 = +`2.47562347794882953166961669921875e-5`;
92	const float cosC10 = -`2.59630184018533327616751194000244140625e-7`;
93
94	auto outside = select(cosUseCos, `1.`, x);
95	auto c2 = select(cosUseCos, T(cosC2), T(sinC2));
96	auto c4 = select(cosUseCos, T(cosC4), T(sinC4));
97	auto c6 = select(cosUseCos, T(cosC6), T(sinC6));
98	auto c8 = select(cosUseCos, T(cosC8), T(sinC8));
99	auto c10 = select(cosUseCos, T(cosC10), T(sinC10));
100
101	auto x2 = x * x;
102	auto formula = x2 * c10 + c8;
103	formula = x2 * formula + c6;
104	formula = x2 * formula + c4;
105	formula = x2 * formula + c2;
106	formula = x2 * formula + `1.`;
107	formula *= outside;
108
109	formula = select(flipSign, -formula, formula);
110	return formula;
111	}
112
113	template <typename T>
114	__forceinline void sincos(const T &v, T &sinResult, T &cosResult)
115	{
116	const float piOverTwoVec = `1.57079637050628662109375`;
117	const float twoOverPiVec = `0.636619746685028076171875`;
118	auto scaled = v * twoOverPiVec;
119	auto kReal = floor(scaled);
120	auto k = toInt(kReal);
121
122	// Reduced range version of x
123	auto x = v - kReal * piOverTwoVec;
124	auto kMod4 = k & `3`;
125	auto cosUseCos = ((kMod4 == `0`) \| (kMod4 == `2`));
126	auto sinUseCos = ((kMod4 == `1`) \| (kMod4 == `3`));
127	auto sinFlipSign = (kMod4 > `1`);
128	auto cosFlipSign = ((kMod4 == `1`) \| (kMod4 == `2`));
129
130	const float oneVec = +`1.`;
131	const float sinC2 = -`0.16666667163372039794921875`;
132	const float sinC4 = +`8.333347737789154052734375e-3`;
133	const float sinC6 = -`1.9842604524455964565277099609375e-4`;
134	const float sinC8 = +`2.760012648650445044040679931640625e-6`;
135	const float sinC10 = -`2.50293279435709337121807038784027099609375e-8`;
136
137	const float cosC2 = -`0.5`;
138	const float cosC4 = +`4.166664183139801025390625e-2`;
139	const float cosC6 = -`1.388833043165504932403564453125e-3`;
140	const float cosC8 = +`2.47562347794882953166961669921875e-5`;
141	const float cosC10 = -`2.59630184018533327616751194000244140625e-7`;
142
143	auto x2 = x * x;
144
145	auto sinFormula = x2 * sinC10 + sinC8;
146	auto cosFormula = x2 * cosC10 + cosC8;
147	sinFormula = x2 * sinFormula + sinC6;
148	cosFormula = x2 * cosFormula + cosC6;
149
150	sinFormula = x2 * sinFormula + sinC4;
151	cosFormula = x2 * cosFormula + cosC4;
152
153	sinFormula = x2 * sinFormula + sinC2;
154	cosFormula = x2 * cosFormula + cosC2;
155
156	sinFormula = x2 * sinFormula + oneVec;
157	cosFormula = x2 * cosFormula + oneVec;
158
159	sinFormula *= x;
160
161	sinResult = select(sinUseCos, cosFormula, sinFormula);
162	cosResult = select(cosUseCos, cosFormula, sinFormula);
163
164	sinResult = select(sinFlipSign, -sinResult, sinResult);
165	cosResult = select(cosFlipSign, -cosResult, cosResult);
166	}
167
168	template <typename T>
169	__forceinline T tan(const T &v)
170	{
171	const float piOverFourVec = `0.785398185253143310546875`;
172	const float fourOverPiVec = `1.27323949337005615234375`;
173
174	auto xLt0 = v < `0.`;
175	auto y = select(xLt0, -v, v);
176	auto scaled = y * fourOverPiVec;
177
178	auto kReal = floor(scaled);
179	auto k = toInt(kReal);
180
181	auto x = y - kReal * piOverFourVec;
182
183	// If k & 1, x -= Pi/4
184	auto needOffset = (k & `1`) != `0`;
185	x = select(needOffset, x - piOverFourVec, x);
186
187	// If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To...
188	auto kMod4 = k & `3`;
189	auto useCotan = (kMod4 == `1`) \| (kMod4 == `2`);
190
191	const float oneVec = `1.0`;
192
193	const float tanC2 = +`0.33333075046539306640625`;
194	const float tanC4 = +`0.13339905440807342529296875`;
195	const float tanC6 = +`5.3348250687122344970703125e-2`;
196	const float tanC8 = +`2.46033705770969390869140625e-2`;
197	const float tanC10 = +`2.892402000725269317626953125e-3`;
198	const float tanC12 = +`9.500005282461643218994140625e-3`;
199
200	const float cotC2 = -`0.3333333432674407958984375`;
201	const float cotC4 = -`2.222204394638538360595703125e-2`;
202	const float cotC6 = -`2.11752182804048061370849609375e-3`;
203	const float cotC8 = -`2.0846328698098659515380859375e-4`;
204	const float cotC10 = -`2.548247357481159269809722900390625e-5`;
205	const float cotC12 = -`3.5257363606433500535786151885986328125e-7`;
206
207	auto x2 = x * x;
208	T z;
209	if (any(useCotan))
210	{
211	auto cotVal = x2 * cotC12 + cotC10;
212	cotVal = x2 * cotVal + cotC8;
213	cotVal = x2 * cotVal + cotC6;
214	cotVal = x2 * cotVal + cotC4;
215	cotVal = x2 * cotVal + cotC2;
216	cotVal = x2 * cotVal + oneVec;
217	// The equation is for x cot(x) but we need -x * cot(x) for the tan part.*
218	cotVal /= -x;
219	z = cotVal;
220	}
221	auto useTan = !useCotan;
222	if (any(useTan))
223	{
224	auto tanVal = x2 * tanC12 + tanC10;
225	tanVal = x2 * tanVal + tanC8;
226	tanVal = x2 * tanVal + tanC6;
227	tanVal = x2 * tanVal + tanC4;
228	tanVal = x2 * tanVal + tanC2;
229	tanVal = x2 * tanVal + oneVec;
230	// Equation was for tan(x)/x
231	tanVal *= x;
232	z = select(useTan, tanVal, z);
233	}
234	return select(xLt0, -z, z);
235	}
236
237	template <typename T>
238	__forceinline T asin(const T &x0)
239	{
240	auto isneg = (x0 < `0.f`);
241	auto x = abs(x0);
242	auto isnan = (x > `1.f`);
243
244	// sollya
245	// fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [\|0,1,2,3,4,5\|],[\|single...\|],
246	// [1e-20;.9999999999999999]);
247	// avg error: 1.1105439e-06, max error 1.3187528e-06
248	auto v = `1.57079517841339111328125f` +
249	x * (-`0.21450997889041900634765625f` +
250	x * (`8.78556668758392333984375e-2f` +
251	x * (-`4.489909112453460693359375e-2f` +
252	x * (`1.928029954433441162109375e-2f` +
253	x * (-`4.3095736764371395111083984375e-3f`)))));
254
255	v *= -sqrt(`1.f` - x);
256	v = v + `1.57079637050628662109375f`;
257
258	v = select(v < `0.f`, T(`0.f`), v);
259	v = select(isneg, -v, v);
260	v = select(isnan, T(cast_i2f(`0x7fc00000`)), v);
261
262	return v;
263	}
264
265	template <typename T>
266	__forceinline T acos(const T &v)
267	{
268	return `1.57079637050628662109375f` - asin(v);
269	}
270
271	template <typename T>
272	__forceinline T atan(const T &v)
273	{
274	const float piOverTwoVec = `1.57079637050628662109375`;
275	// atan(-x) = -atan(x) (so flip from negative to positive first)
276	// If x > 1 -> atan(x) = Pi/2 - atan(1/x)
277	auto xNeg = v < `0.f`;
278	auto xFlipped = select(xNeg, -v, v);
279
280	auto xGt1 = xFlipped > `1.`;
281	auto x = select(xGt1, rcpSafe(xFlipped), xFlipped);
282
283	// These coefficients approximate atan(x)/x
284	const float atanC0 = +`0.99999988079071044921875`;
285	const float atanC2 = -`0.3333191573619842529296875`;
286	const float atanC4 = +`0.199689209461212158203125`;
287	const float atanC6 = -`0.14015688002109527587890625`;
288	const float atanC8 = +`9.905083477497100830078125e-2`;
289	const float atanC10 = -`5.93664981424808502197265625e-2`;
290	const float atanC12 = +`2.417283318936824798583984375e-2`;
291	const float atanC14 = -`4.6721356920897960662841796875e-3`;
292
293	auto x2 = x * x;
294	auto result = x2 * atanC14 + atanC12;
295	result = x2 * result + atanC10;
296	result = x2 * result + atanC8;
297	result = x2 * result + atanC6;
298	result = x2 * result + atanC4;
299	result = x2 * result + atanC2;
300	result = x2 * result + atanC0;
301	result *= x;
302
303	result = select(xGt1, piOverTwoVec - result, result);
304	result = select(xNeg, -result, result);
305	return result;
306	}
307
308	template <typename T>
309	__forceinline T atan2(const T &y, const T &x)
310	{
311	const float piVec = `3.1415926536`;
312	// atan2(y, x) =
313	//
314	// atan2(y > 0, x = +-0) -> Pi/2
315	// atan2(y < 0, x = +-0) -> -Pi/2
316	// atan2(y = +-0, x < +0) -> +-Pi
317	// atan2(y = +-0, x >= +0) -> +-0
318	//
319	// atan2(y >= 0, x < 0) -> Pi + atan(y/x)
320	// atan2(y < 0, x < 0) -> -Pi + atan(y/x)
321	// atan2(y, x > 0) -> atan(y/x)
322	//
323	// and then a bunch of code for dealing with infinities.
324	auto yOverX = y * rcpSafe(x);
325	auto atanArg = atan(yOverX);
326	auto xLt0 = x < `0.f`;
327	auto yLt0 = y < `0.f`;
328	auto offset = select(xLt0,
329	select(yLt0, T(-piVec), T(piVec)), `0.f`);
330	return offset + atanArg;
331	}
332
333	template <typename T>
334	__forceinline T exp(const T &v)
335	{
336	const float ln2Part1 = `0.6931457519`;
337	const float ln2Part2 = `1.4286067653e-6`;
338	const float oneOverLn2 = `1.44269502162933349609375`;
339
340	auto scaled = v * oneOverLn2;
341	auto kReal = floor(scaled);
342	auto k = toInt(kReal);
343
344	// Reduced range version of x
345	auto x = v - kReal * ln2Part1;
346	x -= kReal * ln2Part2;
347
348	// These coefficients are for e^x in [0, ln(2)]
349	const float one = `1.`;
350	const float c2 = `0.4999999105930328369140625`;
351	const float c3 = `0.166668415069580078125`;
352	const float c4 = `4.16539050638675689697265625e-2`;
353	const float c5 = `8.378830738365650177001953125e-3`;
354	const float c6 = `1.304379315115511417388916015625e-3`;
355	const float c7 = `2.7555381529964506626129150390625e-4`;
356
357	auto result = x * c7 + c6;
358	result = x * result + c5;
359	result = x * result + c4;
360	result = x * result + c3;
361	result = x * result + c2;
362	result = x * result + one;
363	result = x * result + one;
364
365	// Compute 2^k (should differ for float and double, but I'll avoid
366	// it for now and just do floats)
367	const int fpbias = `127`;
368	auto biasedN = k + fpbias;
369	auto overflow = kReal > fpbias;
370	// Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)
371	// we've got underflow. -127 ln(2) -> -88.02. So the most*
372	// negative float input that doesn't result in zero is like -88.
373	auto underflow = kReal <= -fpbias;
374	const int infBits = `0x7f800000`;
375	biasedN <<= `23`;
376	// Reinterpret this thing as float
377	auto twoToTheN = asFloat(biasedN);
378	// Handle both doubles and floats (hopefully eliding the copy for float)
379	auto elemtype2n = twoToTheN;
380	result *= elemtype2n;
381	result = select(overflow, cast_i2f(infBits), result);
382	result = select(underflow, `0.`, result);
383	return result;
384	}
385
386	// Range reduction for logarithms takes log(x) -> log(2^n y) -> n*
387	// log(2) + log(y) where y is the reduced range (usually in [1/2, 1)).*
388	template <typename T, typename R>
389	__forceinline void __rangeReduceLog(const T &input,
390	T &reduced,
391	R &exponent)
392	{
393	auto intVersion = asInt(input);
394	// single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
395	// exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000
396	// 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0
397	// non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111
398	// = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF
399
400	//const int exponentMask(0x7F800000)
401	static const int nonexponentMask = `0x807FFFFF`;
402
403	// We want the reduced version to have an exponent of -1 which is
404	// -1 + 127 after biasing or 126
405	static const int exponentNeg1 = (`126l` << `23`);
406	// NOTE(boulos): We don't need to mask anything out since we know
407	// the sign bit has to be 0. If it's 1, we need to return infinity/nan
408	// anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN).
409	auto biasedExponent = intVersion >> `23`; // This number is [0, 255] but it means [-127, 128]
410
411	auto offsetExponent = biasedExponent + `1`; // Treat the number as if it were 2^{e+1} (1.m)/2*
412	exponent = offsetExponent - `127`; // get the real value
413
414	// Blend the offset_exponent with the original input (do this in
415	// int for now, until I decide if float can have & and &not)
416	auto blended = (intVersion & nonexponentMask) \| (exponentNeg1);
417	reduced = asFloat(blended);
418	}
419
420	template <typename T> struct ExponentType { };
421	template <int N> struct ExponentType<vfloat_impl<N>> { typedef vint<N> Ty; };
422	template <> struct ExponentType<float> { typedef int Ty; };
423
424	template <typename T>
425	__forceinline T log(const T &v)
426	{
427	T reduced;
428	typename ExponentType<T>::Ty exponent;
429
430	const int nanBits = `0x7fc00000`;
431	const int negInfBits = `0xFF800000`;
432	const float nan = cast_i2f(nanBits);
433	const float negInf = cast_i2f(negInfBits);
434	auto useNan = v < `0.`;
435	auto useInf = v == `0.`;
436	auto exceptional = useNan \| useInf;
437	const float one = `1.0`;
438
439	auto patched = select(exceptional, one, v);
440	__rangeReduceLog(patched, reduced, exponent);
441
442	const float ln2 = `0.693147182464599609375`;
443
444	auto x1 = one - reduced;
445	const float c1 = +`0.50000095367431640625`;
446	const float c2 = +`0.33326041698455810546875`;
447	const float c3 = +`0.2519190013408660888671875`;
448	const float c4 = +`0.17541764676570892333984375`;
449	const float c5 = +`0.3424419462680816650390625`;
450	const float c6 = -`0.599632322788238525390625`;
451	const float c7 = +`1.98442304134368896484375`;
452	const float c8 = -`2.4899270534515380859375`;
453	const float c9 = +`1.7491014003753662109375`;
454
455	auto result = x1 * c9 + c8;
456	result = x1 * result + c7;
457	result = x1 * result + c6;
458	result = x1 * result + c5;
459	result = x1 * result + c4;
460	result = x1 * result + c3;
461	result = x1 * result + c2;
462	result = x1 * result + c1;
463	result = x1 * result + one;
464
465	// Equation was for -(ln(red)/(1-red))
466	result *= -x1;
467	result += toFloat(exponent) * ln2;
468
469	return select(exceptional,
470	select(useNan, T(nan), T(negInf)),
471	result);
472	}
473
474	template <typename T>
475	__forceinline T pow(const T &x, const T &y)
476	{
477	auto x1 = abs(x);
478	auto z = exp(y * log(x1));
479
480	// Handle special cases
481	const float twoOver23 = `8388608.0f`;
482	auto yInt = y == round(y);
483	auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << `31`, `0`); // set sign bit
484
485	// x == 0
486	z = select(x == `0.0f`,
487	select(y < `0.0f`, T(inf) \| signmsk(x),
488	select(y == `0.0f`, T(`1.0f`), asFloat(yOddInt) & x)), z);
489
490	// x < 0
491	auto xNegative = x < `0.0f`;
492	if (any(xNegative))
493	{
494	auto z1 = z \| asFloat(yOddInt);
495	z1 = select(yInt, z1, std::numeric_limits<float>::quiet_NaN());
496	z = select(xNegative, z1, z);
497	}
498
499	auto xFinite = isfinite(x);
500	auto yFinite = isfinite(y);
501	if (all(xFinite & yFinite))
502	return z;
503
504	// x finite and y infinite
505	z = select(andn(xFinite, yFinite),
506	select(x1 == `1.0f`, `1.0f`,
507	select((x1 > `1.0f`) ^ (y < `0.0f`), inf, T(`0.0f`))), z);
508
509	// x infinite
510	z = select(xFinite, z,
511	select(y == `0.0f`, `1.0f`,
512	select(y < `0.0f`, T(`0.0f`), inf) \| (asFloat(yOddInt) & x)));
513
514	return z;
515	}
516
517	template <typename T>
518	__forceinline T pow(const T &x, float y)
519	{
520	return pow(x, T(y));
521	}
522
523	} // namespace fastapprox
524
525	} // namespace embree
526

Browse the source code of Godot/thirdparty/embree/common/math/transcendental.h