astcenc_vecmathlib.h source code [Godot/thirdparty/astcenc/astcenc_vecmathlib.h]

1	// SPDX-License-Identifier: Apache-2.0
2	// ----------------------------------------------------------------------------
3	// Copyright 2019-2022 Arm Limited
4	// Copyright 2008 Jose Fonseca
5	//
6	// Licensed under the Apache License, Version 2.0 (the "License"); you may not
7	// use this file except in compliance with the License. You may obtain a copy
8	// of the License at:
9	//
10	// http://www.apache.org/licenses/LICENSE-2.0
11	//
12	// Unless required by applicable law or agreed to in writing, software
13	// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14	// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
15	// License for the specific language governing permissions and limitations
16	// under the License.
17	// ----------------------------------------------------------------------------
18
19	/*
20	* This module implements vector support for floats, ints, and vector lane
21	* control masks. It provides access to both explicit vector width types, and
22	* flexible N-wide types where N can be determined at compile time.
23	*
24	* The design of this module encourages use of vector length agnostic code, via
25	* the vint, vfloat, and vmask types. These will take on the widest SIMD vector
26	* with that is available at compile time. The current vector width is
27	* accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
28	*
29	* Explicit scalar types are accessible via the vint1, vfloat1, vmask1 types.
30	* These are provided primarily for prototyping and algorithm debug of VLA
31	* implementations.
32	*
33	* Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
34	* types. These are provided for use by VLA code, but are also expected to be
35	* used as a fixed-width type and will supported a reference C++ fallback for
36	* use on platforms without SIMD intrinsics.
37	*
38	* Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
39	* types. These are provide for use by VLA code, and are not expected to be
40	* used as a fixed-width type in normal code. No reference C implementation is
41	* provided on platforms without underlying SIMD intrinsics.
42	*
43	* With the current implementation ISA support is provided for:
44	*
45	* * 1-wide for scalar reference.
46	* * 4-wide for Armv8-A NEON.
47	* * 4-wide for x86-64 SSE2.
48	* * 4-wide for x86-64 SSE4.1.
49	* * 8-wide for x86-64 AVX2.
50	*/
51
52	#ifndef ASTC_VECMATHLIB_H_INCLUDED
53	#define ASTC_VECMATHLIB_H_INCLUDED
54
55	#if ASTCENC_SSE != 0 \|\| ASTCENC_AVX != 0
56	#include <immintrin.h>
57	#elif ASTCENC_NEON != 0
58	#include <arm_neon.h>
59	#endif
60
61	#if !defined(__clang__) && defined(_MSC_VER)
62	#define ASTCENC_SIMD_INLINE __forceinline
63	#define ASTCENC_NO_INLINE
64	#elif defined(__GNUC__) && !defined(__clang__)
65	#define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline
66	#define ASTCENC_NO_INLINE __attribute__ ((noinline))
67	#else
68	#define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline
69	#define ASTCENC_NO_INLINE __attribute__ ((noinline))
70	#endif
71
72	#if ASTCENC_AVX >= 2
73	/ If we have AVX2 expose 8-wide VLA. /
74	#include "astcenc_vecmathlib_sse_4.h"
75	#include "astcenc_vecmathlib_common_4.h"
76	#include "astcenc_vecmathlib_avx2_8.h"
77
78	#define ASTCENC_SIMD_WIDTH 8
79
80	using vfloat = vfloat8;
81
82	#if defined(ASTCENC_NO_INVARIANCE)
83	using vfloatacc = vfloat8;
84	#else
85	using vfloatacc = vfloat4;
86	#endif
87
88	using vint = vint8;
89	using vmask = vmask8;
90
91	constexpr auto loada = vfloat8::loada;
92	constexpr auto load1 = vfloat8::load1;
93
94	#elif ASTCENC_SSE >= 20
95	/ If we have SSE expose 4-wide VLA, and 4-wide fixed width. /
96	#include "astcenc_vecmathlib_sse_4.h"
97	#include "astcenc_vecmathlib_common_4.h"
98
99	#define ASTCENC_SIMD_WIDTH 4
100
101	using vfloat = vfloat4;
102	using vfloatacc = vfloat4;
103	using vint = vint4;
104	using vmask = vmask4;
105
106	constexpr auto loada = vfloat4::loada;
107	constexpr auto load1 = vfloat4::load1;
108
109	#elif ASTCENC_NEON > 0
110	/ If we have NEON expose 4-wide VLA. /
111	#include "astcenc_vecmathlib_neon_4.h"
112	#include "astcenc_vecmathlib_common_4.h"
113
114	#define ASTCENC_SIMD_WIDTH 4
115
116	using vfloat = vfloat4;
117	using vfloatacc = vfloat4;
118	using vint = vint4;
119	using vmask = vmask4;
120
121	constexpr auto loada = vfloat4::loada;
122	constexpr auto load1 = vfloat4::load1;
123
124	#else
125	// If we have nothing expose 4-wide VLA, and 4-wide fixed width.
126
127	// Note: We no longer expose the 1-wide scalar fallback because it is not
128	// invariant with the 4-wide path due to algorithms that use horizontal
129	// operations that accumulate a local vector sum before accumulating into
130	// a running sum.
131	//
132	// For 4 items adding into an accumulator using 1-wide vectors the sum is:
133	//
134	// result = ((((sum + l0) + l1) + l2) + l3)
135	//
136	// ... whereas the accumulator for a 4-wide vector sum is:
137	//
138	// result = sum + ((l0 + l2) + (l1 + l3))
139	//
140	// In "normal maths" this is the same, but the floating point reassociation
141	// differences mean that these will not produce the same result.
142
143	#include "astcenc_vecmathlib_none_4.h"
144	#include "astcenc_vecmathlib_common_4.h"
145
146	#define ASTCENC_SIMD_WIDTH 4
147
148	using vfloat = vfloat4;
149	using vfloatacc = vfloat4;
150	using vint = vint4;
151	using vmask = vmask4;
152
153	constexpr auto loada = vfloat4::loada;
154	constexpr auto load1 = vfloat4::load1;
155	#endif
156
157	/**
158	* @brief Round a count down to the largest multiple of 8.
159	*
160	* @param count The unrounded value.
161	*
162	* @return The rounded value.
163	*/
164	ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int count)
165	{
166	return count & static_cast<unsigned int>(~(`8` - `1`));
167	}
168
169	/**
170	* @brief Round a count down to the largest multiple of 4.
171	*
172	* @param count The unrounded value.
173	*
174	* @return The rounded value.
175	*/
176	ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int count)
177	{
178	return count & static_cast<unsigned int>(~(`4` - `1`));
179	}
180
181	/**
182	* @brief Round a count down to the largest multiple of the SIMD width.
183	*
184	* Assumption that the vector width is a power of two ...
185	*
186	* @param count The unrounded value.
187	*
188	* @return The rounded value.
189	*/
190	ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int count)
191	{
192	return count & static_cast<unsigned int>(~(ASTCENC_SIMD_WIDTH - `1`));
193	}
194
195	/**
196	* @brief Round a count up to the largest multiple of the SIMD width.
197	*
198	* Assumption that the vector width is a power of two ...
199	*
200	* @param count The unrounded value.
201	*
202	* @return The rounded value.
203	*/
204	ASTCENC_SIMD_INLINE unsigned int round_up_to_simd_multiple_vla(unsigned int count)
205	{
206	unsigned int multiples = (count + ASTCENC_SIMD_WIDTH - `1`) / ASTCENC_SIMD_WIDTH;
207	return multiples * ASTCENC_SIMD_WIDTH;
208	}
209
210	/**
211	* @brief Return @c a with lanes negated if the @c b lane is negative.
212	*/
213	ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
214	{
215	vint ia = float_as_int(a);
216	vint ib = float_as_int(b);
217	vint sign_mask(static_cast<int>(`0x80000000`));
218	vint r = ia ^ (ib & sign_mask);
219	return int_as_float(r);
220	}
221
222	/**
223	* @brief Return fast, but approximate, vector atan(x).
224	*
225	* Max error of this implementation is 0.004883.
226	*/
227	ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
228	{
229	vmask c = abs(x) > vfloat (`1.0f`);
230	vfloat z = change_sign(vfloat (astc::PI_OVER_TWO), x);
231	vfloat y = select(x, vfloat (`1.0f`) / x, c);
232	y = y / (y * y * vfloat (`0.28f`) + vfloat (`1.0f`));
233	return select(y, z - y, c);
234	}
235
236	/**
237	* @brief Return fast, but approximate, vector atan2(x, y).
238	*/
239	ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
240	{
241	vfloat z = atan(abs(y / x));
242	vmask xmask = vmask (float_as_int(x).m);
243	return change_sign(select_msb(z, vfloat (astc::PI) - z, xmask), y);
244	}
245
246	/*
247	* @brief Factory that returns a unit length 4 component vfloat4.
248	*/
249	static ASTCENC_SIMD_INLINE vfloat4 unit4()
250	{
251	return vfloat4 (`0.5f`);
252	}
253
254	/**
255	* @brief Factory that returns a unit length 3 component vfloat4.
256	*/
257	static ASTCENC_SIMD_INLINE vfloat4 unit3()
258	{
259	float val = `0.577350258827209473f`;
260	return vfloat4 (val, val, val, `0.0f`);
261	}
262
263	/**
264	* @brief Factory that returns a unit length 2 component vfloat4.
265	*/
266	static ASTCENC_SIMD_INLINE vfloat4 unit2()
267	{
268	float val = `0.707106769084930420f`;
269	return vfloat4 (val, val, `0.0f`, `0.0f`);
270	}
271
272	/**
273	* @brief Factory that returns a 3 component vfloat4.
274	*/
275	static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
276	{
277	return vfloat4 (a, b, c, `0.0f`);
278	}
279
280	/**
281	* @brief Factory that returns a 2 component vfloat4.
282	*/
283	static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
284	{
285	return vfloat4 (a, b, `0.0f`, `0.0f`);
286	}
287
288	/**
289	* @brief Normalize a non-zero length vector to unit length.
290	*/
291	static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
292	{
293	vfloat4 length = dot(a, a);
294	return a / sqrt(length);
295	}
296
297	/**
298	* @brief Normalize a vector, returning @c safe if len is zero.
299	*/
300	static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
301	{
302	vfloat4 length = dot(a, a);
303	if (length.lane<`0`>() != `0.0f`)
304	{
305	return a / sqrt(length);
306	}
307
308	return safe;
309	}
310
311
312
313	#define POLY0(x, c0) ( c0)
314	#define POLY1(x, c0, c1) ((POLY0(x, c1) * x) + c0)
315	#define POLY2(x, c0, c1, c2) ((POLY1(x, c1, c2) * x) + c0)
316	#define POLY3(x, c0, c1, c2, c3) ((POLY2(x, c1, c2, c3) * x) + c0)
317	#define POLY4(x, c0, c1, c2, c3, c4) ((POLY3(x, c1, c2, c3, c4) * x) + c0)
318	#define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4(x, c1, c2, c3, c4, c5) * x) + c0)
319
320	/**
321	* @brief Compute an approximate exp2(x) for each lane in the vector.
322	*
323	* Based on 5th degree minimax polynomials, ported from this blog
324	* https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
325	*/
326	static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
327	{
328	x = clamp(-`126.99999f`, `129.0f`, x);
329
330	vint4 ipart = float_to_int(x - `0.5f`);
331	vfloat4 fpart = x - int_to_float(ipart);
332
333	// Integer contrib, using 1 << ipart
334	vfloat4 iexp = int_as_float(lsl<`23`>(ipart + `127`));
335
336	// Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5)
337	vfloat4 fexp = POLY5(fpart,
338	`9.9999994e-1f`,
339	`6.9315308e-1f`,
340	`2.4015361e-1f`,
341	`5.5826318e-2f`,
342	`8.9893397e-3f`,
343	`1.8775767e-3f`);
344
345	return iexp * fexp;
346	}
347
348	/**
349	* @brief Compute an approximate log2(x) for each lane in the vector.
350	*
351	* Based on 5th degree minimax polynomials, ported from this blog
352	* https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
353	*/
354	static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
355	{
356	vint4 exp(`0x7F800000`);
357	vint4 mant(`0x007FFFFF`);
358	vint4 one(`0x3F800000`);
359
360	vint4 i = float_as_int(x);
361
362	vfloat4 e = int_to_float(lsr<`23`>(i & exp) - `127`);
363
364	vfloat4 m = int_as_float((i & mant) \| one);
365
366	// Polynomial fit of log2(x)/(x - 1), for x in range [1, 2)
367	vfloat4 p = POLY4(m,
368	`2.8882704548164776201f`,
369	-`2.52074962577807006663f`,
370	`1.48116647521213171641f`,
371	-`0.465725644288844778798f`,
372	`0.0596515482674574969533f`);
373
374	// Increases the polynomial degree, but ensures that log2(1) == 0
375	p = p * (m - `1.0f`);
376
377	return p + e;
378	}
379
380	/**
381	* @brief Compute an approximate pow(x, y) for each lane in the vector.
382	*
383	* Power function based on the exp2(log2(x) * y) transform.
384	*/
385	static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
386	{
387	vmask4 zero_mask = y == vfloat4 (`0.0f`);
388	vfloat4 estimate = exp2(log2(x) * y);
389
390	// Guarantee that y == 0 returns exactly 1.0f
391	return select(estimate, vfloat4 (`1.0f`), zero_mask);
392	}
393
394	/**
395	* @brief Count the leading zeros for each lane in @c a.
396	*
397	* Valid for all data values of @c a; will return a per-lane value [0, 32].
398	*/
399	static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
400	{
401	// This function is a horrible abuse of floating point exponents to convert
402	// the original integer value into a 2^N encoding we can recover easily.
403
404	// Convert to float without risk of rounding up by keeping only top 8 bits.
405	// This trick is is guaranteed to keep top 8 bits and clear the 9th.
406	a = (~lsr<`8`>(a)) & a;
407	a = float_as_int(int_to_float(a));
408
409	// Extract and unbias exponent
410	a = vint4 (`127` + `31`) - lsr<`23`>(a);
411
412	// Clamp result to a valid 32-bit range
413	return clamp(`0`, `32`, a);
414	}
415
416	/**
417	* @brief Return lanewise 2^a for each lane in @c a.
418	*
419	* Use of signed int means that this is only valid for values in range [0, 31].
420	*/
421	static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
422	{
423	// 2^30 is the largest signed number than can be represented
424	assert(all(a < vint4(`31`)));
425
426	// This function is a horrible abuse of floating point to use the exponent
427	// and float conversion to generate a 2^N multiple.
428
429	// Bias the exponent
430	vint4 exp = a + `127`;
431	exp = lsl<`23`>(exp);
432
433	// Reinterpret the bits as a float, and then convert to an int
434	vfloat4 f = int_as_float(exp);
435	return float_to_int(f);
436	}
437
438	/**
439	* @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
440	*/
441	static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
442	{
443	vint4 fp16_one = vint4 (`0x3C00`);
444	vint4 fp16_small = lsl<`8`>(p);
445
446	vmask4 is_one = p == vint4 (`0xFFFF`);
447	vmask4 is_small = p < vint4 (`4`);
448
449	// Manually inline clz() on Visual Studio to avoid release build codegen bug
450	// see https://github.com/ARM-software/astc-encoder/issues/259
451	#if !defined(__clang__) && defined(_MSC_VER)
452	vint4 a = (~lsr<`8`>(p)) & p;
453	a = float_as_int(int_to_float(a));
454	a = vint4(`127` + `31`) - lsr<`23`>(a);
455	vint4 lz = clamp(`0`, `32`, a) - `16`;
456	#else
457	vint4 lz = clz(p) - `16`;
458	#endif
459
460	p = p * two_to_the_n(lz + `1`);
461	p = p & vint4 (`0xFFFF`);
462
463	p = lsr<`6`>(p);
464
465	p = p \| lsl<`10`>(vint4 (`14`) - lz);
466
467	vint4 r = select(p, fp16_one, is_one);
468	r = select(r, fp16_small, is_small);
469	return r;
470	}
471
472	/**
473	* @brief Convert 16-bit LNS to float16.
474	*/
475	static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
476	{
477	vint4 mc = p & `0x7FF`;
478	vint4 ec = lsr<`11`>(p);
479
480	vint4 mc_512 = mc * `3`;
481	vmask4 mask_512 = mc < vint4 (`512`);
482
483	vint4 mc_1536 = mc * `4` - `512`;
484	vmask4 mask_1536 = mc < vint4 (`1536`);
485
486	vint4 mc_else = mc * `5` - `2048`;
487
488	vint4 mt = mc_else;
489	mt = select(mt, mc_1536, mask_1536);
490	mt = select(mt, mc_512, mask_512);
491
492	vint4 res = lsl<`10`>(ec) \| lsr<`3`>(mt);
493	return min(res, vint4 (`0x7BFF`));
494	}
495
496	/**
497	* @brief Extract mantissa and exponent of a float value.
498	*
499	* @param a The input value.
500	* @param[out] exp The output exponent.
501	*
502	* @return The mantissa.
503	*/
504	static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
505	{
506	// Interpret the bits as an integer
507	vint4 ai = float_as_int(a);
508
509	// Extract and unbias the exponent
510	exp = (lsr<`23`>(ai) & `0xFF`) - `126`;
511
512	// Extract and unbias the mantissa
513	vint4 manti = (ai & static_cast<int>(`0x807FFFFF`)) \| `0x3F000000`;
514	return int_as_float(manti);
515	}
516
517	/**
518	* @brief Convert float to 16-bit LNS.
519	*/
520	static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
521	{
522	vint4 exp;
523	vfloat4 mant = frexp(a, exp);
524
525	// Do these early before we start messing about ...
526	vmask4 mask_underflow_nan = ~(a > vfloat4 (`1.0f` / `67108864.0f`));
527	vmask4 mask_infinity = a >= vfloat4 (`65536.0f`);
528
529	// If input is smaller than 2^-14, multiply by 2^25 and don't bias.
530	vmask4 exp_lt_m13 = exp < vint4 (-`13`);
531
532	vfloat4 a1a = a * `33554432.0f`;
533	vint4 expa = vint4::zero();
534
535	vfloat4 a1b = (mant - `0.5f`) * `4096`;
536	vint4 expb = exp + `14`;
537
538	a = select(a1b, a1a, exp_lt_m13);
539	exp = select(expb, expa, exp_lt_m13);
540
541	vmask4 a_lt_384 = a < vfloat4 (`384.0f`);
542	vmask4 a_lt_1408 = a <= vfloat4 (`1408.0f`);
543
544	vfloat4 a2a = a * (`4.0f` / `3.0f`);
545	vfloat4 a2b = a + `128.0f`;
546	vfloat4 a2c = (a + `512.0f`) * (`4.0f` / `5.0f`);
547
548	a = a2c;
549	a = select(a, a2b, a_lt_1408);
550	a = select(a, a2a, a_lt_384);
551
552	a = a + (int_to_float(exp) * `2048.0f`) + `1.0f`;
553
554	a = select(a, vfloat4 (`65535.0f`), mask_infinity);
555	a = select(a, vfloat4::zero(), mask_underflow_nan);
556
557	return a;
558	}
559
560	namespace astc
561	{
562
563	static ASTCENC_SIMD_INLINE float pow(float x, float y)
564	{
565	return pow(vfloat4 (x), vfloat4 (y)).lane<`0`>();
566	}
567
568	}
569
570	#endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED
571

Browse the source code of Godot/thirdparty/astcenc/astcenc_vecmathlib.h