vertexfilter.cpp source code [Godot/thirdparty/meshoptimizer/vertexfilter.cpp]

1	// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
2	#include "meshoptimizer.h"
3
4	#include <math.h>
5	#include <string.h>
6
7	// The block below auto-detects SIMD ISA that can be used on the target platform
8	#ifndef MESHOPTIMIZER_NO_SIMD
9
10	// The SIMD implementation requires SSE2, which can be enabled unconditionally through compiler settings
11	#if defined(__SSE2__)
12	#define SIMD_SSE
13	#endif
14
15	// MSVC supports compiling SSE2 code regardless of compile options; we assume all 32-bit CPUs support SSE2
16	#if !defined(SIMD_SSE) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) \|\| defined(_M_X64))
17	#define SIMD_SSE
18	#endif
19
20	// GCC/clang define these when NEON support is available
21	#if defined(__ARM_NEON__) \|\| defined(__ARM_NEON)
22	#define SIMD_NEON
23	#endif
24
25	// On MSVC, we assume that ARM builds always target NEON-capable devices
26	#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) \|\| defined(_M_ARM64))
27	#define SIMD_NEON
28	#endif
29
30	// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
31	#if defined(__wasm_simd128__)
32	#define SIMD_WASM
33	#endif
34
35	#endif // !MESHOPTIMIZER_NO_SIMD
36
37	#ifdef SIMD_SSE
38	#include <emmintrin.h>
39	#include <stdint.h>
40	#endif
41
42	#ifdef _MSC_VER
43	#include <intrin.h>
44	#endif
45
46	#ifdef SIMD_NEON
47	#if defined(_MSC_VER) && defined(_M_ARM64)
48	#include <arm64_neon.h>
49	#else
50	#include <arm_neon.h>
51	#endif
52	#endif
53
54	#ifdef SIMD_WASM
55	#undef __DEPRECATED
56	#include <wasm_simd128.h>
57	#endif
58
59	#ifdef SIMD_WASM
60	#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
61	#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
62	#define wasmx_unziplo_v32x4(a, b) wasm_v32x4_shuffle(a, b, 0, 2, 4, 6)
63	#define wasmx_unziphi_v32x4(a, b) wasm_v32x4_shuffle(a, b, 1, 3, 5, 7)
64	#endif
65
66	namespace meshopt
67	{
68
69	#if !defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_WASM)
70	template <typename T>
71	static void decodeFilterOct(T* data, size_t count)
72	{
73	const float max = float((`1` << (sizeof(T) * `8` - `1`)) - `1`);
74
75	for (size_t i = `0`; i < count; ++i)
76	{
77	// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
78	float x = float(data[i * `4` + `0`]);
79	float y = float(data[i * `4` + `1`]);
80	float z = float(data[i * `4` + `2`]) - fabsf(x) - fabsf(y);
81
82	// fixup octahedral coordinates for z<0
83	float t = (z >= `0.f`) ? `0.f` : z;
84
85	x += (x >= `0.f`) ? t : -t;
86	y += (y >= `0.f`) ? t : -t;
87
88	// compute normal length & scale
89	float l = sqrtf(x * x + y * y + z * z);
90	float s = max / l;
91
92	// rounded signed float->int
93	int xf = int(x * s + (x >= `0.f` ? `0.5f` : -`0.5f`));
94	int yf = int(y * s + (y >= `0.f` ? `0.5f` : -`0.5f`));
95	int zf = int(z * s + (z >= `0.f` ? `0.5f` : -`0.5f`));
96
97	data[i * `4` + `0`] = T(xf);
98	data[i * `4` + `1`] = T(yf);
99	data[i * `4` + `2`] = T(zf);
100	}
101	}
102
103	static void decodeFilterQuat(short* data, size_t count)
104	{
105	const float scale = `1.f` / sqrtf(`2.f`);
106
107	for (size_t i = `0`; i < count; ++i)
108	{
109	// recover scale from the high byte of the component
110	int sf = data[i * `4` + `3`] \| `3`;
111	float ss = scale / float(sf);
112
113	// convert x/y/z to [-1..1] (scaled...)
114	float x = float(data[i * `4` + `0`]) * ss;
115	float y = float(data[i * `4` + `1`]) * ss;
116	float z = float(data[i * `4` + `2`]) * ss;
117
118	// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
119	float ww = `1.f` - x * x - y * y - z * z;
120	float w = sqrtf(ww >= `0.f` ? ww : `0.f`);
121
122	// rounded signed float->int
123	int xf = int(x * `32767.f` + (x >= `0.f` ? `0.5f` : -`0.5f`));
124	int yf = int(y * `32767.f` + (y >= `0.f` ? `0.5f` : -`0.5f`));
125	int zf = int(z * `32767.f` + (z >= `0.f` ? `0.5f` : -`0.5f`));
126	int wf = int(w * `32767.f` + `0.5f`);
127
128	int qc = data[i * `4` + `3`] & `3`;
129
130	// output order is dictated by input index
131	data[i * `4` + ((qc + `1`) & `3`)] = short(xf);
132	data[i * `4` + ((qc + `2`) & `3`)] = short(yf);
133	data[i * `4` + ((qc + `3`) & `3`)] = short(zf);
134	data[i * `4` + ((qc + `0`) & `3`)] = short(wf);
135	}
136	}
137
138	static void decodeFilterExp(unsigned int* data, size_t count)
139	{
140	for (size_t i = `0`; i < count; ++i)
141	{
142	unsigned int v = data[i];
143
144	// decode mantissa and exponent
145	int m = int(v << `8`) >> `8`;
146	int e = int(v) >> `24`;
147
148	union
149	{
150	float f;
151	unsigned int ui;
152	} u;
153
154	// optimized version of ldexp(float(m), e)
155	u.ui = unsigned(e + `127`) << `23`;
156	u.f = u.f * float(m);
157
158	data[i] = u.ui;
159	}
160	}
161	#endif
162
163	#if defined(SIMD_SSE) \|\| defined(SIMD_NEON) \|\| defined(SIMD_WASM)
164	template <typename T>
165	static void dispatchSimd(void (process)(T, size_t), T* data, size_t count, size_t stride)
166	{
167	assert(stride <= `4`);
168
169	size_t count4 = count & ~size_t(`3`);
170	process(data, count4);
171
172	if (count4 < count)
173	{
174	T tail[`4` * `4`] = {}; // max stride 4, max count 4
175	size_t tail_size = (count - count4) * stride * sizeof(T);
176	assert(tail_size <= sizeof(tail));
177
178	memcpy(tail, data + count4 * stride, tail_size);
179	process(tail, count - count4);
180	memcpy(data + count4 * stride, tail, tail_size);
181	}
182	}
183
184	inline uint64_t rotateleft64(uint64_t v, int x)
185	{
186	#if defined(_MSC_VER) && !defined(__clang__)
187	return _rotl64(v, x);
188	// Apple's Clang 8 is actually vanilla Clang 3.9, there we need to look for
189	// version 11 instead: https://en.wikipedia.org/wiki/Xcode#Toolchain_versions
190	#elif defined(__clang__) && ((!defined(__apple_build_version__) && __clang_major__ >= 8) \|\| __clang_major__ >= 11)
191	return __builtin_rotateleft64(v, x);
192	#else
193	return (v << (x & `63`)) \| (v >> ((`64` - x) & `63`));
194	#endif
195	}
196	#endif
197
198	#ifdef SIMD_SSE
199	static void decodeFilterOctSimd(signed char* data, size_t count)
200	{
201	const __m128 sign = _mm_set1_ps(-`0.f`);
202
203	for (size_t i = `0`; i < count; i += `4`)
204	{
205	__m128i n4 = _mm_loadu_si128(reinterpret_cast<__m128i>(&data[i `4`]));
206
207	// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
208	__m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, `24`), `24`);
209	__m128i yf = _mm_srai_epi32(_mm_slli_epi32(n4, `16`), `24`);
210
211	// unpack z; note that z is unsigned so we technically don't need to sign extend it
212	__m128i zf = _mm_srai_epi32(_mm_slli_epi32(n4, `8`), `24`);
213
214	// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
215	__m128 x = _mm_cvtepi32_ps(xf);
216	__m128 y = _mm_cvtepi32_ps(yf);
217	__m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y)));
218
219	// fixup octahedral coordinates for z<0
220	__m128 t = _mm_min_ps(z, _mm_setzero_ps());
221
222	x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign)));
223	y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign)));
224
225	// compute normal length & scale
226	__m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)));
227	__m128 s = _mm_mul_ps(_mm_set1_ps(`127.f`), _mm_rsqrt_ps(ll));
228
229	// rounded signed float->int
230	__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
231	__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
232	__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
233
234	// combine xr/yr/zr into final value
235	__m128i res = _mm_and_si128(n4, _mm_set1_epi32(`0xff000000`));
236	res = _mm_or_si128(res, _mm_and_si128(xr, _mm_set1_epi32(`0xff`)));
237	res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(yr, _mm_set1_epi32(`0xff`)), `8`));
238	res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(zr, _mm_set1_epi32(`0xff`)), `16`));
239
240	_mm_storeu_si128(reinterpret_cast<__m128i>(&data[i `4`]), res);
241	}
242	}
243
244	static void decodeFilterOctSimd(short* data, size_t count)
245	{
246	const __m128 sign = _mm_set1_ps(-`0.f`);
247
248	for (size_t i = `0`; i < count; i += `4`)
249	{
250	__m128 n4_0 = _mm_loadu_ps(reinterpret_cast<float>(&data[(i + `0`) `4`]));
251	__m128 n4_1 = _mm_loadu_ps(reinterpret_cast<float>(&data[(i + `2`) `4`]));
252
253	// gather both x/y 16-bit pairs in each 32-bit lane
254	__m128i n4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(`2`, `0`, `2`, `0`)));
255
256	// sign-extends each of x,y in [x y] with arithmetic shifts
257	__m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, `16`), `16`);
258	__m128i yf = _mm_srai_epi32(n4, `16`);
259
260	// unpack z; note that z is unsigned so we don't need to sign extend it
261	__m128i z4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(`3`, `1`, `3`, `1`)));
262	__m128i zf = _mm_and_si128(z4, _mm_set1_epi32(`0x7fff`));
263
264	// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
265	__m128 x = _mm_cvtepi32_ps(xf);
266	__m128 y = _mm_cvtepi32_ps(yf);
267	__m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y)));
268
269	// fixup octahedral coordinates for z<0
270	__m128 t = _mm_min_ps(z, _mm_setzero_ps());
271
272	x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign)));
273	y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign)));
274
275	// compute normal length & scale
276	__m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)));
277	__m128 s = _mm_div_ps(_mm_set1_ps(`32767.f`), _mm_sqrt_ps(ll));
278
279	// rounded signed float->int
280	__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
281	__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
282	__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
283
284	// mix x/z and y/0 to make 16-bit unpack easier
285	__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(`0xffff`)), _mm_slli_epi32(zr, `16`));
286	__m128i y0r = _mm_and_si128(yr, _mm_set1_epi32(`0xffff`));
287
288	// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
289	__m128i res_0 = _mm_unpacklo_epi16(xzr, y0r);
290	__m128i res_1 = _mm_unpackhi_epi16(xzr, y0r);
291
292	// patch in .w
293	res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), _mm_set1_epi64x(`0xffff000000000000`)));
294	res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), _mm_set1_epi64x(`0xffff000000000000`)));
295
296	_mm_storeu_si128(reinterpret_cast<__m128i>(&data[(i + `0`) `4`]), res_0);
297	_mm_storeu_si128(reinterpret_cast<__m128i>(&data[(i + `2`) `4`]), res_1);
298	}
299	}
300
301	static void decodeFilterQuatSimd(short* data, size_t count)
302	{
303	const float scale = `1.f` / sqrtf(`2.f`);
304
305	for (size_t i = `0`; i < count; i += `4`)
306	{
307	__m128 q4_0 = _mm_loadu_ps(reinterpret_cast<float>(&data[(i + `0`) `4`]));
308	__m128 q4_1 = _mm_loadu_ps(reinterpret_cast<float>(&data[(i + `2`) `4`]));
309
310	// gather both x/y 16-bit pairs in each 32-bit lane
311	__m128i q4_xy = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(`2`, `0`, `2`, `0`)));
312	__m128i q4_zc = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(`3`, `1`, `3`, `1`)));
313
314	// sign-extends each of x,y in [x y] with arithmetic shifts
315	__m128i xf = _mm_srai_epi32(_mm_slli_epi32(q4_xy, `16`), `16`);
316	__m128i yf = _mm_srai_epi32(q4_xy, `16`);
317	__m128i zf = _mm_srai_epi32(_mm_slli_epi32(q4_zc, `16`), `16`);
318	__m128i cf = _mm_srai_epi32(q4_zc, `16`);
319
320	// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
321	__m128i sf = _mm_or_si128(cf, _mm_set1_epi32(`3`));
322	__m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf));
323
324	// convert x/y/z to [-1..1] (scaled...)
325	__m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss);
326	__m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss);
327	__m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss);
328
329	// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
330	__m128 ww = _mm_sub_ps(_mm_set1_ps(`1.f`), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
331	__m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps()));
332
333	__m128 s = _mm_set1_ps(`32767.f`);
334
335	// rounded signed float->int
336	__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
337	__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
338	__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
339	__m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s));
340
341	// mix x/z and w/y to make 16-bit unpack easier
342	__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(`0xffff`)), _mm_slli_epi32(zr, `16`));
343	__m128i wyr = _mm_or_si128(_mm_and_si128(wr, _mm_set1_epi32(`0xffff`)), _mm_slli_epi32(yr, `16`));
344
345	// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
346	__m128i res_0 = _mm_unpacklo_epi16(wyr, xzr);
347	__m128i res_1 = _mm_unpackhi_epi16(wyr, xzr);
348
349	// store results to stack so that we can rotate using scalar instructions
350	uint64_t res[`4`];
351	_mm_storeu_si128(reinterpret_cast<__m128i*>(&res[`0`]), res_0);
352	_mm_storeu_si128(reinterpret_cast<__m128i*>(&res[`2`]), res_1);
353
354	// rotate and store
355	uint64_t* out = reinterpret_cast<uint64_t>(&data[i `4`]);
356
357	out[`0`] = rotateleft64(res[`0`], data[(i + `0`) * `4` + `3`] << `4`);
358	out[`1`] = rotateleft64(res[`1`], data[(i + `1`) * `4` + `3`] << `4`);
359	out[`2`] = rotateleft64(res[`2`], data[(i + `2`) * `4` + `3`] << `4`);
360	out[`3`] = rotateleft64(res[`3`], data[(i + `3`) * `4` + `3`] << `4`);
361	}
362	}
363
364	static void decodeFilterExpSimd(unsigned int* data, size_t count)
365	{
366	for (size_t i = `0`; i < count; i += `4`)
367	{
368	__m128i v = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i]));
369
370	// decode exponent into 2^x directly
371	__m128i ef = _mm_srai_epi32(v, `24`);
372	__m128i es = _mm_slli_epi32(_mm_add_epi32(ef, _mm_set1_epi32(`127`)), `23`);
373
374	// decode 24-bit mantissa into floating-point value
375	__m128i mf = _mm_srai_epi32(_mm_slli_epi32(v, `8`), `8`);
376	__m128 m = _mm_cvtepi32_ps(mf);
377
378	__m128 r = _mm_mul_ps(_mm_castsi128_ps(es), m);
379
380	_mm_storeu_ps(reinterpret_cast<float*>(&data[i]), r);
381	}
382	}
383	#endif
384
385	#if defined(SIMD_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
386	inline float32x4_t vsqrtq_f32(float32x4_t x)
387	{
388	float32x4_t r = vrsqrteq_f32(x);
389	r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r, x), r)); // refine rsqrt estimate
390	return vmulq_f32(r, x);
391	}
392
393	inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
394	{
395	float32x4_t r = vrecpeq_f32(y);
396	r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate
397	return vmulq_f32(x, r);
398	}
399	#endif
400
401	#ifdef SIMD_NEON
402	static void decodeFilterOctSimd(signed char* data, size_t count)
403	{
404	const int32x4_t sign = vdupq_n_s32(`0x80000000`);
405
406	for (size_t i = `0`; i < count; i += `4`)
407	{
408	int32x4_t n4 = vld1q_s32(reinterpret_cast<int32_t>(&data[i `4`]));
409
410	// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
411	int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, `24`), `24`);
412	int32x4_t yf = vshrq_n_s32(vshlq_n_s32(n4, `16`), `24`);
413
414	// unpack z; note that z is unsigned so we technically don't need to sign extend it
415	int32x4_t zf = vshrq_n_s32(vshlq_n_s32(n4, `8`), `24`);
416
417	// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
418	float32x4_t x = vcvtq_f32_s32(xf);
419	float32x4_t y = vcvtq_f32_s32(yf);
420	float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y)));
421
422	// fixup octahedral coordinates for z<0
423	float32x4_t t = vminq_f32(z, vdupq_n_f32(`0.f`));
424
425	x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign))));
426	y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
427
428	// compute normal length & scale
429	float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
430	float32x4_t rl = vrsqrteq_f32(ll);
431	float32x4_t s = vmulq_f32(vdupq_n_f32(`127.f`), rl);
432
433	// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
434	// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
435	const float32x4_t fsnap = vdupq_n_f32(`3` << `22`);
436
437	int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
438	int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
439	int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
440
441	// combine xr/yr/zr into final value
442	int32x4_t res = vandq_s32(n4, vdupq_n_s32(`0xff000000`));
443	res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(`0xff`)));
444	res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(`0xff`)), `8`));
445	res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(`0xff`)), `16`));
446
447	vst1q_s32(reinterpret_cast<int32_t>(&data[i `4`]), res);
448	}
449	}
450
451	static void decodeFilterOctSimd(short* data, size_t count)
452	{
453	const int32x4_t sign = vdupq_n_s32(`0x80000000`);
454
455	for (size_t i = `0`; i < count; i += `4`)
456	{
457	int32x4_t n4_0 = vld1q_s32(reinterpret_cast<int32_t>(&data[(i + `0`) `4`]));
458	int32x4_t n4_1 = vld1q_s32(reinterpret_cast<int32_t>(&data[(i + `2`) `4`]));
459
460	// gather both x/y 16-bit pairs in each 32-bit lane
461	int32x4_t n4 = vuzpq_s32(n4_0, n4_1).val[`0`];
462
463	// sign-extends each of x,y in [x y] with arithmetic shifts
464	int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, `16`), `16`);
465	int32x4_t yf = vshrq_n_s32(n4, `16`);
466
467	// unpack z; note that z is unsigned so we don't need to sign extend it
468	int32x4_t z4 = vuzpq_s32(n4_0, n4_1).val[`1`];
469	int32x4_t zf = vandq_s32(z4, vdupq_n_s32(`0x7fff`));
470
471	// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
472	float32x4_t x = vcvtq_f32_s32(xf);
473	float32x4_t y = vcvtq_f32_s32(yf);
474	float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y)));
475
476	// fixup octahedral coordinates for z<0
477	float32x4_t t = vminq_f32(z, vdupq_n_f32(`0.f`));
478
479	x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign))));
480	y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
481
482	// compute normal length & scale
483	float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
484	float32x4_t rl = vrsqrteq_f32(ll);
485	rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate
486	float32x4_t s = vmulq_f32(vdupq_n_f32(`32767.f`), rl);
487
488	// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
489	// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
490	const float32x4_t fsnap = vdupq_n_f32(`3` << `22`);
491
492	int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
493	int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
494	int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
495
496	// mix x/z and y/0 to make 16-bit unpack easier
497	int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(`0xffff`)), vshlq_n_s32(zr, `16`));
498	int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(`0xffff`));
499
500	// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
501	int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[`0`]);
502	int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[`1`]);
503
504	// patch in .w
505	res_0 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(`0xffff000000000000`)), n4_0, res_0);
506	res_1 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(`0xffff000000000000`)), n4_1, res_1);
507
508	vst1q_s32(reinterpret_cast<int32_t>(&data[(i + `0`) `4`]), res_0);
509	vst1q_s32(reinterpret_cast<int32_t>(&data[(i + `2`) `4`]), res_1);
510	}
511	}
512
513	static void decodeFilterQuatSimd(short* data, size_t count)
514	{
515	const float scale = `1.f` / sqrtf(`2.f`);
516
517	for (size_t i = `0`; i < count; i += `4`)
518	{
519	int32x4_t q4_0 = vld1q_s32(reinterpret_cast<int32_t>(&data[(i + `0`) `4`]));
520	int32x4_t q4_1 = vld1q_s32(reinterpret_cast<int32_t>(&data[(i + `2`) `4`]));
521
522	// gather both x/y 16-bit pairs in each 32-bit lane
523	int32x4_t q4_xy = vuzpq_s32(q4_0, q4_1).val[`0`];
524	int32x4_t q4_zc = vuzpq_s32(q4_0, q4_1).val[`1`];
525
526	// sign-extends each of x,y in [x y] with arithmetic shifts
527	int32x4_t xf = vshrq_n_s32(vshlq_n_s32(q4_xy, `16`), `16`);
528	int32x4_t yf = vshrq_n_s32(q4_xy, `16`);
529	int32x4_t zf = vshrq_n_s32(vshlq_n_s32(q4_zc, `16`), `16`);
530	int32x4_t cf = vshrq_n_s32(q4_zc, `16`);
531
532	// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
533	int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(`3`));
534	float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf));
535
536	// convert x/y/z to [-1..1] (scaled...)
537	float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss);
538	float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss);
539	float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss);
540
541	// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
542	float32x4_t ww = vsubq_f32(vdupq_n_f32(`1.f`), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
543	float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(`0.f`)));
544
545	float32x4_t s = vdupq_n_f32(`32767.f`);
546
547	// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
548	// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
549	const float32x4_t fsnap = vdupq_n_f32(`3` << `22`);
550
551	int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
552	int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
553	int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
554	int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap));
555
556	// mix x/z and w/y to make 16-bit unpack easier
557	int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(`0xffff`)), vshlq_n_s32(zr, `16`));
558	int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(`0xffff`)), vshlq_n_s32(yr, `16`));
559
560	// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
561	int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[`0`]);
562	int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[`1`]);
563
564	// rotate and store
565	uint64_t* out = (uint64_t)&data[i `4`];
566
567	out[`0`] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), `0`), vgetq_lane_s32(cf, `0`) << `4`);
568	out[`1`] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), `1`), vgetq_lane_s32(cf, `1`) << `4`);
569	out[`2`] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), `0`), vgetq_lane_s32(cf, `2`) << `4`);
570	out[`3`] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), `1`), vgetq_lane_s32(cf, `3`) << `4`);
571	}
572	}
573
574	static void decodeFilterExpSimd(unsigned int* data, size_t count)
575	{
576	for (size_t i = `0`; i < count; i += `4`)
577	{
578	int32x4_t v = vld1q_s32(reinterpret_cast<int32_t*>(&data[i]));
579
580	// decode exponent into 2^x directly
581	int32x4_t ef = vshrq_n_s32(v, `24`);
582	int32x4_t es = vshlq_n_s32(vaddq_s32(ef, vdupq_n_s32(`127`)), `23`);
583
584	// decode 24-bit mantissa into floating-point value
585	int32x4_t mf = vshrq_n_s32(vshlq_n_s32(v, `8`), `8`);
586	float32x4_t m = vcvtq_f32_s32(mf);
587
588	float32x4_t r = vmulq_f32(vreinterpretq_f32_s32(es), m);
589
590	vst1q_f32(reinterpret_cast<float*>(&data[i]), r);
591	}
592	}
593	#endif
594
595	#ifdef SIMD_WASM
596	static void decodeFilterOctSimd(signed char* data, size_t count)
597	{
598	const v128_t sign = wasm_f32x4_splat(-`0.f`);
599
600	for (size_t i = `0`; i < count; i += `4`)
601	{
602	v128_t n4 = wasm_v128_load(&data[i * `4`]);
603
604	// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
605	v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(n4, `24`), `24`);
606	v128_t yf = wasm_i32x4_shr(wasm_i32x4_shl(n4, `16`), `24`);
607
608	// unpack z; note that z is unsigned so we technically don't need to sign extend it
609	v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(n4, `8`), `24`);
610
611	// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
612	v128_t x = wasm_f32x4_convert_i32x4(xf);
613	v128_t y = wasm_f32x4_convert_i32x4(yf);
614	v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y)));
615
616	// fixup octahedral coordinates for z<0
617	// note: i32x4_min with 0 is equvalent to f32x4_min
618	v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(`0`));
619
620	x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign)));
621	y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign)));
622
623	// compute normal length & scale
624	v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)));
625	v128_t s = wasm_f32x4_div(wasm_f32x4_splat(`127.f`), wasm_f32x4_sqrt(ll));
626
627	// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
628	// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
629	const v128_t fsnap = wasm_f32x4_splat(`3` << `22`);
630
631	v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
632	v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
633	v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
634
635	// combine xr/yr/zr into final value
636	v128_t res = wasm_v128_and(n4, wasm_i32x4_splat(`0xff000000`));
637	res = wasm_v128_or(res, wasm_v128_and(xr, wasm_i32x4_splat(`0xff`)));
638	res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(yr, wasm_i32x4_splat(`0xff`)), `8`));
639	res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(zr, wasm_i32x4_splat(`0xff`)), `16`));
640
641	wasm_v128_store(&data[i * `4`], res);
642	}
643	}
644
645	static void decodeFilterOctSimd(short* data, size_t count)
646	{
647	const v128_t sign = wasm_f32x4_splat(-`0.f`);
648	const v128_t zmask = wasm_i32x4_splat(`0x7fff`);
649
650	for (size_t i = `0`; i < count; i += `4`)
651	{
652	v128_t n4_0 = wasm_v128_load(&data[(i + `0`) * `4`]);
653	v128_t n4_1 = wasm_v128_load(&data[(i + `2`) * `4`]);
654
655	// gather both x/y 16-bit pairs in each 32-bit lane
656	v128_t n4 = wasmx_unziplo_v32x4(n4_0, n4_1);
657
658	// sign-extends each of x,y in [x y] with arithmetic shifts
659	v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(n4, `16`), `16`);
660	v128_t yf = wasm_i32x4_shr(n4, `16`);
661
662	// unpack z; note that z is unsigned so we don't need to sign extend it
663	v128_t z4 = wasmx_unziphi_v32x4(n4_0, n4_1);
664	v128_t zf = wasm_v128_and(z4, zmask);
665
666	// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
667	v128_t x = wasm_f32x4_convert_i32x4(xf);
668	v128_t y = wasm_f32x4_convert_i32x4(yf);
669	v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y)));
670
671	// fixup octahedral coordinates for z<0
672	// note: i32x4_min with 0 is equvalent to f32x4_min
673	v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(`0`));
674
675	x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign)));
676	y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign)));
677
678	// compute normal length & scale
679	v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)));
680	v128_t s = wasm_f32x4_div(wasm_f32x4_splat(`32767.f`), wasm_f32x4_sqrt(ll));
681
682	// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
683	// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
684	const v128_t fsnap = wasm_f32x4_splat(`3` << `22`);
685
686	v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
687	v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
688	v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
689
690	// mix x/z and y/0 to make 16-bit unpack easier
691	v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(`0xffff`)), wasm_i32x4_shl(zr, `16`));
692	v128_t y0r = wasm_v128_and(yr, wasm_i32x4_splat(`0xffff`));
693
694	// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
695	v128_t res_0 = wasmx_unpacklo_v16x8(xzr, y0r);
696	v128_t res_1 = wasmx_unpackhi_v16x8(xzr, y0r);
697
698	// patch in .w
699	res_0 = wasm_v128_or(res_0, wasm_v128_and(n4_0, wasm_i64x2_splat(`0xffff000000000000`)));
700	res_1 = wasm_v128_or(res_1, wasm_v128_and(n4_1, wasm_i64x2_splat(`0xffff000000000000`)));
701
702	wasm_v128_store(&data[(i + `0`) * `4`], res_0);
703	wasm_v128_store(&data[(i + `2`) * `4`], res_1);
704	}
705	}
706
707	static void decodeFilterQuatSimd(short* data, size_t count)
708	{
709	const float scale = `1.f` / sqrtf(`2.f`);
710
711	for (size_t i = `0`; i < count; i += `4`)
712	{
713	v128_t q4_0 = wasm_v128_load(&data[(i + `0`) * `4`]);
714	v128_t q4_1 = wasm_v128_load(&data[(i + `2`) * `4`]);
715
716	// gather both x/y 16-bit pairs in each 32-bit lane
717	v128_t q4_xy = wasmx_unziplo_v32x4(q4_0, q4_1);
718	v128_t q4_zc = wasmx_unziphi_v32x4(q4_0, q4_1);
719
720	// sign-extends each of x,y in [x y] with arithmetic shifts
721	v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(q4_xy, `16`), `16`);
722	v128_t yf = wasm_i32x4_shr(q4_xy, `16`);
723	v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(q4_zc, `16`), `16`);
724	v128_t cf = wasm_i32x4_shr(q4_zc, `16`);
725
726	// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
727	v128_t sf = wasm_v128_or(cf, wasm_i32x4_splat(`3`));
728	v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), wasm_f32x4_convert_i32x4(sf));
729
730	// convert x/y/z to [-1..1] (scaled...)
731	v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), ss);
732	v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), ss);
733	v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss);
734
735	// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
736	// note: i32x4_max with 0 is equivalent to f32x4_max
737	v128_t ww = wasm_f32x4_sub(wasm_f32x4_splat(`1.f`), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
738	v128_t w = wasm_f32x4_sqrt(wasm_i32x4_max(ww, wasm_i32x4_splat(`0`)));
739
740	v128_t s = wasm_f32x4_splat(`32767.f`);
741
742	// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
743	// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
744	const v128_t fsnap = wasm_f32x4_splat(`3` << `22`);
745
746	v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
747	v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
748	v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
749	v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap);
750
751	// mix x/z and w/y to make 16-bit unpack easier
752	v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(`0xffff`)), wasm_i32x4_shl(zr, `16`));
753	v128_t wyr = wasm_v128_or(wasm_v128_and(wr, wasm_i32x4_splat(`0xffff`)), wasm_i32x4_shl(yr, `16`));
754
755	// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
756	v128_t res_0 = wasmx_unpacklo_v16x8(wyr, xzr);
757	v128_t res_1 = wasmx_unpackhi_v16x8(wyr, xzr);
758
759	// compute component index shifted left by 4 (and moved into i32x4 slot)
760	// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/emscripten-core/emscripten/issues/11449
761	volatile v128_t cm = wasm_i32x4_shl(cf, `4`);
762
763	// rotate and store
764	uint64_t* out = reinterpret_cast<uint64_t>(&data[i `4`]);
765
766	out[`0`] = rotateleft64(wasm_i64x2_extract_lane(res_0, `0`), wasm_i32x4_extract_lane(cm, `0`));
767	out[`1`] = rotateleft64(wasm_i64x2_extract_lane(res_0, `1`), wasm_i32x4_extract_lane(cm, `1`));
768	out[`2`] = rotateleft64(wasm_i64x2_extract_lane(res_1, `0`), wasm_i32x4_extract_lane(cm, `2`));
769	out[`3`] = rotateleft64(wasm_i64x2_extract_lane(res_1, `1`), wasm_i32x4_extract_lane(cm, `3`));
770	}
771	}
772
773	static void decodeFilterExpSimd(unsigned int* data, size_t count)
774	{
775	for (size_t i = `0`; i < count; i += `4`)
776	{
777	v128_t v = wasm_v128_load(&data[i]);
778
779	// decode exponent into 2^x directly
780	v128_t ef = wasm_i32x4_shr(v, `24`);
781	v128_t es = wasm_i32x4_shl(wasm_i32x4_add(ef, wasm_i32x4_splat(`127`)), `23`);
782
783	// decode 24-bit mantissa into floating-point value
784	v128_t mf = wasm_i32x4_shr(wasm_i32x4_shl(v, `8`), `8`);
785	v128_t m = wasm_f32x4_convert_i32x4(mf);
786
787	v128_t r = wasm_f32x4_mul(es, m);
788
789	wasm_v128_store(&data[i], r);
790	}
791	}
792	#endif
793
794	} // namespace meshopt
795
796	void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride)
797	{
798	using namespace meshopt;
799
800	assert(stride == `4` \|\| stride == `8`);
801
802	#if defined(SIMD_SSE) \|\| defined(SIMD_NEON) \|\| defined(SIMD_WASM)
803	if (stride == `4`)
804	dispatchSimd(decodeFilterOctSimd, static_cast<signed char*>(buffer), count, `4`);
805	else
806	dispatchSimd(decodeFilterOctSimd, static_cast<short*>(buffer), count, `4`);
807	#else
808	if (stride == `4`)
809	decodeFilterOct(static_cast<signed char*>(buffer), count);
810	else
811	decodeFilterOct(static_cast<short*>(buffer), count);
812	#endif
813	}
814
815	void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride)
816	{
817	using namespace meshopt;
818
819	assert(stride == `8`);
820	(void)stride;
821
822	#if defined(SIMD_SSE) \|\| defined(SIMD_NEON) \|\| defined(SIMD_WASM)
823	dispatchSimd(decodeFilterQuatSimd, static_cast<short*>(buffer), count, `4`);
824	#else
825	decodeFilterQuat(static_cast<short*>(buffer), count);
826	#endif
827	}
828
829	void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride)
830	{
831	using namespace meshopt;
832
833	assert(stride > `0` && stride % `4` == `0`);
834
835	#if defined(SIMD_SSE) \|\| defined(SIMD_NEON) \|\| defined(SIMD_WASM)
836	dispatchSimd(decodeFilterExpSimd, static_cast<unsigned int>(buffer), count (stride / `4`), `1`);
837	#else
838	decodeFilterExp(static_cast<unsigned int>(buffer), count (stride / `4`));
839	#endif
840	}
841
842	void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data)
843	{
844	assert(stride == `4` \|\| stride == `8`);
845	assert(bits >= `1` && bits <= `16`);
846
847	signed char* d8 = static_cast<signed char*>(destination);
848	short* d16 = static_cast<short*>(destination);
849
850	int bytebits = int(stride * `2`);
851
852	for (size_t i = `0`; i < count; ++i)
853	{
854	const float* n = &data[i * `4`];
855
856	// octahedral encoding of a unit vector
857	float nx = n[`0`], ny = n[`1`], nz = n[`2`], nw = n[`3`];
858	float nl = fabsf(nx) + fabsf(ny) + fabsf(nz);
859	float ns = nl == `0.f` ? `0.f` : `1.f` / nl;
860
861	nx *= ns;
862	ny *= ns;
863
864	float u = (nz >= `0.f`) ? nx : (`1` - fabsf(ny)) * (nx >= `0.f` ? `1.f` : -`1.f`);
865	float v = (nz >= `0.f`) ? ny : (`1` - fabsf(nx)) * (ny >= `0.f` ? `1.f` : -`1.f`);
866
867	int fu = meshopt_quantizeSnorm(u, bits);
868	int fv = meshopt_quantizeSnorm(v, bits);
869	int fo = meshopt_quantizeSnorm(`1.f`, bits);
870	int fw = meshopt_quantizeSnorm(nw, bytebits);
871
872	if (stride == `4`)
873	{
874	d8[i * `4` + `0`] = (signed char)(fu);
875	d8[i * `4` + `1`] = (signed char)(fv);
876	d8[i * `4` + `2`] = (signed char)(fo);
877	d8[i * `4` + `3`] = (signed char)(fw);
878	}
879	else
880	{
881	d16[i * `4` + `0`] = short(fu);
882	d16[i * `4` + `1`] = short(fv);
883	d16[i * `4` + `2`] = short(fo);
884	d16[i * `4` + `3`] = short(fw);
885	}
886	}
887	}
888
889	void meshopt_encodeFilterQuat(void* destination_, size_t count, size_t stride, int bits, const float* data)
890	{
891	assert(stride == `8`);
892	assert(bits >= `4` && bits <= `16`);
893	(void)stride;
894
895	short* destination = static_cast<short*>(destination_);
896
897	const float scaler = sqrtf(`2.f`);
898
899	for (size_t i = `0`; i < count; ++i)
900	{
901	const float* q = &data[i * `4`];
902	short* d = &destination[i * `4`];
903
904	// establish maximum quaternion component
905	int qc = `0`;
906	qc = fabsf(q[`1`]) > fabsf(q[qc]) ? `1` : qc;
907	qc = fabsf(q[`2`]) > fabsf(q[qc]) ? `2` : qc;
908	qc = fabsf(q[`3`]) > fabsf(q[qc]) ? `3` : qc;
909
910	// we use double-cover properties to discard the sign
911	float sign = q[qc] < `0.f` ? -`1.f` : `1.f`;
912
913	// note: we always encode a cyclical swizzle to be able to recover the order via rotation
914	d[`0`] = short(meshopt_quantizeSnorm(q[(qc + `1`) & `3`] * scaler * sign, bits));
915	d[`1`] = short(meshopt_quantizeSnorm(q[(qc + `2`) & `3`] * scaler * sign, bits));
916	d[`2`] = short(meshopt_quantizeSnorm(q[(qc + `3`) & `3`] * scaler * sign, bits));
917	d[`3`] = short((meshopt_quantizeSnorm(`1.f`, bits) & ~`3`) \| qc);
918	}
919	}
920
921	void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, int bits, const float* data)
922	{
923	assert(stride > `0` && stride % `4` == `0`);
924	assert(bits >= `1` && bits <= `24`);
925
926	unsigned int* destination = static_cast<unsigned int*>(destination_);
927	size_t stride_float = stride / sizeof(float);
928
929	for (size_t i = `0`; i < count; ++i)
930	{
931	const float* v = &data[i * stride_float];
932	unsigned int* d = &destination[i * stride_float];
933
934	// use maximum exponent to encode values; this guarantees that mantissa is [-1, 1]
935	int exp = -`100`;
936
937	for (size_t j = `0`; j < stride_float; ++j)
938	{
939	int e;
940	frexp(v[j], &e);
941
942	exp = (exp < e) ? e : exp;
943	}
944
945	// note that we additionally scale the mantissa to make it a K-bit signed integer (K-1 bits for magnitude)
946	exp -= (bits - `1`);
947
948	// compute renormalized rounded mantissa for each component
949	int mmask = (`1` << `24`) - `1`;
950
951	for (size_t j = `0`; j < stride_float; ++j)
952	{
953	int m = int(ldexp(v[j], -exp) + (v[j] >= `0` ? `0.5f` : -`0.5f`));
954
955	d[j] = (m & mmask) \| (unsigned(exp) << `24`);
956	}
957	}
958	}
959
960	#undef SIMD_SSE
961	#undef SIMD_NEON
962	#undef SIMD_WASM
963

Browse the source code of Godot/thirdparty/meshoptimizer/vertexfilter.cpp