// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"

#include <math.h>
#include <string.h>

// The block below auto-detects SIMD ISA that can be used on the target platform
#ifndef MESHOPTIMIZER_NO_SIMD

// The SIMD implementation requires SSE2, which can be enabled unconditionally through compiler settings
#if defined(__SSE2__)
#define SIMD_SSE
#endif

// MSVC supports compiling SSE2 code regardless of compile options; we assume all 32-bit CPUs support SSE2
#if !defined(SIMD_SSE) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
#define SIMD_SSE
#endif

// GCC/clang define these when NEON support is available
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define SIMD_NEON
#endif

// On MSVC, we assume that ARM builds always target NEON-capable devices
#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
#define SIMD_NEON
#endif

// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
#if defined(__wasm_simd128__)
#define SIMD_WASM
#endif

#endif // !MESHOPTIMIZER_NO_SIMD

#ifdef SIMD_SSE
#include <emmintrin.h>
#include <stdint.h>
#endif

#ifdef _MSC_VER
#include <intrin.h>
#endif

#ifdef SIMD_NEON
#if defined(_MSC_VER) && defined(_M_ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif

#ifdef SIMD_WASM
#undef __DEPRECATED
#include <wasm_simd128.h>
#endif

#ifdef SIMD_WASM
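// wasm_simd128.h exposes generic lane shuffles but (assumption) no dedicated interleave/deinterleave
// helpers, so we build SSE-style unpacks (16-bit interleave) and unzips (even/odd 32-bit lane gathers) here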
#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
#define wasmx_unziplo_v32x4(a, b) wasm_v32x4_shuffle(a, b, 0, 2, 4, 6)
#define wasmx_unziphi_v32x4(a, b) wasm_v32x4_shuffle(a, b, 1, 3, 5, 7)
#endif

namespace meshopt
{

#if !defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_WASM)
template <typename T>
static void decodeFilterOct(T* data, size_t count)
{
	const float max = float((1 << (sizeof(T) * 8 - 1)) - 1);
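	// e.g. 127.f when T is signed char and 32767.f when T is short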

	for (size_t i = 0; i < count; ++i)
	{
		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
		float x = float(data[i * 4 + 0]);
		float y = float(data[i * 4 + 1]);
		float z = float(data[i * 4 + 2]) - fabsf(x) - fabsf(y);

		// fixup octahedral coordinates for z<0
		float t = (z >= 0.f) ? 0.f : z;

		x += (x >= 0.f) ? t : -t;
		y += (y >= 0.f) ? t : -t;

		// compute normal length & scale
		float l = sqrtf(x * x + y * y + z * z);
		float s = max / l;

		// rounded signed float->int
		int xf = int(x * s + (x >= 0.f ? 0.5f : -0.5f));
		int yf = int(y * s + (y >= 0.f ? 0.5f : -0.5f));
		int zf = int(z * s + (z >= 0.f ? 0.5f : -0.5f));

		data[i * 4 + 0] = T(xf);
		data[i * 4 + 1] = T(yf);
		data[i * 4 + 2] = T(zf);
	}
}
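// Worked example (for illustration): an 8-bit encoding of the +Z unit normal is (0, 0, 127, w);
// decoding computes x=0, y=0, z=127-0-0=127, t=0, l=127, s=1, so the stored value stays (0, 0, 127, w),
// i.e. the unit vector (0, 0, 1) in snorm8.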

static void decodeFilterQuat(short* data, size_t count)
{
	const float scale = 1.f / sqrtf(2.f);

	for (size_t i = 0; i < count; ++i)
	{
		// recover the scale divisor from the w component with its bottom 2 bits set to 1 (which represents 1.f)
		int sf = data[i * 4 + 3] | 3;
		float ss = scale / float(sf);

		// convert x/y/z to [-1..1] (scaled...)
		float x = float(data[i * 4 + 0]) * ss;
		float y = float(data[i * 4 + 1]) * ss;
		float z = float(data[i * 4 + 2]) * ss;

		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
		float ww = 1.f - x * x - y * y - z * z;
		float w = sqrtf(ww >= 0.f ? ww : 0.f);

		// rounded signed float->int
		int xf = int(x * 32767.f + (x >= 0.f ? 0.5f : -0.5f));
		int yf = int(y * 32767.f + (y >= 0.f ? 0.5f : -0.5f));
		int zf = int(z * 32767.f + (z >= 0.f ? 0.5f : -0.5f));
		int wf = int(w * 32767.f + 0.5f);

		int qc = data[i * 4 + 3] & 3;

		// output order is dictated by input index
		data[i * 4 + ((qc + 1) & 3)] = short(xf);
		data[i * 4 + ((qc + 2) & 3)] = short(yf);
		data[i * 4 + ((qc + 3) & 3)] = short(zf);
		data[i * 4 + ((qc + 0) & 3)] = short(wf);
	}
}
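// Note: this inverts meshopt_encodeFilterQuat below, which stores the three quaternion components other
// than the largest one scaled by sqrt(2), with the index of the largest component in the low 2 bits of the last element.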

static void decodeFilterExp(unsigned int* data, size_t count)
{
	for (size_t i = 0; i < count; ++i)
	{
		unsigned int v = data[i];

		// decode mantissa and exponent
		int m = int(v << 8) >> 8;
		int e = int(v) >> 24;

		union
		{
			float f;
			unsigned int ui;
		} u;

		// optimized version of ldexp(float(m), e)
		u.ui = unsigned(e + 127) << 23;
		u.f = u.f * float(m);

		data[i] = u.ui;
	}
}
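// Worked example (for illustration): for an input word with mantissa m = 3 and exponent e = -2,
// u.ui = (125 << 23) is the bit pattern of 2^-2 = 0.25f, and 0.25f * 3.f = 0.75f = ldexp(3.f, -2).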
#endif

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
template <typename T>
static void dispatchSimd(void (*process)(T*, size_t), T* data, size_t count, size_t stride)
{
	assert(stride <= 4);

	size_t count4 = count & ~size_t(3);
	process(data, count4);

	if (count4 < count)
	{
		T tail[4 * 4] = {}; // max stride 4, max count 4
		size_t tail_size = (count - count4) * stride * sizeof(T);
		assert(tail_size <= sizeof(tail));

		memcpy(tail, data + count4 * stride, tail_size);
		process(tail, count - count4);
		memcpy(data + count4 * stride, tail, tail_size);
	}
}
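// The SIMD kernels below process 4 elements per loop iteration, so dispatchSimd stages any
// non-multiple-of-4 tail in a zero-padded stack buffer to avoid out-of-bounds vector accesses.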

inline uint64_t rotateleft64(uint64_t v, int x)
{
#if defined(_MSC_VER) && !defined(__clang__)
	return _rotl64(v, x);
// Apple's Clang 8 is actually vanilla Clang 3.9, there we need to look for
// version 11 instead: https://en.wikipedia.org/wiki/Xcode#Toolchain_versions
#elif defined(__clang__) && ((!defined(__apple_build_version__) && __clang_major__ >= 8) || __clang_major__ >= 11)
	return __builtin_rotateleft64(v, x);
#else
	return (v << (x & 63)) | (v >> ((64 - x) & 63));
#endif
}
#endif

#ifdef SIMD_SSE
static void decodeFilterOctSimd(signed char* data, size_t count)
{
	const __m128 sign = _mm_set1_ps(-0.f);

	for (size_t i = 0; i < count; i += 4)
	{
		__m128i n4 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i * 4]));

		// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, 24), 24);
		__m128i yf = _mm_srai_epi32(_mm_slli_epi32(n4, 16), 24);

		// unpack z; note that z is unsigned so we technically don't need to sign extend it
		__m128i zf = _mm_srai_epi32(_mm_slli_epi32(n4, 8), 24);

		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
		__m128 x = _mm_cvtepi32_ps(xf);
		__m128 y = _mm_cvtepi32_ps(yf);
		__m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y)));

		// fixup octahedral coordinates for z<0
		__m128 t = _mm_min_ps(z, _mm_setzero_ps());

		x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign)));
		y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign)));

		// compute normal length & scale
		__m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)));
		__m128 s = _mm_mul_ps(_mm_set1_ps(127.f), _mm_rsqrt_ps(ll));

		// rounded signed float->int
		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));

		// combine xr/yr/zr into final value
		__m128i res = _mm_and_si128(n4, _mm_set1_epi32(0xff000000));
		res = _mm_or_si128(res, _mm_and_si128(xr, _mm_set1_epi32(0xff)));
		res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(yr, _mm_set1_epi32(0xff)), 8));
		res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(zr, _mm_set1_epi32(0xff)), 16));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[i * 4]), res);
	}
}

static void decodeFilterOctSimd(short* data, size_t count)
{
	const __m128 sign = _mm_set1_ps(-0.f);

	for (size_t i = 0; i < count; i += 4)
	{
		__m128 n4_0 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 0) * 4]));
		__m128 n4_1 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 2) * 4]));

		// gather both x/y 16-bit pairs in each 32-bit lane
		__m128i n4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(2, 0, 2, 0)));

		// sign-extends each of x,y in [x y] with arithmetic shifts
		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, 16), 16);
		__m128i yf = _mm_srai_epi32(n4, 16);

		// unpack z; note that z is unsigned so we don't need to sign extend it
		__m128i z4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(3, 1, 3, 1)));
		__m128i zf = _mm_and_si128(z4, _mm_set1_epi32(0x7fff));

		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
		__m128 x = _mm_cvtepi32_ps(xf);
		__m128 y = _mm_cvtepi32_ps(yf);
		__m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y)));

		// fixup octahedral coordinates for z<0
		__m128 t = _mm_min_ps(z, _mm_setzero_ps());

		x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign)));
		y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign)));

		// compute normal length & scale
		__m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)));
		__m128 s = _mm_div_ps(_mm_set1_ps(32767.f), _mm_sqrt_ps(ll));

		// rounded signed float->int
		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));

		// mix x/z and y/0 to make 16-bit unpack easier
		__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
		__m128i y0r = _mm_and_si128(yr, _mm_set1_epi32(0xffff));

		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
		__m128i res_0 = _mm_unpacklo_epi16(xzr, y0r);
		__m128i res_1 = _mm_unpackhi_epi16(xzr, y0r);

		// patch in .w
		res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), _mm_set1_epi64x(0xffff000000000000)));
		res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), _mm_set1_epi64x(0xffff000000000000)));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
	}
}

static void decodeFilterQuatSimd(short* data, size_t count)
{
	const float scale = 1.f / sqrtf(2.f);

	for (size_t i = 0; i < count; i += 4)
	{
		__m128 q4_0 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 0) * 4]));
		__m128 q4_1 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 2) * 4]));

		// gather both x/y 16-bit pairs in each 32-bit lane
		__m128i q4_xy = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(2, 0, 2, 0)));
		__m128i q4_zc = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(3, 1, 3, 1)));

		// sign-extends each of x,y in [x y] with arithmetic shifts
		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(q4_xy, 16), 16);
		__m128i yf = _mm_srai_epi32(q4_xy, 16);
		__m128i zf = _mm_srai_epi32(_mm_slli_epi32(q4_zc, 16), 16);
		__m128i cf = _mm_srai_epi32(q4_zc, 16);

		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
		__m128i sf = _mm_or_si128(cf, _mm_set1_epi32(3));
		__m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf));

		// convert x/y/z to [-1..1] (scaled...)
		__m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss);
		__m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss);
		__m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss);

		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
		__m128 ww = _mm_sub_ps(_mm_set1_ps(1.f), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
		__m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps()));

		__m128 s = _mm_set1_ps(32767.f);

		// rounded signed float->int
		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
		__m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s));

		// mix x/z and w/y to make 16-bit unpack easier
		__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
		__m128i wyr = _mm_or_si128(_mm_and_si128(wr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(yr, 16));

		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
		__m128i res_0 = _mm_unpacklo_epi16(wyr, xzr);
		__m128i res_1 = _mm_unpackhi_epi16(wyr, xzr);

		// store results to stack so that we can rotate using scalar instructions
		uint64_t res[4];
		_mm_storeu_si128(reinterpret_cast<__m128i*>(&res[0]), res_0);
		_mm_storeu_si128(reinterpret_cast<__m128i*>(&res[2]), res_1);

		// rotate and store
		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);

		out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4);
		out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4);
		out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4);
		out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4);
	}
}

static void decodeFilterExpSimd(unsigned int* data, size_t count)
{
	for (size_t i = 0; i < count; i += 4)
	{
		__m128i v = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i]));

		// decode exponent into 2^x directly
		__m128i ef = _mm_srai_epi32(v, 24);
		__m128i es = _mm_slli_epi32(_mm_add_epi32(ef, _mm_set1_epi32(127)), 23);

		// decode 24-bit mantissa into floating-point value
		__m128i mf = _mm_srai_epi32(_mm_slli_epi32(v, 8), 8);
		__m128 m = _mm_cvtepi32_ps(mf);

		__m128 r = _mm_mul_ps(_mm_castsi128_ps(es), m);

		_mm_storeu_ps(reinterpret_cast<float*>(&data[i]), r);
	}
}
#endif

#if defined(SIMD_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
inline float32x4_t vsqrtq_f32(float32x4_t x)
{
	float32x4_t r = vrsqrteq_f32(x);
	r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r, x), r)); // refine rsqrt estimate
	return vmulq_f32(r, x);
}

inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
{
	float32x4_t r = vrecpeq_f32(y);
	r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate
	return vmulq_f32(x, r);
}
#endif

#ifdef SIMD_NEON
static void decodeFilterOctSimd(signed char* data, size_t count)
{
	const int32x4_t sign = vdupq_n_s32(0x80000000);

	for (size_t i = 0; i < count; i += 4)
	{
		int32x4_t n4 = vld1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]));

		// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, 24), 24);
		int32x4_t yf = vshrq_n_s32(vshlq_n_s32(n4, 16), 24);

		// unpack z; note that z is unsigned so we technically don't need to sign extend it
		int32x4_t zf = vshrq_n_s32(vshlq_n_s32(n4, 8), 24);

		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
		float32x4_t x = vcvtq_f32_s32(xf);
		float32x4_t y = vcvtq_f32_s32(yf);
		float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y)));

		// fixup octahedral coordinates for z<0
		float32x4_t t = vminq_f32(z, vdupq_n_f32(0.f));

		x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign))));
		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));

		// compute normal length & scale
		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
		float32x4_t rl = vrsqrteq_f32(ll);
		float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl);

		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
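		// example: with round-to-nearest, 5.7f + 12582912.f (= 3 << 22) becomes 12582918.f, whose low mantissa bits hold the integer 6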
		const float32x4_t fsnap = vdupq_n_f32(3 << 22);

		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));

		// combine xr/yr/zr into final value
		int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000));
		res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff)));
		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8));
		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16));

		vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
	}
}

static void decodeFilterOctSimd(short* data, size_t count)
{
	const int32x4_t sign = vdupq_n_s32(0x80000000);

	for (size_t i = 0; i < count; i += 4)
	{
		int32x4_t n4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]));
		int32x4_t n4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]));

		// gather both x/y 16-bit pairs in each 32-bit lane
		int32x4_t n4 = vuzpq_s32(n4_0, n4_1).val[0];

		// sign-extends each of x,y in [x y] with arithmetic shifts
		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, 16), 16);
		int32x4_t yf = vshrq_n_s32(n4, 16);

		// unpack z; note that z is unsigned so we don't need to sign extend it
		int32x4_t z4 = vuzpq_s32(n4_0, n4_1).val[1];
		int32x4_t zf = vandq_s32(z4, vdupq_n_s32(0x7fff));

		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
		float32x4_t x = vcvtq_f32_s32(xf);
		float32x4_t y = vcvtq_f32_s32(yf);
		float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y)));

		// fixup octahedral coordinates for z<0
		float32x4_t t = vminq_f32(z, vdupq_n_f32(0.f));

		x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign))));
		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));

		// compute normal length & scale
		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
		float32x4_t rl = vrsqrteq_f32(ll);
		rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate
		float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl);

		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
		const float32x4_t fsnap = vdupq_n_f32(3 << 22);

		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));

		// mix x/z and y/0 to make 16-bit unpack easier
		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
		int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff));

		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[0]);
		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[1]);

		// patch in .w
		res_0 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(0xffff000000000000)), n4_0, res_0);
		res_1 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(0xffff000000000000)), n4_1, res_1);

		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]), res_0);
		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]), res_1);
	}
}

static void decodeFilterQuatSimd(short* data, size_t count)
{
	const float scale = 1.f / sqrtf(2.f);

	for (size_t i = 0; i < count; i += 4)
	{
		int32x4_t q4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]));
		int32x4_t q4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]));

		// gather both x/y 16-bit pairs in each 32-bit lane
		int32x4_t q4_xy = vuzpq_s32(q4_0, q4_1).val[0];
		int32x4_t q4_zc = vuzpq_s32(q4_0, q4_1).val[1];

		// sign-extends each of x,y in [x y] with arithmetic shifts
		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(q4_xy, 16), 16);
		int32x4_t yf = vshrq_n_s32(q4_xy, 16);
		int32x4_t zf = vshrq_n_s32(vshlq_n_s32(q4_zc, 16), 16);
		int32x4_t cf = vshrq_n_s32(q4_zc, 16);

		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
		int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(3));
		float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf));

		// convert x/y/z to [-1..1] (scaled...)
		float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss);
		float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss);
		float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss);

		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
		float32x4_t ww = vsubq_f32(vdupq_n_f32(1.f), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
		float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f)));

		float32x4_t s = vdupq_n_f32(32767.f);

		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
		const float32x4_t fsnap = vdupq_n_f32(3 << 22);

		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
		int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap));

		// mix x/z and w/y to make 16-bit unpack easier
		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
		int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16));

		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);

		// rotate and store
		uint64_t* out = (uint64_t*)&data[i * 4];

		out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4);
		out[1] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4);
		out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 4);
		out[3] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4);
	}
}

static void decodeFilterExpSimd(unsigned int* data, size_t count)
{
	for (size_t i = 0; i < count; i += 4)
	{
		int32x4_t v = vld1q_s32(reinterpret_cast<int32_t*>(&data[i]));

		// decode exponent into 2^x directly
		int32x4_t ef = vshrq_n_s32(v, 24);
		int32x4_t es = vshlq_n_s32(vaddq_s32(ef, vdupq_n_s32(127)), 23);

		// decode 24-bit mantissa into floating-point value
		int32x4_t mf = vshrq_n_s32(vshlq_n_s32(v, 8), 8);
		float32x4_t m = vcvtq_f32_s32(mf);

		float32x4_t r = vmulq_f32(vreinterpretq_f32_s32(es), m);

		vst1q_f32(reinterpret_cast<float*>(&data[i]), r);
	}
}
#endif

#ifdef SIMD_WASM
static void decodeFilterOctSimd(signed char* data, size_t count)
{
	const v128_t sign = wasm_f32x4_splat(-0.f);

	for (size_t i = 0; i < count; i += 4)
	{
		v128_t n4 = wasm_v128_load(&data[i * 4]);

		// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
		v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 24), 24);
		v128_t yf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 16), 24);

		// unpack z; note that z is unsigned so we technically don't need to sign extend it
		v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 8), 24);

		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
		v128_t x = wasm_f32x4_convert_i32x4(xf);
		v128_t y = wasm_f32x4_convert_i32x4(yf);
		v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y)));

		// fixup octahedral coordinates for z<0
		// note: i32x4_min with 0 is equivalent to f32x4_min
		v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(0));

		x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign)));
		y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign)));

		// compute normal length & scale
		v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)));
		v128_t s = wasm_f32x4_div(wasm_f32x4_splat(127.f), wasm_f32x4_sqrt(ll));

		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
		const v128_t fsnap = wasm_f32x4_splat(3 << 22);

		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);

		// combine xr/yr/zr into final value
		v128_t res = wasm_v128_and(n4, wasm_i32x4_splat(0xff000000));
		res = wasm_v128_or(res, wasm_v128_and(xr, wasm_i32x4_splat(0xff)));
		res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(yr, wasm_i32x4_splat(0xff)), 8));
		res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(zr, wasm_i32x4_splat(0xff)), 16));

		wasm_v128_store(&data[i * 4], res);
	}
}

static void decodeFilterOctSimd(short* data, size_t count)
{
	const v128_t sign = wasm_f32x4_splat(-0.f);
	const v128_t zmask = wasm_i32x4_splat(0x7fff);

	for (size_t i = 0; i < count; i += 4)
	{
		v128_t n4_0 = wasm_v128_load(&data[(i + 0) * 4]);
		v128_t n4_1 = wasm_v128_load(&data[(i + 2) * 4]);

		// gather both x/y 16-bit pairs in each 32-bit lane
		v128_t n4 = wasmx_unziplo_v32x4(n4_0, n4_1);

		// sign-extends each of x,y in [x y] with arithmetic shifts
		v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 16), 16);
		v128_t yf = wasm_i32x4_shr(n4, 16);

		// unpack z; note that z is unsigned so we don't need to sign extend it
		v128_t z4 = wasmx_unziphi_v32x4(n4_0, n4_1);
		v128_t zf = wasm_v128_and(z4, zmask);

		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
		v128_t x = wasm_f32x4_convert_i32x4(xf);
		v128_t y = wasm_f32x4_convert_i32x4(yf);
		v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y)));

		// fixup octahedral coordinates for z<0
		// note: i32x4_min with 0 is equivalent to f32x4_min
		v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(0));

		x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign)));
		y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign)));

		// compute normal length & scale
		v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)));
		v128_t s = wasm_f32x4_div(wasm_f32x4_splat(32767.f), wasm_f32x4_sqrt(ll));

		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
		const v128_t fsnap = wasm_f32x4_splat(3 << 22);

		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);

		// mix x/z and y/0 to make 16-bit unpack easier
		v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
		v128_t y0r = wasm_v128_and(yr, wasm_i32x4_splat(0xffff));

		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
		v128_t res_0 = wasmx_unpacklo_v16x8(xzr, y0r);
		v128_t res_1 = wasmx_unpackhi_v16x8(xzr, y0r);

		// patch in .w
		res_0 = wasm_v128_or(res_0, wasm_v128_and(n4_0, wasm_i64x2_splat(0xffff000000000000)));
		res_1 = wasm_v128_or(res_1, wasm_v128_and(n4_1, wasm_i64x2_splat(0xffff000000000000)));

		wasm_v128_store(&data[(i + 0) * 4], res_0);
		wasm_v128_store(&data[(i + 2) * 4], res_1);
	}
}

static void decodeFilterQuatSimd(short* data, size_t count)
{
	const float scale = 1.f / sqrtf(2.f);

	for (size_t i = 0; i < count; i += 4)
	{
		v128_t q4_0 = wasm_v128_load(&data[(i + 0) * 4]);
		v128_t q4_1 = wasm_v128_load(&data[(i + 2) * 4]);

		// gather both x/y 16-bit pairs in each 32-bit lane
		v128_t q4_xy = wasmx_unziplo_v32x4(q4_0, q4_1);
		v128_t q4_zc = wasmx_unziphi_v32x4(q4_0, q4_1);

		// sign-extends each of x,y in [x y] with arithmetic shifts
		v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(q4_xy, 16), 16);
		v128_t yf = wasm_i32x4_shr(q4_xy, 16);
		v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(q4_zc, 16), 16);
		v128_t cf = wasm_i32x4_shr(q4_zc, 16);

		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
		v128_t sf = wasm_v128_or(cf, wasm_i32x4_splat(3));
		v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), wasm_f32x4_convert_i32x4(sf));

		// convert x/y/z to [-1..1] (scaled...)
		v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), ss);
		v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), ss);
		v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss);

		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
		// note: i32x4_max with 0 is equivalent to f32x4_max
		v128_t ww = wasm_f32x4_sub(wasm_f32x4_splat(1.f), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
		v128_t w = wasm_f32x4_sqrt(wasm_i32x4_max(ww, wasm_i32x4_splat(0)));

		v128_t s = wasm_f32x4_splat(32767.f);

		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
		const v128_t fsnap = wasm_f32x4_splat(3 << 22);

		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
		v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap);

		// mix x/z and w/y to make 16-bit unpack easier
		v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
		v128_t wyr = wasm_v128_or(wasm_v128_and(wr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(yr, 16));

		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
		v128_t res_0 = wasmx_unpacklo_v16x8(wyr, xzr);
		v128_t res_1 = wasmx_unpackhi_v16x8(wyr, xzr);

		// compute component index shifted left by 4 (and moved into i32x4 slot)
		// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/emscripten-core/emscripten/issues/11449
		volatile v128_t cm = wasm_i32x4_shl(cf, 4);

		// rotate and store
		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);

		out[0] = rotateleft64(wasm_i64x2_extract_lane(res_0, 0), wasm_i32x4_extract_lane(cm, 0));
		out[1] = rotateleft64(wasm_i64x2_extract_lane(res_0, 1), wasm_i32x4_extract_lane(cm, 1));
		out[2] = rotateleft64(wasm_i64x2_extract_lane(res_1, 0), wasm_i32x4_extract_lane(cm, 2));
		out[3] = rotateleft64(wasm_i64x2_extract_lane(res_1, 1), wasm_i32x4_extract_lane(cm, 3));
	}
}

static void decodeFilterExpSimd(unsigned int* data, size_t count)
{
	for (size_t i = 0; i < count; i += 4)
	{
		v128_t v = wasm_v128_load(&data[i]);

		// decode exponent into 2^x directly
		v128_t ef = wasm_i32x4_shr(v, 24);
		v128_t es = wasm_i32x4_shl(wasm_i32x4_add(ef, wasm_i32x4_splat(127)), 23);

		// decode 24-bit mantissa into floating-point value
		v128_t mf = wasm_i32x4_shr(wasm_i32x4_shl(v, 8), 8);
		v128_t m = wasm_f32x4_convert_i32x4(mf);

		v128_t r = wasm_f32x4_mul(es, m);

		wasm_v128_store(&data[i], r);
	}
}
#endif

} // namespace meshopt

void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride)
{
	using namespace meshopt;

	assert(stride == 4 || stride == 8);

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
	if (stride == 4)
		dispatchSimd(decodeFilterOctSimd, static_cast<signed char*>(buffer), count, 4);
	else
		dispatchSimd(decodeFilterOctSimd, static_cast<short*>(buffer), count, 4);
#else
	if (stride == 4)
		decodeFilterOct(static_cast<signed char*>(buffer), count);
	else
		decodeFilterOct(static_cast<short*>(buffer), count);
#endif
}
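// Usage sketch (for illustration; variable names are hypothetical): decode a compressed vertex
// stream first, then undo the filter in place:
//   meshopt_decodeVertexBuffer(normals, count, 4, encoded, encoded_size);
//   meshopt_decodeFilterOct(normals, count, 4); // normals now holds count snorm8 x/y/z/w quadruples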

void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride)
{
	using namespace meshopt;

	assert(stride == 8);
	(void)stride;

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
	dispatchSimd(decodeFilterQuatSimd, static_cast<short*>(buffer), count, 4);
#else
	decodeFilterQuat(static_cast<short*>(buffer), count);
#endif
}

void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride)
{
	using namespace meshopt;

	assert(stride > 0 && stride % 4 == 0);

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
	dispatchSimd(decodeFilterExpSimd, static_cast<unsigned int*>(buffer), count * (stride / 4), 1);
#else
	decodeFilterExp(static_cast<unsigned int*>(buffer), count * (stride / 4));
#endif
}

void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data)
{
	assert(stride == 4 || stride == 8);
	assert(bits >= 1 && bits <= 16);

	signed char* d8 = static_cast<signed char*>(destination);
	short* d16 = static_cast<short*>(destination);

	int bytebits = int(stride * 2);
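	// 8 for byte output, 16 for short output; the w component below is quantized at full output precision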

	for (size_t i = 0; i < count; ++i)
	{
		const float* n = &data[i * 4];

		// octahedral encoding of a unit vector
		float nx = n[0], ny = n[1], nz = n[2], nw = n[3];
		float nl = fabsf(nx) + fabsf(ny) + fabsf(nz);
		float ns = nl == 0.f ? 0.f : 1.f / nl;

		nx *= ns;
		ny *= ns;

		float u = (nz >= 0.f) ? nx : (1 - fabsf(ny)) * (nx >= 0.f ? 1.f : -1.f);
		float v = (nz >= 0.f) ? ny : (1 - fabsf(nx)) * (ny >= 0.f ? 1.f : -1.f);

		int fu = meshopt_quantizeSnorm(u, bits);
		int fv = meshopt_quantizeSnorm(v, bits);
		int fo = meshopt_quantizeSnorm(1.f, bits);
		int fw = meshopt_quantizeSnorm(nw, bytebits);

		if (stride == 4)
		{
			d8[i * 4 + 0] = (signed char)(fu);
			d8[i * 4 + 1] = (signed char)(fv);
			d8[i * 4 + 2] = (signed char)(fo);
			d8[i * 4 + 3] = (signed char)(fw);
		}
		else
		{
			d16[i * 4 + 0] = short(fu);
			d16[i * 4 + 1] = short(fv);
			d16[i * 4 + 2] = short(fo);
			d16[i * 4 + 3] = short(fw);
		}
	}
}
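// Usage sketch (for illustration; variable names are hypothetical): quantize unit normals stored as
// 4 floats each into 8-bit octahedral form, ready to be compressed with meshopt_encodeVertexBuffer:
//   meshopt_encodeFilterOct(encoded_normals, count, 4, 8, normals);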

void meshopt_encodeFilterQuat(void* destination_, size_t count, size_t stride, int bits, const float* data)
{
	assert(stride == 8);
	assert(bits >= 4 && bits <= 16);
	(void)stride;

	short* destination = static_cast<short*>(destination_);

	const float scaler = sqrtf(2.f);

	for (size_t i = 0; i < count; ++i)
	{
		const float* q = &data[i * 4];
		short* d = &destination[i * 4];

		// establish maximum quaternion component
		int qc = 0;
		qc = fabsf(q[1]) > fabsf(q[qc]) ? 1 : qc;
		qc = fabsf(q[2]) > fabsf(q[qc]) ? 2 : qc;
		qc = fabsf(q[3]) > fabsf(q[qc]) ? 3 : qc;

		// we use double-cover properties to discard the sign
		float sign = q[qc] < 0.f ? -1.f : 1.f;

		// note: we always encode a cyclical swizzle to be able to recover the order via rotation
		d[0] = short(meshopt_quantizeSnorm(q[(qc + 1) & 3] * scaler * sign, bits));
		d[1] = short(meshopt_quantizeSnorm(q[(qc + 2) & 3] * scaler * sign, bits));
		d[2] = short(meshopt_quantizeSnorm(q[(qc + 3) & 3] * scaler * sign, bits));
		d[3] = short((meshopt_quantizeSnorm(1.f, bits) & ~3) | qc);
	}
}

void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, int bits, const float* data)
{
	assert(stride > 0 && stride % 4 == 0);
	assert(bits >= 1 && bits <= 24);

	unsigned int* destination = static_cast<unsigned int*>(destination_);
	size_t stride_float = stride / sizeof(float);

	for (size_t i = 0; i < count; ++i)
	{
		const float* v = &data[i * stride_float];
		unsigned int* d = &destination[i * stride_float];

		// use maximum exponent to encode values; this guarantees that mantissa is [-1, 1]
		int exp = -100;

		for (size_t j = 0; j < stride_float; ++j)
		{
			int e;
			frexp(v[j], &e);

			exp = (exp < e) ? e : exp;
		}

		// note that we additionally scale the mantissa to make it a K-bit signed integer (K-1 bits for magnitude)
		exp -= (bits - 1);

		// compute renormalized rounded mantissa for each component
		int mmask = (1 << 24) - 1;

		for (size_t j = 0; j < stride_float; ++j)
		{
			int m = int(ldexp(v[j], -exp) + (v[j] >= 0 ? 0.5f : -0.5f));

			d[j] = (m & mmask) | (unsigned(exp) << 24);
		}
	}
}
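// Worked example (for illustration): encoding a single float 1.5f with bits=16 gives exp = 1 - 15 = -14
// and mantissa m = int(ldexp(1.5f, 14) + 0.5f) = 24576, stored as (24576 & 0xffffff) | (unsigned(-14) << 24);
// decodeFilterExp then reconstructs 24576 * 2^-14 = 1.5f.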

#undef SIMD_SSE
#undef SIMD_NEON
#undef SIMD_WASM