// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"

#include <assert.h>
#include <string.h>

// The block below auto-detects SIMD ISA that can be used on the target platform
#ifndef MESHOPTIMIZER_NO_SIMD

// The SIMD implementation requires SSSE3, which can be enabled unconditionally through compiler settings
#if defined(__AVX__) || defined(__SSSE3__)
#define SIMD_SSE
#endif

// An experimental implementation using AVX512 instructions; it's only enabled when AVX512 is enabled through compiler settings
#if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__)
#undef SIMD_SSE
#define SIMD_AVX
#endif

// MSVC supports compiling SSSE3 code regardless of compile options; we use a cpuid-based scalar fallback
#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
#define SIMD_SSE
#define SIMD_FALLBACK
#endif

// GCC 4.9+ and clang 3.8+ support targeting SIMD ISA from individual functions; we use a cpuid-based scalar fallback
#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && ((defined(__clang__) && __clang_major__ * 100 + __clang_minor__ >= 308) || (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 409)) && (defined(__i386__) || defined(__x86_64__))
#define SIMD_SSE
#define SIMD_FALLBACK
#define SIMD_TARGET __attribute__((target("ssse3")))
#endif

// GCC/clang define these when NEON support is available
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define SIMD_NEON
#endif

// On MSVC, we assume that ARM builds always target NEON-capable devices
#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
#define SIMD_NEON
#endif

// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
#if defined(__wasm_simd128__)
#define SIMD_WASM
#endif

#ifndef SIMD_TARGET
#define SIMD_TARGET
#endif

// When targeting AArch64/x64, optimize for latency to allow decoding of individual 16-byte groups to overlap
// We don't do this for 32-bit systems because we need 64-bit math for this and this will hurt in-order CPUs
#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
#define SIMD_LATENCYOPT
#endif

#endif // !MESHOPTIMIZER_NO_SIMD

#ifdef SIMD_SSE
#include <tmmintrin.h>
#endif

#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
#ifdef _MSC_VER
#include <intrin.h> // __cpuid
#else
#include <cpuid.h> // __cpuid
#endif
#endif

#ifdef SIMD_AVX
#include <immintrin.h>
#endif

#ifdef SIMD_NEON
#if defined(_MSC_VER) && defined(_M_ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif

#ifdef SIMD_WASM
#undef __DEPRECATED
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
#include <wasm_simd128.h>
#endif

#ifdef SIMD_WASM
#define wasmx_splat_v32x4(v, i) wasm_v32x4_shuffle(v, v, i, i, i, i)
#define wasmx_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
#define wasmx_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
#define wasmx_unpacklo_v64x2(a, b) wasm_v64x2_shuffle(a, b, 0, 2)
#define wasmx_unpackhi_v64x2(a, b) wasm_v64x2_shuffle(a, b, 1, 3)
#endif

namespace meshopt
{

const unsigned char kVertexHeader = 0xa0;

static int gEncodeVertexVersion = 0;

const size_t kVertexBlockSizeBytes = 8192;
const size_t kVertexBlockMaxSize = 256;
const size_t kByteGroupSize = 16;
const size_t kByteGroupDecodeLimit = 24;
const size_t kTailMaxSize = 32;
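
// kByteGroupDecodeLimit is the worst-case encoded size of one byte group: in 4-bit mode a group
// occupies 8 packed bytes plus up to 16 exception bytes, so decoders can bounds-check each group
// with a single <=24 byte comparison; the stream tail is padded to at least kTailMaxSize bytes,
// which is what keeps these per-group checks sufficient (see the tail write in meshopt_encodeVertexBuffer).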

static size_t getVertexBlockSize(size_t vertex_size)
{
	// make sure the entire block fits into the scratch buffer
	size_t result = kVertexBlockSizeBytes / vertex_size;

	// align to byte group size; we encode each byte as a byte group
	// if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
	result &= ~(kByteGroupSize - 1);

	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
}

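// zigzag encoding maps a signed byte delta to an unsigned code so that small deltas of either sign
// become small values: 0, -1, 1, -2, 2, ... map to 0, 1, 2, 3, 4, ...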
inline unsigned char zigzag8(unsigned char v)
{
	return ((signed char)(v) >> 7) ^ (v << 1);
}

inline unsigned char unzigzag8(unsigned char v)
{
	return -(v & 1) ^ (v >> 1);
}

static bool encodeBytesGroupZero(const unsigned char* buffer)
{
	for (size_t i = 0; i < kByteGroupSize; ++i)
		if (buffer[i])
			return false;

	return true;
}

static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
{
	assert(bits >= 1 && bits <= 8);

	if (bits == 1)
		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);

	if (bits == 8)
		return kByteGroupSize;

	size_t result = kByteGroupSize * bits / 8;

	unsigned char sentinel = (1 << bits) - 1;

	for (size_t i = 0; i < kByteGroupSize; ++i)
		result += buffer[i] >= sentinel;

	return result;
}

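// Encoded group layout for bits=2/4: a fixed section of kByteGroupSize * bits / 8 packed control
// bytes stores one bits-wide code per input byte (most significant bits first); any input greater
// than or equal to the all-ones sentinel is coded as the sentinel and its original byte is appended
// verbatim afterwards. E.g. with bits=4, values 1, 7, 20, 3 pack as nibbles 1, 7, F, 3 and the
// out-of-range value 20 follows as a full byte.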
static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
{
	assert(bits >= 1 && bits <= 8);

	if (bits == 1)
		return data;

	if (bits == 8)
	{
		memcpy(data, buffer, kByteGroupSize);
		return data + kByteGroupSize;
	}

	size_t byte_size = 8 / bits;
	assert(kByteGroupSize % byte_size == 0);

	// fixed portion: bits bits for each value
	// variable portion: full byte for each out-of-range value (using 1...1 as sentinel)
	unsigned char sentinel = (1 << bits) - 1;

	for (size_t i = 0; i < kByteGroupSize; i += byte_size)
	{
		unsigned char byte = 0;

		for (size_t k = 0; k < byte_size; ++k)
		{
			unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k];

			byte <<= bits;
			byte |= enc;
		}

		*data++ = byte;
	}

	for (size_t i = 0; i < kByteGroupSize; ++i)
	{
		if (buffer[i] >= sentinel)
		{
			*data++ = buffer[i];
		}
	}

	return data;
}

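// encodeBytes compresses the buffer one kByteGroupSize group at a time; each group gets a 2-bit mode
// code (0 = all zeroes, 1 = 2-bit, 2 = 4-bit, 3 = raw bytes) and four codes are packed per header
// byte, so the header takes ceil(group count / 4) bytes and precedes all group payloads.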
static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);

	unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return 0;

	data += header_size;

	memset(header, 0, header_size);

	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kByteGroupDecodeLimit)
			return 0;

		int best_bits = 8;
		size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);

		for (int bits = 1; bits < 8; bits *= 2)
		{
			size_t size = encodeBytesGroupMeasure(buffer + i, bits);

			if (size < best_size)
			{
				best_bits = bits;
				best_size = size;
			}
		}

		int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2) ? 1 : (best_bits == 4) ? 2 : 3;
		assert((1 << bitslog2) == best_bits);

		size_t header_offset = i / kByteGroupSize;

		header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2);

		unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);

		assert(data + best_size == next);
		data = next;
	}

	return data;
}

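// Vertex blocks are encoded one byte lane at a time: for byte k of every vertex we compute the
// difference with the same byte of the previous vertex, zigzag-encode it, and compress the
// resulting byte stream (rounded up to whole groups) with encodeBytes.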
static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize];
	assert(sizeof(buffer) % kByteGroupSize == 0);

	// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
	memset(buffer, 0, sizeof(buffer));

	for (size_t k = 0; k < vertex_size; ++k)
	{
		size_t vertex_offset = k;

		unsigned char p = last_vertex[k];

		for (size_t i = 0; i < vertex_count; ++i)
		{
			buffer[i] = zigzag8(vertex_data[vertex_offset] - p);

			p = vertex_data[vertex_offset];

			vertex_offset += vertex_size;
		}

		data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1));
		if (!data)
			return 0;
	}

	memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}

#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM))
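// Scalar group decoder: READ fetches the next packed control byte, and NEXT extracts one bits-wide
// code from its high bits; a code equal to the all-ones sentinel is replaced by the next byte from
// the variable-length section (data_var), which starts right after the packed control bytes.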
static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
#define READ() byte = *data++
#define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)

	unsigned char byte, enc, encv;
	const unsigned char* data_var;

	switch (bitslog2)
	{
	case 0:
		memset(buffer, 0, kByteGroupSize);
		return data;
	case 1:
		data_var = data + 4;

		// 4 groups with 4 2-bit values in each byte
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);

		return data_var;
	case 2:
		data_var = data + 8;

		// 8 groups with 2 4-bit values in each byte
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);

		return data_var;
	case 3:
		memcpy(buffer, data, kByteGroupSize);
		return data + kByteGroupSize;
	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}

#undef READ
#undef NEXT
}

static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);

	const unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return 0;

	data += header_size;

	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kByteGroupDecodeLimit)
			return 0;

		size_t header_offset = i / kByteGroupSize;

		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;

		data = decodeBytesGroup(data, buffer + i, bitslog2);
	}

	return data;
}

static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize];
	unsigned char transposed[kVertexBlockSizeBytes];

	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

	for (size_t k = 0; k < vertex_size; ++k)
	{
		data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
		if (!data)
			return 0;

		size_t vertex_offset = k;

		unsigned char p = last_vertex[k];

		for (size_t i = 0; i < vertex_count; ++i)
		{
			unsigned char v = unzigzag8(buffer[i]) + p;

			transposed[vertex_offset] = v;
			p = v;

			vertex_offset += vertex_size;
		}
	}

	memcpy(vertex_data, transposed, vertex_count * vertex_size);

	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
#endif

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
static unsigned char kDecodeBytesGroupShuffle[256][8];
static unsigned char kDecodeBytesGroupCount[256];

#ifdef __wasm__
__attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop!
#endif
static bool
decodeBytesGroupBuildTables()
{
	for (int mask = 0; mask < 256; ++mask)
	{
		unsigned char shuffle[8];
		unsigned char count = 0;

		for (int i = 0; i < 8; ++i)
		{
			int maski = (mask >> i) & 1;
			shuffle[i] = maski ? count : 0x80;
			count += (unsigned char)(maski);
		}

		memcpy(kDecodeBytesGroupShuffle[mask], shuffle, 8);
		kDecodeBytesGroupCount[mask] = count;
	}

	return true;
}

static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
#endif

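// The shuffle-table based SIMD decoders (SSE/NEON/WASM) rebuild a 16-byte group in two steps:
// unpack the packed codes into one byte per lane and compare against the sentinel to get an
// exception mask, then use kDecodeBytesGroupShuffle (indexed by each 8-bit half of the mask) to
// scatter the trailing exception bytes into the sentinel lanes; kDecodeBytesGroupCount tells how
// many exception bytes each half consumed so the data pointer can be advanced.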
#ifdef SIMD_SSE
SIMD_TARGET
static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
	__m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
	__m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
	__m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]);

	__m128i sm1r = _mm_add_epi8(sm1, sm1off);

	return _mm_unpacklo_epi64(sm0, sm1r);
}

SIMD_TARGET
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		__m128i result = _mm_setzero_si128();

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data;
	}

	case 1:
	{
#ifdef __GNUC__
		typedef int __attribute__((aligned(1))) unaligned_int;
#else
		typedef int unaligned_int;
#endif

#ifdef SIMD_LATENCYOPT
		unsigned int data32;
		memcpy(&data32, data, 4);
		data32 &= data32 >> 1;

		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);

		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif

		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));

		__m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2);
		__m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22);
		__m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3));

		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3));
		int mask16 = _mm_movemask_epi8(mask);
		unsigned char mask0 = (unsigned char)(mask16 & 255);
		unsigned char mask1 = (unsigned char)(mask16 >> 8);

		__m128i shuf = decodeShuffleMask(mask0, mask1);

		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

#ifdef SIMD_LATENCYOPT
		return data + 4 + datacnt;
#else
		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
	}

	case 2:
	{
#ifdef SIMD_LATENCYOPT
		unsigned long long data64;
		memcpy(&data64, data, 8);
		data64 &= data64 >> 1;
		data64 &= data64 >> 2;

		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif

		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));

		__m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4);
		__m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15));

		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15));
		int mask16 = _mm_movemask_epi8(mask);
		unsigned char mask0 = (unsigned char)(mask16 & 255);
		unsigned char mask1 = (unsigned char)(mask16 >> 8);

		__m128i shuf = decodeShuffleMask(mask0, mask1);

		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

#ifdef SIMD_LATENCYOPT
		return data + 8 + datacnt;
#else
		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
	}

	case 3:
	{
		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#ifdef SIMD_AVX
static const __m128i decodeBytesGroupConfig[] = {
    _mm_set1_epi8(3),
    _mm_set1_epi8(15),
    _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24),
    _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56),
};

static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		__m128i result = _mm_setzero_si128();

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data;
	}

	case 1:
	case 2:
	{
		const unsigned char* skip = data + (bitslog2 << 2);

		__m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));

		__m128i sent = decodeBytesGroupConfig[bitslog2 - 1];
		__m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1];

		__m128i selw = _mm_shuffle_epi32(selb, 0x44);
		__m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
		__mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ);

		__m128i result = _mm_mask_expand_epi8(sel, mask16, rest);

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return skip + _mm_popcnt_u32(mask16);
	}

	case 3:
	{
		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#ifdef SIMD_NEON
static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
{
	uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
	uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);

	uint8x8_t r0 = vtbl1_u8(rest0, sm0);
	uint8x8_t r1 = vtbl1_u8(rest1, sm1);

	return vcombine_u8(r0, r1);
}

static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
{
	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
	const uint64_t magic = 0x000103070f1f3f80ull;
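	// multiplying a 64-bit lane whose bytes are each 0x00 or 0xff by this constant gathers one bit
	// per byte into the top byte of the product, so the shift by 56 below produces a movemask-style
	// value with bit i set iff byte i of the lane was 0xff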

	uint64x2_t mask2 = vreinterpretq_u64_u8(mask);

	mask0 = uint8_t((vgetq_lane_u64(mask2, 0) * magic) >> 56);
	mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
}

static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		uint8x16_t result = vdupq_n_u8(0);

		vst1q_u8(buffer, result);

		return data;
	}

	case 1:
	{
#ifdef SIMD_LATENCYOPT
		unsigned int data32;
		memcpy(&data32, data, 4);
		data32 &= data32 >> 1;

		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);

		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif

		uint8x8_t sel2 = vld1_u8(data);
		uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
		uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
		uint8x16_t sel = vandq_u8(vcombine_u8(sel2222.val[0], sel2222.val[1]), vdupq_n_u8(3));

		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(3));
		unsigned char mask0, mask1;
		neonMoveMask(mask, mask0, mask1);

		uint8x8_t rest0 = vld1_u8(data + 4);
		uint8x8_t rest1 = vld1_u8(data + 4 + kDecodeBytesGroupCount[mask0]);

		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);

		vst1q_u8(buffer, result);

#ifdef SIMD_LATENCYOPT
		return data + 4 + datacnt;
#else
		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
	}

	case 2:
	{
#ifdef SIMD_LATENCYOPT
		unsigned long long data64;
		memcpy(&data64, data, 8);
		data64 &= data64 >> 1;
		data64 &= data64 >> 2;

		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif

		uint8x8_t sel4 = vld1_u8(data);
		uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
		uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);

		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(15));
		unsigned char mask0, mask1;
		neonMoveMask(mask, mask0, mask1);

		uint8x8_t rest0 = vld1_u8(data + 8);
		uint8x8_t rest1 = vld1_u8(data + 8 + kDecodeBytesGroupCount[mask0]);

		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);

		vst1q_u8(buffer, result);

#ifdef SIMD_LATENCYOPT
		return data + 8 + datacnt;
#else
		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
	}

	case 3:
	{
		uint8x16_t result = vld1q_u8(data);

		vst1q_u8(buffer, result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#ifdef SIMD_WASM
SIMD_TARGET
static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
	v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]);
	v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);

	v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]);
	sm1off = wasm_v8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

	v128_t sm1r = wasm_i8x16_add(sm1, sm1off);

	return wasmx_unpacklo_v64x2(sm0, sm1r);
}

SIMD_TARGET
static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
{
	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
	const uint64_t magic = 0x000103070f1f3f80ull;

	mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56);
	mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56);
}

SIMD_TARGET
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		v128_t result = wasm_i8x16_splat(0);

		wasm_v128_store(buffer, result);

		return data;
	}

	case 1:
	{
		v128_t sel2 = wasm_v128_load(data);
		v128_t rest = wasm_v128_load(data + 4);

		v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
		v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
		v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3));

		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3));

		unsigned char mask0, mask1;
		wasmMoveMask(mask, mask0, mask1);

		v128_t shuf = decodeShuffleMask(mask0, mask1);

		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);

		wasm_v128_store(buffer, result);

		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 2:
	{
		v128_t sel4 = wasm_v128_load(data);
		v128_t rest = wasm_v128_load(data + 8);

		v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
		v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15));

		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15));

		unsigned char mask0, mask1;
		wasmMoveMask(mask, mask0, mask1);

		v128_t shuf = decodeShuffleMask(mask0, mask1);

		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);

		wasm_v128_store(buffer, result);

		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 3:
	{
		v128_t result = wasm_v128_load(data);

		wasm_v128_store(buffer, result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#if defined(SIMD_SSE) || defined(SIMD_AVX)
SIMD_TARGET
static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
{
	__m128i t0 = _mm_unpacklo_epi8(x0, x1);
	__m128i t1 = _mm_unpackhi_epi8(x0, x1);
	__m128i t2 = _mm_unpacklo_epi8(x2, x3);
	__m128i t3 = _mm_unpackhi_epi8(x2, x3);

	x0 = _mm_unpacklo_epi16(t0, t2);
	x1 = _mm_unpackhi_epi16(t0, t2);
	x2 = _mm_unpacklo_epi16(t1, t3);
	x3 = _mm_unpackhi_epi16(t1, t3);
}

SIMD_TARGET
static __m128i unzigzag8(__m128i v)
{
	__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
	__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));

	return _mm_xor_si128(xl, xr);
}
#endif

#ifdef SIMD_NEON
static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
{
	uint8x16x2_t t01 = vzipq_u8(x0, x1);
	uint8x16x2_t t23 = vzipq_u8(x2, x3);

	uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
	uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));

	x0 = vreinterpretq_u8_u16(x01.val[0]);
	x1 = vreinterpretq_u8_u16(x01.val[1]);
	x2 = vreinterpretq_u8_u16(x23.val[0]);
	x3 = vreinterpretq_u8_u16(x23.val[1]);
}

static uint8x16_t unzigzag8(uint8x16_t v)
{
	uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
	uint8x16_t xr = vshrq_n_u8(v, 1);

	return veorq_u8(xl, xr);
}
#endif

#ifdef SIMD_WASM
SIMD_TARGET
static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
{
	v128_t t0 = wasmx_unpacklo_v8x16(x0, x1);
	v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
	v128_t t2 = wasmx_unpacklo_v8x16(x2, x3);
	v128_t t3 = wasmx_unpackhi_v8x16(x2, x3);

	x0 = wasmx_unpacklo_v16x8(t0, t2);
	x1 = wasmx_unpackhi_v16x8(t0, t2);
	x2 = wasmx_unpacklo_v16x8(t1, t3);
	x3 = wasmx_unpackhi_v16x8(t1, t3);
}

SIMD_TARGET
static v128_t unzigzag8(v128_t v)
{
	v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
	v128_t xr = wasm_u8x16_shr(v, 1);

	return wasm_v128_xor(xl, xr);
}
#endif

#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
SIMD_TARGET
static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);
	assert(kByteGroupSize == 16);

	const unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return 0;

	data += header_size;

	size_t i = 0;

	// fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=24b
	for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
	{
		size_t header_offset = i / kByteGroupSize;
		unsigned char header_byte = header[header_offset / 4];

		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3);
	}

	// slow-path: process remaining groups
	for (; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kByteGroupDecodeLimit)
			return 0;

		size_t header_offset = i / kByteGroupSize;

		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;

		data = decodeBytesGroupSimd(data, buffer + i, bitslog2);
	}

	return data;
}

SIMD_TARGET
static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize * 4];
	unsigned char transposed[kVertexBlockSizeBytes];

	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

	for (size_t k = 0; k < vertex_size; k += 4)
	{
		for (size_t j = 0; j < 4; ++j)
		{
			data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
			if (!data)
				return 0;
		}

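		// The PREP/LOAD/GRP4/FIXD/SAVE macros below decode 4 byte lanes at a time: LOAD reads 16 zigzag
		// deltas per lane, unzigzag8 + transpose8 regroup them so every 4-byte chunk holds one vertex's
		// deltas, FIXD accumulates each chunk onto the previous vertex bytes (pi), and SAVE writes the
		// reconstructed 4 bytes into the transposed output at a vertex_size stride.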
#if defined(SIMD_SSE) || defined(SIMD_AVX)
#define TEMP __m128i
#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i)
#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
#endif

#ifdef SIMD_NEON
#define TEMP uint8x8_t
#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex + k), vdup_n_u32(0), 0))
#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
#define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
#endif

#ifdef SIMD_WASM
#define TEMP v128_t
#define PREP() v128_t pi = wasm_v128_load(last_vertex + k)
#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
#endif

		PREP();

		unsigned char* savep = transposed + k;

		for (size_t j = 0; j < vertex_count_aligned; j += 16)
		{
			LOAD(0);
			LOAD(1);
			LOAD(2);
			LOAD(3);

			r0 = unzigzag8(r0);
			r1 = unzigzag8(r1);
			r2 = unzigzag8(r2);
			r3 = unzigzag8(r3);

			transpose8(r0, r1, r2, r3);

			TEMP t0, t1, t2, t3;

			GRP4(0);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(1);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(2);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(3);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

#undef TEMP
#undef PREP
#undef LOAD
#undef GRP4
#undef FIXD
#undef SAVE
		}
	}

	memcpy(vertex_data, transposed, vertex_count * vertex_size);

	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
#endif

#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
static unsigned int getCpuFeatures()
{
	int cpuinfo[4] = {};
#ifdef _MSC_VER
	__cpuid(cpuinfo, 1);
#else
	__cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
#endif
	return cpuinfo[2];
}

static unsigned int cpuid = getCpuFeatures();
#endif

} // namespace meshopt

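// Encoded stream layout: one header byte (kVertexHeader | version), the compressed vertex blocks,
// and a tail of max(vertex_size, kTailMaxSize) bytes holding the first vertex (zero-padded in front
// when vertex_size < kTailMaxSize); the decoder seeds last_vertex from this tail, and the tail size
// keeps the per-group bounds checks in the decoders simple.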
size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);

	unsigned char* data = buffer;
	unsigned char* data_end = buffer + buffer_size;

	if (size_t(data_end - data) < 1 + vertex_size)
		return 0;

	int version = gEncodeVertexVersion;

	*data++ = (unsigned char)(kVertexHeader | version);

	unsigned char first_vertex[256] = {};
	if (vertex_count > 0)
		memcpy(first_vertex, vertex_data, vertex_size);

	unsigned char last_vertex[256] = {};
	memcpy(last_vertex, first_vertex, vertex_size);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);

	size_t vertex_offset = 0;

	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;

		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
		if (!data)
			return 0;

		vertex_offset += block_size;
	}

	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;

	if (size_t(data_end - data) < tail_size)
		return 0;

	// write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder
	if (vertex_size < kTailMaxSize)
	{
		memset(data, 0, kTailMaxSize - vertex_size);
		data += kTailMaxSize - vertex_size;
	}

	memcpy(data, first_vertex, vertex_size);
	data += vertex_size;

	assert(data >= buffer + tail_size);
	assert(data <= buffer + buffer_size);

	return data - buffer;
}

size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);
	size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;

	size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
	size_t vertex_block_data_size = vertex_block_size;

	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;

	return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size;
}

void meshopt_encodeVertexVersion(int version)
{
	assert(unsigned(version) <= 0);

	meshopt::gEncodeVertexVersion = version;
}

int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = 0;

#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
	decode = decodeVertexBlockSimd;
#else
	decode = decodeVertexBlock;
#endif

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
	assert(gDecodeBytesGroupInitialized);
	(void)gDecodeBytesGroupInitialized;
#endif

	unsigned char* vertex_data = static_cast<unsigned char*>(destination);

	const unsigned char* data = buffer;
	const unsigned char* data_end = buffer + buffer_size;

	if (size_t(data_end - data) < 1 + vertex_size)
		return -2;

	unsigned char data_header = *data++;

	if ((data_header & 0xf0) != kVertexHeader)
		return -1;

	int version = data_header & 0x0f;
	if (version > 0)
		return -1;

	unsigned char last_vertex[256];
	memcpy(last_vertex, data_end - vertex_size, vertex_size);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);

	size_t vertex_offset = 0;

	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;

		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
		if (!data)
			return -2;

		vertex_offset += block_size;
	}

	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;

	if (size_t(data_end - data) != tail_size)
		return -3;

	return 0;
}

#undef SIMD_NEON
#undef SIMD_SSE
#undef SIMD_AVX
#undef SIMD_WASM
#undef SIMD_FALLBACK
#undef SIMD_TARGET