1// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
2#include "meshoptimizer.h"
3
4#include <assert.h>
5#include <string.h>
6
7// This work is based on:
8// Tom Forsyth. Linear-Speed Vertex Cache Optimisation. 2006
9// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
10namespace meshopt
11{
12
// limits for the score tables below: cache positions at or past kCacheSizeMax are
// scored as "not in cache", and vertex valence is clamped to kValenceMax (see vertexScore)
const size_t kCacheSizeMax = 16;
const size_t kValenceMax = 8;
15
// Score lookup tables indexed by cache position and live triangle count.
// Entry 0 of each array is the "not in cache" / "no live triangles" score;
// vertexScore applies the +1 bias so that cache_position == -1 maps to entry 0.
struct VertexScoreTable
{
	float cache[1 + kCacheSizeMax]; // indexed by [1 + cache_position]
	float live[1 + kValenceMax];    // indexed by [min(live_triangles, kValenceMax)]
};
21
// Tuned to minimize the ACMR of a GPU that has a cache profile similar to NVidia and AMD
// (first row: score by cache position, entry 0 = not in cache; second row: score by live triangle count)
static const VertexScoreTable kVertexScoreTable = {
	{0.f, 0.779f, 0.791f, 0.789f, 0.981f, 0.843f, 0.726f, 0.847f, 0.882f, 0.867f, 0.799f, 0.642f, 0.613f, 0.600f, 0.568f, 0.372f, 0.234f},
	{0.f, 0.995f, 0.713f, 0.450f, 0.404f, 0.059f, 0.005f, 0.147f, 0.006f},
};
27
// Tuned to minimize the encoded index buffer size
// (same layout as kVertexScoreTable; used by meshopt_optimizeVertexCacheStrip)
static const VertexScoreTable kVertexScoreTableStrip = {
	{0.f, 1.000f, 1.000f, 1.000f, 0.453f, 0.561f, 0.490f, 0.459f, 0.179f, 0.526f, 0.000f, 0.227f, 0.184f, 0.490f, 0.112f, 0.050f, 0.131f},
	{0.f, 0.956f, 0.786f, 0.577f, 0.558f, 0.618f, 0.549f, 0.499f, 0.489f},
};
33
// Vertex -> incident triangle mapping, stored CSR-style:
// data[offsets[v] .. offsets[v] + counts[v]) lists the triangles that use vertex v
struct TriangleAdjacency
{
	unsigned int* counts;  // per-vertex triangle count
	unsigned int* offsets; // per-vertex start index into data
	unsigned int* data;    // triangle ids, grouped by vertex
};
40
41static void buildTriangleAdjacency(TriangleAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
42{
43 size_t face_count = index_count / 3;
44
45 // allocate arrays
46 adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
47 adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
48 adjacency.data = allocator.allocate<unsigned int>(index_count);
49
50 // fill triangle counts
51 memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
52
53 for (size_t i = 0; i < index_count; ++i)
54 {
55 assert(indices[i] < vertex_count);
56
57 adjacency.counts[indices[i]]++;
58 }
59
60 // fill offset table
61 unsigned int offset = 0;
62
63 for (size_t i = 0; i < vertex_count; ++i)
64 {
65 adjacency.offsets[i] = offset;
66 offset += adjacency.counts[i];
67 }
68
69 assert(offset == index_count);
70
71 // fill triangle data
72 for (size_t i = 0; i < face_count; ++i)
73 {
74 unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
75
76 adjacency.data[adjacency.offsets[a]++] = unsigned(i);
77 adjacency.data[adjacency.offsets[b]++] = unsigned(i);
78 adjacency.data[adjacency.offsets[c]++] = unsigned(i);
79 }
80
81 // fix offsets that have been disturbed by the previous pass
82 for (size_t i = 0; i < vertex_count; ++i)
83 {
84 assert(adjacency.offsets[i] >= adjacency.counts[i]);
85
86 adjacency.offsets[i] -= adjacency.counts[i];
87 }
88}
89
// Picks the next starting vertex once the neighborhood search fails: first pop
// the dead-end stack looking for a vertex with live triangles, then scan the
// remaining vertices in input order; returns ~0u when everything is exhausted.
static unsigned int getNextVertexDeadEnd(const unsigned int* dead_end, unsigned int& dead_end_top, unsigned int& input_cursor, const unsigned int* live_triangles, size_t vertex_count)
{
	// prefer recently used vertices from the dead-end stack
	while (dead_end_top != 0)
	{
		unsigned int candidate = dead_end[--dead_end_top];

		if (live_triangles[candidate] != 0)
			return candidate;
	}

	// fall back to input order; cursor persists across calls so each vertex is scanned once
	for (; input_cursor < vertex_count; ++input_cursor)
	{
		if (live_triangles[input_cursor] != 0)
			return input_cursor;
	}

	return ~0u;
}
112
// Scans the candidate vertices (just-emitted fan) and returns the one with the
// highest priority: vertices that would still be in the FIFO cache after their
// remaining triangles are fanned out are ranked by cache age; all others get
// priority 0. Returns ~0u if no candidate has live triangles left.
static unsigned int getNextVertexNeighbor(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
{
	unsigned int result = ~0u;
	int result_priority = -1;

	for (const unsigned int* it = next_candidates_begin; it != next_candidates_end; ++it)
	{
		unsigned int vertex = *it;

		// vertices with no remaining triangles don't need to be processed
		if (live_triangles[vertex] == 0)
			continue;

		// emitting live_triangles[vertex] triangles pushes up to 2 new vertices each;
		// if the vertex survives that, rank it by its current position in the cache
		unsigned int distance = timestamp - cache_timestamps[vertex];
		int priority = (2 * live_triangles[vertex] + distance <= cache_size) ? int(distance) : 0;

		if (priority > result_priority)
		{
			result = vertex;
			result_priority = priority;
		}
	}

	return result;
}
143
144static float vertexScore(const VertexScoreTable* table, int cache_position, unsigned int live_triangles)
145{
146 assert(cache_position >= -1 && cache_position < int(kCacheSizeMax));
147
148 unsigned int live_triangles_clamped = live_triangles < kValenceMax ? live_triangles : kValenceMax;
149
150 return table->cache[1 + cache_position] + table->live[live_triangles_clamped];
151}
152
// Returns the first not-yet-emitted triangle at or after the cursor (which
// persists across calls, so each triangle is examined at most once overall);
// returns ~0u when all triangles have been emitted.
static unsigned int getNextTriangleDeadEnd(unsigned int& input_cursor, const unsigned char* emitted_flags, size_t face_count)
{
	for (; input_cursor < face_count; ++input_cursor)
	{
		if (emitted_flags[input_cursor] == 0)
			return input_cursor;
	}

	return ~0u;
}
166
167} // namespace meshopt
168
// Reorders triangles to improve the hit rate of a simulated LRU vertex cache using
// the greedy vertex-score algorithm (Forsyth 2006) with the supplied score table.
// destination must hold index_count entries; destination == indices is supported.
void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const meshopt::VertexScoreTable* table)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);

	meshopt_Allocator allocator;

	// guard for empty meshes
	if (index_count == 0 || vertex_count == 0)
		return;

	// support in-place optimization
	if (destination == indices)
	{
		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
		indices = indices_copy;
	}

	// simulated cache size; the score tables only cover positions up to kCacheSizeMax
	unsigned int cache_size = 16;
	assert(cache_size <= kCacheSizeMax);

	size_t face_count = index_count / 3;

	// build adjacency information
	TriangleAdjacency adjacency = {};
	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);

	// live triangle counts; start equal to adjacency counts, decremented as triangles are emitted
	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));

	// emitted flags
	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
	memset(emitted_flags, 0, face_count);

	// compute initial vertex scores; all vertices start outside the cache (position -1)
	float* vertex_scores = allocator.allocate<float>(vertex_count);

	for (size_t i = 0; i < vertex_count; ++i)
		vertex_scores[i] = vertexScore(table, -1, live_triangles[i]);

	// compute triangle scores as the sum of their vertex scores
	float* triangle_scores = allocator.allocate<float>(face_count);

	for (size_t i = 0; i < face_count; ++i)
	{
		unsigned int a = indices[i * 3 + 0];
		unsigned int b = indices[i * 3 + 1];
		unsigned int c = indices[i * 3 + 2];

		triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c];
	}

	// double-buffered cache model; each emitted triangle can add up to 3 entries on top of cache_size
	unsigned int cache_holder[2 * (kCacheSizeMax + 3)];
	unsigned int* cache = cache_holder;
	unsigned int* cache_new = cache_holder + kCacheSizeMax + 3;
	size_t cache_count = 0;

	unsigned int current_triangle = 0;
	unsigned int input_cursor = 1; // dead-end scan starts past triangle 0, which is emitted first

	unsigned int output_triangle = 0;

	while (current_triangle != ~0u)
	{
		assert(output_triangle < face_count);

		unsigned int a = indices[current_triangle * 3 + 0];
		unsigned int b = indices[current_triangle * 3 + 1];
		unsigned int c = indices[current_triangle * 3 + 2];

		// output indices
		destination[output_triangle * 3 + 0] = a;
		destination[output_triangle * 3 + 1] = b;
		destination[output_triangle * 3 + 2] = c;
		output_triangle++;

		// update emitted flags
		emitted_flags[current_triangle] = true;
		triangle_scores[current_triangle] = 0;

		// new triangle's vertices move to the front of the cache
		size_t cache_write = 0;
		cache_new[cache_write++] = a;
		cache_new[cache_write++] = b;
		cache_new[cache_write++] = c;

		// old entries follow, minus the ones just moved to the front
		for (size_t i = 0; i < cache_count; ++i)
		{
			unsigned int index = cache[i];

			if (index != a && index != b && index != c)
			{
				cache_new[cache_write++] = index;
			}
		}

		// swap the double buffers; entries past cache_size are treated as evicted below
		unsigned int* cache_temp = cache;
		cache = cache_new, cache_new = cache_temp;
		cache_count = cache_write > cache_size ? cache_size : cache_write;

		// update live triangle counts
		live_triangles[a]--;
		live_triangles[b]--;
		live_triangles[c]--;

		// remove emitted triangle from adjacency data
		// this makes sure that we spend less time traversing these lists on subsequent iterations
		for (size_t k = 0; k < 3; ++k)
		{
			unsigned int index = indices[current_triangle * 3 + k];

			unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
			size_t neighbors_size = adjacency.counts[index];

			for (size_t i = 0; i < neighbors_size; ++i)
			{
				unsigned int tri = neighbors[i];

				if (tri == current_triangle)
				{
					// swap-erase; the order of a vertex's triangle list doesn't matter
					neighbors[i] = neighbors[neighbors_size - 1];
					adjacency.counts[index]--;
					break;
				}
			}
		}

		unsigned int best_triangle = ~0u;
		float best_score = 0;

		// update cache positions, vertex scores and triangle scores, and find next best triangle
		for (size_t i = 0; i < cache_write; ++i)
		{
			unsigned int index = cache[i];

			// entries pushed past cache_size were evicted and score as "not in cache"
			int cache_position = i >= cache_size ? -1 : int(i);

			// update vertex score
			float score = vertexScore(table, cache_position, live_triangles[index]);
			float score_diff = score - vertex_scores[index];

			vertex_scores[index] = score;

			// propagate the score delta to all remaining triangles that use this vertex
			const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[index];
			const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[index];

			for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
			{
				unsigned int tri = *it;
				assert(!emitted_flags[tri]);

				float tri_score = triangle_scores[tri] + score_diff;
				assert(tri_score > 0);

				if (best_score < tri_score)
				{
					best_triangle = tri;
					best_score = tri_score;
				}

				triangle_scores[tri] = tri_score;
			}
		}

		// step through input triangles in order if we hit a dead-end
		current_triangle = best_triangle;

		if (current_triangle == ~0u)
		{
			current_triangle = getNextTriangleDeadEnd(input_cursor, &emitted_flags[0], face_count);
		}
	}

	assert(input_cursor == face_count);
	assert(output_triangle == face_count);
}
350
// Public entry point: optimizes for vertex cache hit rate using the default
// score table (tuned to minimize ACMR on NVidia/AMD-like cache profiles)
void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
{
	meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTable);
}
355
// Public entry point: same algorithm as meshopt_optimizeVertexCache, but uses
// the score table tuned to minimize the encoded index buffer size
void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
{
	meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTableStrip);
}
360
// Reorders triangles for a FIFO vertex cache of the given size following the
// approach of Sander, Nehab and Barczak 2007: emit all non-emitted triangles
// around the current vertex, then pick the next vertex among the just-emitted
// ones, falling back to the dead-end stack and finally to input order.
// destination must hold index_count entries; destination == indices is supported.
void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(cache_size >= 3);

	meshopt_Allocator allocator;

	// guard for empty meshes
	if (index_count == 0 || vertex_count == 0)
		return;

	// support in-place optimization
	if (destination == indices)
	{
		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
		indices = indices_copy;
	}

	size_t face_count = index_count / 3;

	// build adjacency information
	TriangleAdjacency adjacency = {};
	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);

	// live triangle counts
	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));

	// cache time stamps; a vertex is considered in cache iff timestamp - cache_timestamps[v] <= cache_size
	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));

	// dead-end stack; each emitted triangle pushes 3 entries, so index_count is an upper bound
	unsigned int* dead_end = allocator.allocate<unsigned int>(index_count);
	unsigned int dead_end_top = 0;

	// emitted flags
	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
	memset(emitted_flags, 0, face_count);

	unsigned int current_vertex = 0;

	// start past cache_size so that the initial zero timestamps read as "not in cache"
	unsigned int timestamp = cache_size + 1;
	unsigned int input_cursor = 1; // vertex to restart from in case of dead-end

	unsigned int output_triangle = 0;

	while (current_vertex != ~0u)
	{
		const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top;

		// emit all vertex neighbors
		const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
		const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[current_vertex];

		for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
		{
			unsigned int triangle = *it;

			if (!emitted_flags[triangle])
			{
				unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];

				// output indices
				destination[output_triangle * 3 + 0] = a;
				destination[output_triangle * 3 + 1] = b;
				destination[output_triangle * 3 + 2] = c;
				output_triangle++;

				// update dead-end stack
				dead_end[dead_end_top + 0] = a;
				dead_end[dead_end_top + 1] = b;
				dead_end[dead_end_top + 2] = c;
				dead_end_top += 3;

				// update live triangle counts
				live_triangles[a]--;
				live_triangles[b]--;
				live_triangles[c]--;

				// update cache info
				// if vertex is not in cache, put it in cache
				if (timestamp - cache_timestamps[a] > cache_size)
					cache_timestamps[a] = timestamp++;

				if (timestamp - cache_timestamps[b] > cache_size)
					cache_timestamps[b] = timestamp++;

				if (timestamp - cache_timestamps[c] > cache_size)
					cache_timestamps[c] = timestamp++;

				// update emitted flags
				emitted_flags[triangle] = true;
			}
		}

		// next candidates are the ones we pushed to dead-end stack just now
		const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top;

		// get next vertex
		current_vertex = getNextVertexNeighbor(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);

		if (current_vertex == ~0u)
		{
			current_vertex = getNextVertexDeadEnd(&dead_end[0], dead_end_top, input_cursor, &live_triangles[0], vertex_count);
		}
	}

	assert(output_triangle == face_count);
}
474