| 1 | // This file is part of meshoptimizer library; see meshoptimizer.h for version/license details |
| 2 | #include "meshoptimizer.h" |
| 3 | |
| 4 | #include <assert.h> |
| 5 | #include <string.h> |
| 6 | |
| 7 | // This work is based on: |
| 8 | // Tom Forsyth. Linear-Speed Vertex Cache Optimisation. 2006 |
| 9 | // Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007 |
| 10 | namespace meshopt |
| 11 | { |
| 12 | |
const size_t kCacheSizeMax = 16; // largest modeled post-transform cache size; bounds VertexScoreTable::cache
const size_t kValenceMax = 8;    // live triangle counts are clamped to this when scoring; bounds VertexScoreTable::live
| 15 | |
// Score lookup tables for the greedy optimizer: a vertex's score is
// cache[1 + cache_position] + live[clamped live triangle count].
struct VertexScoreTable
{
	float cache[1 + kCacheSizeMax]; // indexed by cache position + 1; slot 0 is "not in cache" (position -1)
	float live[1 + kValenceMax];    // indexed by number of not-yet-emitted triangles, clamped to kValenceMax
};
| 21 | |
| 22 | // Tuned to minimize the ACMR of a GPU that has a cache profile similar to NVidia and AMD |
// Tuned to minimize the ACMR of a GPU that has a cache profile similar to NVidia and AMD
static const VertexScoreTable kVertexScoreTable = {
    // cache position scores: index 0 = not in cache, 1..16 = position in the modeled FIFO (most recent first)
    {0.f, 0.779f, 0.791f, 0.789f, 0.981f, 0.843f, 0.726f, 0.847f, 0.882f, 0.867f, 0.799f, 0.642f, 0.613f, 0.600f, 0.568f, 0.372f, 0.234f},
    // live triangle scores: index = remaining triangle count (clamped to kValenceMax)
    {0.f, 0.995f, 0.713f, 0.450f, 0.404f, 0.059f, 0.005f, 0.147f, 0.006f},
};
| 27 | |
| 28 | // Tuned to minimize the encoded index buffer size |
// Tuned to minimize the encoded index buffer size
static const VertexScoreTable kVertexScoreTableStrip = {
    // cache position scores: index 0 = not in cache, 1..16 = position in the modeled FIFO (most recent first)
    {0.f, 1.000f, 1.000f, 1.000f, 0.453f, 0.561f, 0.490f, 0.459f, 0.179f, 0.526f, 0.000f, 0.227f, 0.184f, 0.490f, 0.112f, 0.050f, 0.131f},
    // live triangle scores: index = remaining triangle count (clamped to kValenceMax)
    {0.f, 0.956f, 0.786f, 0.577f, 0.558f, 0.618f, 0.549f, 0.499f, 0.489f},
};
| 33 | |
// Vertex-to-triangle adjacency in CSR (compressed sparse row) form:
// triangles touching vertex v are data[offsets[v] .. offsets[v] + counts[v]).
struct TriangleAdjacency
{
	unsigned int* counts;  // per-vertex number of adjacent triangles (shrinks as triangles are emitted)
	unsigned int* offsets; // per-vertex start index into data
	unsigned int* data;    // flat list of triangle ids, index_count entries total
};
| 40 | |
| 41 | static void buildTriangleAdjacency(TriangleAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator) |
| 42 | { |
| 43 | size_t face_count = index_count / 3; |
| 44 | |
| 45 | // allocate arrays |
| 46 | adjacency.counts = allocator.allocate<unsigned int>(vertex_count); |
| 47 | adjacency.offsets = allocator.allocate<unsigned int>(vertex_count); |
| 48 | adjacency.data = allocator.allocate<unsigned int>(index_count); |
| 49 | |
| 50 | // fill triangle counts |
| 51 | memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int)); |
| 52 | |
| 53 | for (size_t i = 0; i < index_count; ++i) |
| 54 | { |
| 55 | assert(indices[i] < vertex_count); |
| 56 | |
| 57 | adjacency.counts[indices[i]]++; |
| 58 | } |
| 59 | |
| 60 | // fill offset table |
| 61 | unsigned int offset = 0; |
| 62 | |
| 63 | for (size_t i = 0; i < vertex_count; ++i) |
| 64 | { |
| 65 | adjacency.offsets[i] = offset; |
| 66 | offset += adjacency.counts[i]; |
| 67 | } |
| 68 | |
| 69 | assert(offset == index_count); |
| 70 | |
| 71 | // fill triangle data |
| 72 | for (size_t i = 0; i < face_count; ++i) |
| 73 | { |
| 74 | unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2]; |
| 75 | |
| 76 | adjacency.data[adjacency.offsets[a]++] = unsigned(i); |
| 77 | adjacency.data[adjacency.offsets[b]++] = unsigned(i); |
| 78 | adjacency.data[adjacency.offsets[c]++] = unsigned(i); |
| 79 | } |
| 80 | |
| 81 | // fix offsets that have been disturbed by the previous pass |
| 82 | for (size_t i = 0; i < vertex_count; ++i) |
| 83 | { |
| 84 | assert(adjacency.offsets[i] >= adjacency.counts[i]); |
| 85 | |
| 86 | adjacency.offsets[i] -= adjacency.counts[i]; |
| 87 | } |
| 88 | } |
| 89 | |
| 90 | static unsigned int getNextVertexDeadEnd(const unsigned int* dead_end, unsigned int& dead_end_top, unsigned int& input_cursor, const unsigned int* live_triangles, size_t vertex_count) |
| 91 | { |
| 92 | // check dead-end stack |
| 93 | while (dead_end_top) |
| 94 | { |
| 95 | unsigned int vertex = dead_end[--dead_end_top]; |
| 96 | |
| 97 | if (live_triangles[vertex] > 0) |
| 98 | return vertex; |
| 99 | } |
| 100 | |
| 101 | // input order |
| 102 | while (input_cursor < vertex_count) |
| 103 | { |
| 104 | if (live_triangles[input_cursor] > 0) |
| 105 | return input_cursor; |
| 106 | |
| 107 | ++input_cursor; |
| 108 | } |
| 109 | |
| 110 | return ~0u; |
| 111 | } |
| 112 | |
// Selects the best next fan center among the candidate vertices: a candidate
// only gets a positive priority if fanning all of its remaining triangles would
// still leave it resident in the FIFO cache; among those, the oldest cache
// entry (largest age) wins. Returns ~0u if no candidate has live triangles.
static unsigned int getNextVertexNeighbor(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
{
	unsigned int best_vertex = ~0u;
	int best_priority = -1;

	for (const unsigned int* it = next_candidates_begin; it != next_candidates_end; ++it)
	{
		unsigned int vertex = *it;

		// fully emitted vertices can't seed any more triangles
		if (live_triangles[vertex] == 0)
			continue;

		unsigned int cache_age = timestamp - cache_timestamps[vertex];

		// will this vertex still be in cache after all its triangles are fanned?
		int priority = (2 * live_triangles[vertex] + cache_age <= cache_size) ? int(cache_age) : 0;

		if (priority > best_priority)
		{
			best_vertex = vertex;
			best_priority = priority;
		}
	}

	return best_vertex;
}
| 143 | |
| 144 | static float vertexScore(const VertexScoreTable* table, int cache_position, unsigned int live_triangles) |
| 145 | { |
| 146 | assert(cache_position >= -1 && cache_position < int(kCacheSizeMax)); |
| 147 | |
| 148 | unsigned int live_triangles_clamped = live_triangles < kValenceMax ? live_triangles : kValenceMax; |
| 149 | |
| 150 | return table->cache[1 + cache_position] + table->live[live_triangles_clamped]; |
| 151 | } |
| 152 | |
| 153 | static unsigned int getNextTriangleDeadEnd(unsigned int& input_cursor, const unsigned char* emitted_flags, size_t face_count) |
| 154 | { |
| 155 | // input order |
| 156 | while (input_cursor < face_count) |
| 157 | { |
| 158 | if (!emitted_flags[input_cursor]) |
| 159 | return input_cursor; |
| 160 | |
| 161 | ++input_cursor; |
| 162 | } |
| 163 | |
| 164 | return ~0u; |
| 165 | } |
| 166 | |
| 167 | } // namespace meshopt |
| 168 | |
// Table-driven greedy vertex cache optimizer (Forsyth-style): repeatedly emits
// the highest-scoring triangle, where a triangle's score is the sum of its
// vertices' scores (cache position + remaining valence, per the given table).
// Writes index_count reordered indices to destination; supports in-place
// operation (destination == indices). table selects the tuning (ACMR vs strips).
void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const meshopt::VertexScoreTable* table)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);

	meshopt_Allocator allocator;

	// guard for empty meshes
	if (index_count == 0 || vertex_count == 0)
		return;

	// support in-place optimization
	if (destination == indices)
	{
		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
		indices = indices_copy;
	}

	unsigned int cache_size = 16;
	assert(cache_size <= kCacheSizeMax);

	size_t face_count = index_count / 3;

	// build adjacency information
	TriangleAdjacency adjacency = {};
	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);

	// live triangle counts: how many unemitted triangles still reference each vertex
	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));

	// emitted flags
	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
	memset(emitted_flags, 0, face_count);

	// compute initial vertex scores; -1 = not yet in the modeled cache
	float* vertex_scores = allocator.allocate<float>(vertex_count);

	for (size_t i = 0; i < vertex_count; ++i)
		vertex_scores[i] = vertexScore(table, -1, live_triangles[i]);

	// compute triangle scores
	float* triangle_scores = allocator.allocate<float>(face_count);

	for (size_t i = 0; i < face_count; ++i)
	{
		unsigned int a = indices[i * 3 + 0];
		unsigned int b = indices[i * 3 + 1];
		unsigned int c = indices[i * 3 + 2];

		triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c];
	}

	// double-buffered model of an LRU cache; the +3 slack absorbs the emitted
	// triangle's vertices before the list is truncated back to cache_size
	unsigned int cache_holder[2 * (kCacheSizeMax + 3)];
	unsigned int* cache = cache_holder;
	unsigned int* cache_new = cache_holder + kCacheSizeMax + 3;
	size_t cache_count = 0;

	unsigned int current_triangle = 0;
	unsigned int input_cursor = 1; // triangle 0 is emitted first, so dead-end scans start at 1

	unsigned int output_triangle = 0;

	while (current_triangle != ~0u)
	{
		assert(output_triangle < face_count);

		unsigned int a = indices[current_triangle * 3 + 0];
		unsigned int b = indices[current_triangle * 3 + 1];
		unsigned int c = indices[current_triangle * 3 + 2];

		// output indices
		destination[output_triangle * 3 + 0] = a;
		destination[output_triangle * 3 + 1] = b;
		destination[output_triangle * 3 + 2] = c;
		output_triangle++;

		// update emitted flags
		emitted_flags[current_triangle] = true;
		triangle_scores[current_triangle] = 0;

		// new triangle's vertices go to the front of the modeled cache
		size_t cache_write = 0;
		cache_new[cache_write++] = a;
		cache_new[cache_write++] = b;
		cache_new[cache_write++] = c;

		// old triangles: append prior cache contents, skipping the vertices just inserted
		for (size_t i = 0; i < cache_count; ++i)
		{
			unsigned int index = cache[i];

			if (index != a && index != b && index != c)
			{
				cache_new[cache_write++] = index;
			}
		}

		// swap the double buffers and truncate to the modeled cache size
		unsigned int* cache_temp = cache;
		cache = cache_new, cache_new = cache_temp;
		cache_count = cache_write > cache_size ? cache_size : cache_write;

		// update live triangle counts
		live_triangles[a]--;
		live_triangles[b]--;
		live_triangles[c]--;

		// remove emitted triangle from adjacency data
		// this makes sure that we spend less time traversing these lists on subsequent iterations
		for (size_t k = 0; k < 3; ++k)
		{
			unsigned int index = indices[current_triangle * 3 + k];

			unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
			size_t neighbors_size = adjacency.counts[index];

			for (size_t i = 0; i < neighbors_size; ++i)
			{
				unsigned int tri = neighbors[i];

				if (tri == current_triangle)
				{
					// swap-remove: order within a vertex's triangle list doesn't matter
					neighbors[i] = neighbors[neighbors_size - 1];
					adjacency.counts[index]--;
					break;
				}
			}
		}

		unsigned int best_triangle = ~0u;
		float best_score = 0;

		// update cache positions, vertex scores and triangle scores, and find next best triangle;
		// iterating over cache_write (not cache_count) also rescores vertices just evicted
		for (size_t i = 0; i < cache_write; ++i)
		{
			unsigned int index = cache[i];

			int cache_position = i >= cache_size ? -1 : int(i);

			// update vertex score
			float score = vertexScore(table, cache_position, live_triangles[index]);
			float score_diff = score - vertex_scores[index];

			vertex_scores[index] = score;

			// propagate the score delta to all unemitted triangles using this vertex
			const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[index];
			const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[index];

			for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
			{
				unsigned int tri = *it;
				assert(!emitted_flags[tri]);

				float tri_score = triangle_scores[tri] + score_diff;
				assert(tri_score > 0);

				if (best_score < tri_score)
				{
					best_triangle = tri;
					best_score = tri_score;
				}

				triangle_scores[tri] = tri_score;
			}
		}

		// step through input triangles in order if we hit a dead-end
		current_triangle = best_triangle;

		if (current_triangle == ~0u)
		{
			current_triangle = getNextTriangleDeadEnd(input_cursor, &emitted_flags[0], face_count);
		}
	}

	assert(input_cursor == face_count);
	assert(output_triangle == face_count);
}
| 350 | |
// Reorders triangles to improve vertex cache locality, using the score table
// tuned to minimize ACMR on NVidia/AMD-like cache profiles.
void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
{
	meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTable);
}
| 355 | |
// Reorders triangles using the score table tuned to minimize the encoded
// index buffer size (strip-friendly ordering).
void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
{
	meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTableStrip);
}
| 360 | |
// FIFO-cache vertex cache optimizer (Tipsify, Sander-Nehab-Barczak): grows
// triangle fans around a current vertex, tracking cache residency with
// timestamps and falling back to a dead-end stack / input order when stuck.
// Writes index_count reordered indices to destination; supports in-place
// operation. cache_size is the modeled FIFO cache capacity (must be >= 3).
void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(cache_size >= 3);

	meshopt_Allocator allocator;

	// guard for empty meshes
	if (index_count == 0 || vertex_count == 0)
		return;

	// support in-place optimization
	if (destination == indices)
	{
		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
		indices = indices_copy;
	}

	size_t face_count = index_count / 3;

	// build adjacency information
	TriangleAdjacency adjacency = {};
	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);

	// live triangle counts: how many unemitted triangles still reference each vertex
	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));

	// cache time stamps; a vertex is modeled as in-cache when timestamp - stamp <= cache_size
	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));

	// dead-end stack; each emitted triangle pushes 3 vertices, so index_count entries suffice
	unsigned int* dead_end = allocator.allocate<unsigned int>(index_count);
	unsigned int dead_end_top = 0;

	// emitted flags
	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
	memset(emitted_flags, 0, face_count);

	unsigned int current_vertex = 0;

	// start past cache_size so the zero-initialized timestamps read as "not in cache"
	unsigned int timestamp = cache_size + 1;
	unsigned int input_cursor = 1; // vertex to restart from in case of dead-end

	unsigned int output_triangle = 0;

	while (current_vertex != ~0u)
	{
		const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top;

		// emit all vertex neighbors
		const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
		const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[current_vertex];

		for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
		{
			unsigned int triangle = *it;

			if (!emitted_flags[triangle])
			{
				unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];

				// output indices
				destination[output_triangle * 3 + 0] = a;
				destination[output_triangle * 3 + 1] = b;
				destination[output_triangle * 3 + 2] = c;
				output_triangle++;

				// update dead-end stack
				dead_end[dead_end_top + 0] = a;
				dead_end[dead_end_top + 1] = b;
				dead_end[dead_end_top + 2] = c;
				dead_end_top += 3;

				// update live triangle counts
				live_triangles[a]--;
				live_triangles[b]--;
				live_triangles[c]--;

				// update cache info
				// if vertex is not in cache, put it in cache
				if (timestamp - cache_timestamps[a] > cache_size)
					cache_timestamps[a] = timestamp++;

				if (timestamp - cache_timestamps[b] > cache_size)
					cache_timestamps[b] = timestamp++;

				if (timestamp - cache_timestamps[c] > cache_size)
					cache_timestamps[c] = timestamp++;

				// update emitted flags
				emitted_flags[triangle] = true;
			}
		}

		// next candidates are the ones we pushed to dead-end stack just now
		const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top;

		// get next vertex: prefer a neighbor that stays cache-resident after fanning
		current_vertex = getNextVertexNeighbor(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);

		if (current_vertex == ~0u)
		{
			current_vertex = getNextVertexDeadEnd(&dead_end[0], dead_end_top, input_cursor, &live_triangles[0], vertex_count);
		}
	}

	assert(output_triangle == face_count);
}
| 474 | |