// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"

#include <assert.h>
#include <math.h>
#include <string.h>

// This work is based on:
// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
namespace meshopt
{

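// computes a per-cluster sort key: the projection of the vector from the mesh centroid to the
// area-weighted cluster centroid onto the cluster's average normal; per the paper above, clusters
// that face away from the mesh center are likely occluders and should be rendered first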
static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
{
	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);

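	// approximate the mesh centroid as the average of all referenced vertex positions;
	// this effectively weights each vertex by the number of triangles that reference it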
	float mesh_centroid[3] = {};

	for (size_t i = 0; i < index_count; ++i)
	{
		const float* p = vertex_positions + vertex_stride_float * indices[i];

		mesh_centroid[0] += p[0];
		mesh_centroid[1] += p[1];
		mesh_centroid[2] += p[2];
	}

	mesh_centroid[0] /= index_count;
	mesh_centroid[1] /= index_count;
	mesh_centroid[2] /= index_count;

	for (size_t cluster = 0; cluster < cluster_count; ++cluster)
	{
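		// clusters[] stores the first triangle index of each cluster; convert it to a range in the index buffer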
		size_t cluster_begin = clusters[cluster] * 3;
		size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
		assert(cluster_begin < cluster_end);

		float cluster_area = 0;
		float cluster_centroid[3] = {};
		float cluster_normal[3] = {};

		for (size_t i = cluster_begin; i < cluster_end; i += 3)
		{
			const float* p0 = vertex_positions + vertex_stride_float * indices[i + 0];
			const float* p1 = vertex_positions + vertex_stride_float * indices[i + 1];
			const float* p2 = vertex_positions + vertex_stride_float * indices[i + 2];

			float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
			float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};

			float normalx = p10[1] * p20[2] - p10[2] * p20[1];
			float normaly = p10[2] * p20[0] - p10[0] * p20[2];
			float normalz = p10[0] * p20[1] - p10[1] * p20[0];

			float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);

			cluster_centroid[0] += (p0[0] + p1[0] + p2[0]) * (area / 3);
			cluster_centroid[1] += (p0[1] + p1[1] + p2[1]) * (area / 3);
			cluster_centroid[2] += (p0[2] + p1[2] + p2[2]) * (area / 3);
			cluster_normal[0] += normalx;
			cluster_normal[1] += normaly;
			cluster_normal[2] += normalz;
			cluster_area += area;
		}

		float inv_cluster_area = cluster_area == 0 ? 0 : 1 / cluster_area;

		cluster_centroid[0] *= inv_cluster_area;
		cluster_centroid[1] *= inv_cluster_area;
		cluster_centroid[2] *= inv_cluster_area;

		float cluster_normal_length = sqrtf(cluster_normal[0] * cluster_normal[0] + cluster_normal[1] * cluster_normal[1] + cluster_normal[2] * cluster_normal[2]);
		float inv_cluster_normal_length = cluster_normal_length == 0 ? 0 : 1 / cluster_normal_length;

		cluster_normal[0] *= inv_cluster_normal_length;
		cluster_normal[1] *= inv_cluster_normal_length;
		cluster_normal[2] *= inv_cluster_normal_length;

		float centroid_vector[3] = {cluster_centroid[0] - mesh_centroid[0], cluster_centroid[1] - mesh_centroid[1], cluster_centroid[2] - mesh_centroid[2]};

		sort_data[cluster] = centroid_vector[0] * cluster_normal[0] + centroid_vector[1] * cluster_normal[1] + centroid_vector[2] * cluster_normal[2];
	}
}

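// sorts clusters by descending sort key using a single-pass counting sort over 11-bit quantized keys;
// the sort is stable, so clusters with equal keys retain their original cache-friendly order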
static void calculateSortOrderRadix(unsigned int* sort_order, const float* sort_data, unsigned short* sort_keys, size_t cluster_count)
{
	// compute sort data bounds and renormalize, using fixed point snorm
	float sort_data_max = 1e-3f;

	for (size_t i = 0; i < cluster_count; ++i)
	{
		float dpa = fabsf(sort_data[i]);

		sort_data_max = (sort_data_max < dpa) ? dpa : sort_data_max;
	}

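	// 11 bits give 2048 buckets - enough resolution to order the clusters while keeping the histogram small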
	const int sort_bits = 11;

	for (size_t i = 0; i < cluster_count; ++i)
	{
		// note that we flip distribution since high dot product should come first
		float sort_key = 0.5f - 0.5f * (sort_data[i] / sort_data_max);

		sort_keys[i] = meshopt_quantizeUnorm(sort_key, sort_bits) & ((1 << sort_bits) - 1);
	}

	// fill histogram for counting sort
	unsigned int histogram[1 << sort_bits];
	memset(histogram, 0, sizeof(histogram));

	for (size_t i = 0; i < cluster_count; ++i)
	{
		histogram[sort_keys[i]]++;
	}

	// compute offsets based on histogram data
	size_t histogram_sum = 0;

	for (size_t i = 0; i < 1 << sort_bits; ++i)
	{
		size_t count = histogram[i];
		histogram[i] = unsigned(histogram_sum);
		histogram_sum += count;
	}

	assert(histogram_sum == cluster_count);

	// compute sort order based on offsets
	for (size_t i = 0; i < cluster_count; ++i)
	{
		sort_order[histogram[sort_keys[i]]++] = unsigned(i);
	}
}

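// simulates a FIFO vertex cache via per-vertex timestamps: a vertex is considered resident if it was
// last referenced within cache_size steps of the current timestamp; returns the number of cache
// misses (0..3) incurred by the triangle (a, b, c)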
static unsigned int updateCache(unsigned int a, unsigned int b, unsigned int c, unsigned int cache_size, unsigned int* cache_timestamps, unsigned int& timestamp)
{
	unsigned int cache_misses = 0;

	// if vertex is not in cache, put it in cache
	if (timestamp - cache_timestamps[a] > cache_size)
	{
		cache_timestamps[a] = timestamp++;
		cache_misses++;
	}

	if (timestamp - cache_timestamps[b] > cache_size)
	{
		cache_timestamps[b] = timestamp++;
		cache_misses++;
	}

	if (timestamp - cache_timestamps[c] > cache_size)
	{
		cache_timestamps[c] = timestamp++;
		cache_misses++;
	}

	return cache_misses;
}

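// splits the index buffer into clusters by starting a new cluster whenever a triangle misses the
// simulated cache on all three vertices; writes the first triangle index of each cluster to
// destination and returns the cluster count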
static size_t generateHardBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int* cache_timestamps)
{
	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));

	unsigned int timestamp = cache_size + 1;

	size_t face_count = index_count / 3;

	size_t result = 0;

	for (size_t i = 0; i < face_count; ++i)
	{
		unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);

		// when none of the three vertices is in the cache, it's usually relatively safe to assume that this triangle starts a new
		// patch in the mesh that is disjoint from the previous vertices; such a patch may later come back to reference existing
		// vertices, but that frequently suggests an inefficiency in the vertex cache optimization algorithm
		// the first triangle has 3 misses unless it's degenerate - thus we make sure the first cluster always starts with 0
		if (i == 0 || m == 3)
		{
			destination[result++] = unsigned(i);
		}
	}

	assert(result <= index_count / 3);

	return result;
}

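// subdivides each hard cluster by tracking the running ACMR (average cache miss ratio) of the current
// subcluster and splitting once it drops to 'threshold' times the ACMR of the whole hard cluster;
// larger thresholds split more aggressively, trading vertex cache efficiency for overdraw reduction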
static size_t generateSoftBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const unsigned int* clusters, size_t cluster_count, unsigned int cache_size, float threshold, unsigned int* cache_timestamps)
{
	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));

	unsigned int timestamp = 0;

	size_t result = 0;

	for (size_t it = 0; it < cluster_count; ++it)
	{
		size_t start = clusters[it];
		size_t end = (it + 1 < cluster_count) ? clusters[it + 1] : index_count / 3;
		assert(start < end);

		// reset cache
		timestamp += cache_size + 1;

		// measure cluster ACMR
		unsigned int cluster_misses = 0;

		for (size_t i = start; i < end; ++i)
		{
			unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);

			cluster_misses += m;
		}

		float cluster_threshold = threshold * (float(cluster_misses) / float(end - start));

		// first cluster always starts from the hard cluster boundary
		destination[result++] = unsigned(start);

		// reset cache
		timestamp += cache_size + 1;

		unsigned int running_misses = 0;
		unsigned int running_faces = 0;

		for (size_t i = start; i < end; ++i)
		{
			unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);

			running_misses += m;
			running_faces += 1;

			if (float(running_misses) / float(running_faces) <= cluster_threshold)
			{
				// we have reached the target ACMR with the current triangle so we need to start a new cluster on the next one
				// note that this may mean that we add 'end' to destination for the last triangle, which will imply that the last
				// cluster is empty; however, the 'pop_back' (result--) after the loop will clean it up
				destination[result++] = unsigned(i + 1);

				// reset cache
				timestamp += cache_size + 1;

				running_misses = 0;
				running_faces = 0;
			}
		}

		// each time we reach the target ACMR we flush the cluster
		// this means that the last cluster is by definition not very good - there are frequent cases where we are left with a few triangles
		// in the last cluster, producing a very bad ACMR and significantly penalizing the overall results
		// thus we remove the last cluster boundary, merging the last complete cluster with the last incomplete one
		// there are sometimes cases when the last cluster is actually good enough - in which case the code above would have added 'end'
		// to the cluster boundary array which we need to remove anyway - this code will do that automatically
		if (destination[result - 1] != start)
		{
			result--;
		}
	}

	assert(result >= cluster_count);
	assert(result <= index_count / 3);

	return result;
}

} // namespace meshopt

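// the optimizer proceeds in three steps: split the index buffer into cache-coherent clusters (hard
// boundaries refined into soft ones), compute an occluder-potential sort key per cluster, and emit
// the clusters in sorted order
//
// typical usage (a sketch - assumes the index buffer was already optimized with
// meshopt_optimizeVertexCache and that Vertex stores position as three leading floats):
//   meshopt_optimizeVertexCache(indices, indices, index_count, vertex_count);
//   meshopt_optimizeOverdraw(indices, indices, index_count, &vertices[0].x, vertex_count, sizeof(Vertex), 1.05f);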
void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);

	meshopt_Allocator allocator;

	// guard for empty meshes
	if (index_count == 0 || vertex_count == 0)
		return;

	// support in-place optimization
	if (destination == indices)
	{
		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
		indices = indices_copy;
	}

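	// fixed FIFO cache size used for boundary generation; this is an estimate and is not required to
	// match the size of any actual hardware cache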
	unsigned int cache_size = 16;

	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);

	// generate hard boundaries from full-triangle cache misses
	unsigned int* hard_clusters = allocator.allocate<unsigned int>(index_count / 3);
	size_t hard_cluster_count = generateHardBoundaries(hard_clusters, indices, index_count, vertex_count, cache_size, cache_timestamps);

	// generate soft boundaries
	unsigned int* soft_clusters = allocator.allocate<unsigned int>(index_count / 3 + 1);
	size_t soft_cluster_count = generateSoftBoundaries(soft_clusters, indices, index_count, vertex_count, hard_clusters, hard_cluster_count, cache_size, threshold, cache_timestamps);

	const unsigned int* clusters = soft_clusters;
	size_t cluster_count = soft_cluster_count;

	// fill sort data
	float* sort_data = allocator.allocate<float>(cluster_count);
	calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count);

	// sort clusters using sort data
	unsigned short* sort_keys = allocator.allocate<unsigned short>(cluster_count);
	unsigned int* sort_order = allocator.allocate<unsigned int>(cluster_count);
	calculateSortOrderRadix(sort_order, sort_data, sort_keys, cluster_count);

	// fill output buffer
	size_t offset = 0;

	for (size_t it = 0; it < cluster_count; ++it)
	{
		unsigned int cluster = sort_order[it];
		assert(cluster < cluster_count);

		size_t cluster_begin = clusters[cluster] * 3;
		size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
		assert(cluster_begin < cluster_end);

		memcpy(destination + offset, indices + cluster_begin, (cluster_end - cluster_begin) * sizeof(unsigned int));
		offset += cluster_end - cluster_begin;
	}

	assert(offset == index_count);
}