// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"

#include <assert.h>
#include <math.h>
#include <string.h>

// This work is based on:
// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
namespace meshopt
{

static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
{
	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);

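	// compute the mesh centroid as the average of all positions referenced by the index buffer
	// (each vertex contributes once per triangle corner that references it)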
	float mesh_centroid[3] = {};

	for (size_t i = 0; i < index_count; ++i)
	{
		const float* p = vertex_positions + vertex_stride_float * indices[i];

		mesh_centroid[0] += p[0];
		mesh_centroid[1] += p[1];
		mesh_centroid[2] += p[2];
	}

	mesh_centroid[0] /= index_count;
	mesh_centroid[1] /= index_count;
	mesh_centroid[2] /= index_count;

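	// for each cluster, compute an area-weighted centroid and an average normal; the cluster's sort key is the
	// projection of its centroid (relative to the mesh centroid) onto that normal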
	for (size_t cluster = 0; cluster < cluster_count; ++cluster)
	{
		size_t cluster_begin = clusters[cluster] * 3;
		size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
		assert(cluster_begin < cluster_end);

		float cluster_area = 0;
		float cluster_centroid[3] = {};
		float cluster_normal[3] = {};

		for (size_t i = cluster_begin; i < cluster_end; i += 3)
		{
			const float* p0 = vertex_positions + vertex_stride_float * indices[i + 0];
			const float* p1 = vertex_positions + vertex_stride_float * indices[i + 1];
			const float* p2 = vertex_positions + vertex_stride_float * indices[i + 2];

			float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
			float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};

			float normalx = p10[1] * p20[2] - p10[2] * p20[1];
			float normaly = p10[2] * p20[0] - p10[0] * p20[2];
			float normalz = p10[0] * p20[1] - p10[1] * p20[0];

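			// the cross product length is twice the triangle area; the constant factor cancels out in the
			// area-weighted centroid and in the normalized normal below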
			float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);

			cluster_centroid[0] += (p0[0] + p1[0] + p2[0]) * (area / 3);
			cluster_centroid[1] += (p0[1] + p1[1] + p2[1]) * (area / 3);
			cluster_centroid[2] += (p0[2] + p1[2] + p2[2]) * (area / 3);
			cluster_normal[0] += normalx;
			cluster_normal[1] += normaly;
			cluster_normal[2] += normalz;
			cluster_area += area;
		}

		float inv_cluster_area = cluster_area == 0 ? 0 : 1 / cluster_area;

		cluster_centroid[0] *= inv_cluster_area;
		cluster_centroid[1] *= inv_cluster_area;
		cluster_centroid[2] *= inv_cluster_area;

		float cluster_normal_length = sqrtf(cluster_normal[0] * cluster_normal[0] + cluster_normal[1] * cluster_normal[1] + cluster_normal[2] * cluster_normal[2]);
		float inv_cluster_normal_length = cluster_normal_length == 0 ? 0 : 1 / cluster_normal_length;

		cluster_normal[0] *= inv_cluster_normal_length;
		cluster_normal[1] *= inv_cluster_normal_length;
		cluster_normal[2] *= inv_cluster_normal_length;

		float centroid_vector[3] = {cluster_centroid[0] - mesh_centroid[0], cluster_centroid[1] - mesh_centroid[1], cluster_centroid[2] - mesh_centroid[2]};

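		// the sort key is the signed distance of the cluster centroid from the mesh centroid along the cluster normal;
		// clusters with larger values are more likely to occlude the rest of the mesh and should be drawn first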
		sort_data[cluster] = centroid_vector[0] * cluster_normal[0] + centroid_vector[1] * cluster_normal[1] + centroid_vector[2] * cluster_normal[2];
	}
}

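// sorts clusters by decreasing sort_data using a counting sort over 11-bit quantized keys;
// sort_order receives cluster indices, sort_keys is scratch space provided by the caller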
static void calculateSortOrderRadix(unsigned int* sort_order, const float* sort_data, unsigned short* sort_keys, size_t cluster_count)
{
	// compute sort data bounds and renormalize, using fixed point snorm
	float sort_data_max = 1e-3f;

	for (size_t i = 0; i < cluster_count; ++i)
	{
		float dpa = fabsf(sort_data[i]);

		sort_data_max = (sort_data_max < dpa) ? dpa : sort_data_max;
	}

	const int sort_bits = 11;

	for (size_t i = 0; i < cluster_count; ++i)
	{
		// note that we flip the distribution since a high dot product should come first
		float sort_key = 0.5f - 0.5f * (sort_data[i] / sort_data_max);

		sort_keys[i] = meshopt_quantizeUnorm(sort_key, sort_bits) & ((1 << sort_bits) - 1);
	}

	// fill histogram for counting sort
	unsigned int histogram[1 << sort_bits];
	memset(histogram, 0, sizeof(histogram));

	for (size_t i = 0; i < cluster_count; ++i)
	{
		histogram[sort_keys[i]]++;
	}

	// compute offsets based on histogram data
	size_t histogram_sum = 0;

	for (size_t i = 0; i < 1 << sort_bits; ++i)
	{
		size_t count = histogram[i];
		histogram[i] = unsigned(histogram_sum);
		histogram_sum += count;
	}

	assert(histogram_sum == cluster_count);

	// compute sort order based on offsets
	for (size_t i = 0; i < cluster_count; ++i)
	{
		sort_order[histogram[sort_keys[i]]++] = unsigned(i);
	}
}

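// simulates a FIFO vertex cache via per-vertex timestamps: a vertex is considered resident if it was inserted
// within the last cache_size insertions; inserts missing vertices and returns the number of misses for the triangle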
static unsigned int updateCache(unsigned int a, unsigned int b, unsigned int c, unsigned int cache_size, unsigned int* cache_timestamps, unsigned int& timestamp)
{
	unsigned int cache_misses = 0;

	// if vertex is not in cache, put it in cache
	if (timestamp - cache_timestamps[a] > cache_size)
	{
		cache_timestamps[a] = timestamp++;
		cache_misses++;
	}

	if (timestamp - cache_timestamps[b] > cache_size)
	{
		cache_timestamps[b] = timestamp++;
		cache_misses++;
	}

	if (timestamp - cache_timestamps[c] > cache_size)
	{
		cache_timestamps[c] = timestamp++;
		cache_misses++;
	}

	return cache_misses;
}

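// splits the index sequence into clusters, starting a new cluster whenever a triangle misses the simulated cache
// with all three vertices; writes the first triangle index of each cluster to destination and returns the cluster count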
static size_t generateHardBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int* cache_timestamps)
{
	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));

	unsigned int timestamp = cache_size + 1;

	size_t face_count = index_count / 3;

	size_t result = 0;

	for (size_t i = 0; i < face_count; ++i)
	{
		unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);

		// when all three vertices are not in the cache it's usually relatively safe to assume that this is a new patch in the mesh
		// that is disjoint from previous vertices; sometimes it might come back to reference existing vertices but that frequently
		// suggests an inefficiency in the vertex cache optimization algorithm
		// usually the first triangle has 3 misses unless it's degenerate - thus we make sure the first cluster always starts with 0
		if (i == 0 || m == 3)
		{
			destination[result++] = unsigned(i);
		}
	}

	assert(result <= index_count / 3);

	return result;
}

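// subdivides each hard cluster by emitting an additional boundary whenever the running miss ratio (ACMR) drops to
// threshold times the cluster's overall ACMR, so the resulting clusters can be reordered with a bounded cache penalty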
static size_t generateSoftBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const unsigned int* clusters, size_t cluster_count, unsigned int cache_size, float threshold, unsigned int* cache_timestamps)
{
	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));

	unsigned int timestamp = 0;

	size_t result = 0;

	for (size_t it = 0; it < cluster_count; ++it)
	{
		size_t start = clusters[it];
		size_t end = (it + 1 < cluster_count) ? clusters[it + 1] : index_count / 3;
		assert(start < end);

		// reset cache
		timestamp += cache_size + 1;

		// measure cluster ACMR
		unsigned int cluster_misses = 0;

		for (size_t i = start; i < end; ++i)
		{
			unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);

			cluster_misses += m;
		}

		float cluster_threshold = threshold * (float(cluster_misses) / float(end - start));

		// first cluster always starts from the hard cluster boundary
		destination[result++] = unsigned(start);

		// reset cache
		timestamp += cache_size + 1;

		unsigned int running_misses = 0;
		unsigned int running_faces = 0;

		for (size_t i = start; i < end; ++i)
		{
			unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);

			running_misses += m;
			running_faces += 1;

			if (float(running_misses) / float(running_faces) <= cluster_threshold)
			{
				// we have reached the target ACMR with the current triangle so we need to start a new cluster on the next one
				// note that this may mean that we add 'end' to destination for the last triangle, which will imply that the last
				// cluster is empty; however, the boundary removal after the loop will clean it up
				destination[result++] = unsigned(i + 1);

				// reset cache
				timestamp += cache_size + 1;

				running_misses = 0;
				running_faces = 0;
			}
		}

		// each time we reach the target ACMR we flush the cluster
		// this means that the last cluster is by definition not very good - we are frequently left with just a few triangles
		// in the last cluster, producing a very bad ACMR and significantly penalizing the overall results
		// thus we remove the last cluster boundary, merging the last complete cluster with the last incomplete one
		// sometimes the last cluster is actually good enough - in that case the code above has already added 'end'
		// to the cluster boundary array, which we need to remove anyway - this check does that automatically
		if (destination[result - 1] != start)
		{
			result--;
		}
	}

	assert(result >= cluster_count);
	assert(result <= index_count / 3);

	return result;
}

} // namespace meshopt

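// A minimal usage sketch (not part of this file's logic): overdraw optimization typically runs after vertex cache
// optimization and before vertex fetch optimization; 'vertices' and 'Vertex' below are assumed caller-side data with
// the position stored in the first three floats:
//
//   meshopt_optimizeVertexCache(indices, indices, index_count, vertex_count);
//   meshopt_optimizeOverdraw(indices, indices, index_count, &vertices[0].x, vertex_count, sizeof(Vertex), 1.05f);
//   meshopt_optimizeVertexFetch(vertices, indices, index_count, vertices, vertex_count, sizeof(Vertex));
//
// a threshold of 1.05 allows roughly a 5% regression in ACMR in exchange for more freedom to reorder clusters.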
void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);

	meshopt_Allocator allocator;

	// guard for empty meshes
	if (index_count == 0 || vertex_count == 0)
		return;

	// support in-place optimization
	if (destination == indices)
	{
		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
		indices = indices_copy;
	}

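	// size of the simulated post-transform cache used to generate cluster boundaries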
	unsigned int cache_size = 16;

	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);

	// generate hard boundaries from full-triangle cache misses
	unsigned int* hard_clusters = allocator.allocate<unsigned int>(index_count / 3);
	size_t hard_cluster_count = generateHardBoundaries(hard_clusters, indices, index_count, vertex_count, cache_size, cache_timestamps);

	// generate soft boundaries
	unsigned int* soft_clusters = allocator.allocate<unsigned int>(index_count / 3 + 1);
	size_t soft_cluster_count = generateSoftBoundaries(soft_clusters, indices, index_count, vertex_count, hard_clusters, hard_cluster_count, cache_size, threshold, cache_timestamps);

	const unsigned int* clusters = soft_clusters;
	size_t cluster_count = soft_cluster_count;

	// fill sort data
	float* sort_data = allocator.allocate<float>(cluster_count);
	calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count);

	// sort clusters using sort data
	unsigned short* sort_keys = allocator.allocate<unsigned short>(cluster_count);
	unsigned int* sort_order = allocator.allocate<unsigned int>(cluster_count);
	calculateSortOrderRadix(sort_order, sort_data, sort_keys, cluster_count);

	// fill output buffer
	size_t offset = 0;

	for (size_t it = 0; it < cluster_count; ++it)
	{
		unsigned int cluster = sort_order[it];
		assert(cluster < cluster_count);

		size_t cluster_begin = clusters[cluster] * 3;
		size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
		assert(cluster_begin < cluster_end);

		memcpy(destination + offset, indices + cluster_begin, (cluster_end - cluster_begin) * sizeof(unsigned int));
		offset += cluster_end - cluster_begin;
	}

	assert(offset == index_count);
}