1 | // basisu_backend.cpp |
2 | // Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 (the "License"); |
5 | // you may not use this file except in compliance with the License. |
6 | // You may obtain a copy of the License at |
7 | // |
8 | // http://www.apache.org/licenses/LICENSE-2.0 |
9 | // |
10 | // Unless required by applicable law or agreed to in writing, software |
11 | // distributed under the License is distributed on an "AS IS" BASIS, |
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | // See the License for the specific language governing permissions and |
14 | // limitations under the License. |
15 | // |
16 | // TODO: This code originally supported full ETC1 and ETC1S, so there's some legacy stuff in here. |
17 | // |
18 | #include "basisu_backend.h" |
19 | |
20 | #if BASISU_SUPPORT_SSE |
21 | #define CPPSPMD_NAME(a) a##_sse41 |
22 | #include "basisu_kernels_declares.h" |
23 | #endif |
24 | |
25 | #define BASISU_FASTER_SELECTOR_REORDERING 0 |
26 | #define BASISU_BACKEND_VERIFY(c) verify(c, __LINE__); |
27 | |
28 | namespace basisu |
29 | { |
30 | // TODO |
31 | static inline void verify(bool condition, int line) |
32 | { |
33 | if (!condition) |
34 | { |
			fprintf(stderr, "ERROR: basisu_backend: verify() failed at line %i!\n", line);
36 | abort(); |
37 | } |
38 | } |
39 | |
40 | basisu_backend::basisu_backend() |
41 | { |
42 | clear(); |
43 | } |
44 | |
45 | void basisu_backend::clear() |
46 | { |
47 | m_pFront_end = NULL; |
48 | m_params.clear(); |
49 | m_output.clear(); |
50 | } |
51 | |
52 | void basisu_backend::init(basisu_frontend* pFront_end, basisu_backend_params& params, const basisu_backend_slice_desc_vec& slice_descs) |
53 | { |
54 | m_pFront_end = pFront_end; |
55 | m_params = params; |
56 | m_slices = slice_descs; |
57 | |
58 | debug_printf("basisu_backend::Init: Slices: %u, ETC1S: %u, EndpointRDOQualityThresh: %f, SelectorRDOQualityThresh: %f\n" , |
59 | m_slices.size(), |
60 | params.m_etc1s, |
61 | params.m_endpoint_rdo_quality_thresh, |
62 | params.m_selector_rdo_quality_thresh); |
63 | |
64 | debug_printf("Frontend endpoints: %u selectors: %u\n" , m_pFront_end->get_total_endpoint_clusters(), m_pFront_end->get_total_selector_clusters()); |
65 | |
66 | for (uint32_t i = 0; i < m_slices.size(); i++) |
67 | { |
68 | debug_printf("Slice: %u, OrigWidth: %u, OrigHeight: %u, Width: %u, Height: %u, NumBlocksX: %u, NumBlocksY: %u, FirstBlockIndex: %u\n" , |
69 | i, |
70 | m_slices[i].m_orig_width, m_slices[i].m_orig_height, |
71 | m_slices[i].m_width, m_slices[i].m_height, |
72 | m_slices[i].m_num_blocks_x, m_slices[i].m_num_blocks_y, |
73 | m_slices[i].m_first_block_index); |
74 | } |
75 | } |
76 | |
77 | void basisu_backend::create_endpoint_palette() |
78 | { |
79 | const basisu_frontend& r = *m_pFront_end; |
80 | |
81 | m_output.m_num_endpoints = r.get_total_endpoint_clusters(); |
82 | |
83 | m_endpoint_palette.resize(r.get_total_endpoint_clusters()); |
84 | for (uint32_t i = 0; i < r.get_total_endpoint_clusters(); i++) |
85 | { |
86 | etc1_endpoint_palette_entry& e = m_endpoint_palette[i]; |
87 | |
88 | e.m_color5_valid = r.get_endpoint_cluster_color_is_used(i, false); |
89 | e.m_color5 = r.get_endpoint_cluster_unscaled_color(i, false); |
90 | e.m_inten5 = r.get_endpoint_cluster_inten_table(i, false); |
91 | |
92 | BASISU_BACKEND_VERIFY(e.m_color5_valid); |
93 | } |
94 | } |
95 | |
96 | void basisu_backend::create_selector_palette() |
97 | { |
98 | const basisu_frontend& r = *m_pFront_end; |
99 | |
100 | m_output.m_num_selectors = r.get_total_selector_clusters(); |
101 | |
102 | m_selector_palette.resize(r.get_total_selector_clusters()); |
103 | |
104 | for (uint32_t i = 0; i < r.get_total_selector_clusters(); i++) |
105 | { |
106 | etc1_selector_palette_entry& s = m_selector_palette[i]; |
107 | |
108 | const etc_block& selector_bits = r.get_selector_cluster_selector_bits(i); |
109 | |
110 | for (uint32_t y = 0; y < 4; y++) |
111 | { |
112 | for (uint32_t x = 0; x < 4; x++) |
113 | { |
114 | s[y * 4 + x] = static_cast<uint8_t>(selector_bits.get_selector(x, y)); |
115 | } |
116 | } |
117 | } |
118 | } |
119 | |
120 | static const struct |
121 | { |
122 | int8_t m_dx, m_dy; |
123 | } g_endpoint_preds[] = |
124 | { |
125 | { -1, 0 }, |
126 | { 0, -1 }, |
127 | { -1, -1 } |
128 | }; |
129 | |
130 | void basisu_backend::reoptimize_and_sort_endpoints_codebook(uint32_t total_block_endpoints_remapped, uint_vec& all_endpoint_indices) |
131 | { |
132 | basisu_frontend& r = *m_pFront_end; |
133 | //const bool is_video = r.get_params().m_tex_type == basist::cBASISTexTypeVideoFrames; |
134 | |
135 | if (m_params.m_used_global_codebooks) |
136 | { |
137 | m_endpoint_remap_table_old_to_new.clear(); |
138 | m_endpoint_remap_table_old_to_new.resize(r.get_total_endpoint_clusters()); |
139 | for (uint32_t i = 0; i < r.get_total_endpoint_clusters(); i++) |
140 | m_endpoint_remap_table_old_to_new[i] = i; |
141 | } |
142 | else |
143 | { |
144 | //if ((total_block_endpoints_remapped) && (m_params.m_compression_level > 0)) |
145 | if ((total_block_endpoints_remapped) && (m_params.m_compression_level > 1)) |
146 | { |
147 | // We've changed the block endpoint indices, so we need to go and adjust the endpoint codebook (remove unused entries, optimize existing entries that have changed) |
148 | uint_vec new_block_endpoints(get_total_blocks()); |
149 | |
150 | for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++) |
151 | { |
152 | const uint32_t first_block_index = m_slices[slice_index].m_first_block_index; |
153 | const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x; |
154 | const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y; |
155 | |
156 | for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) |
157 | for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) |
158 | new_block_endpoints[first_block_index + block_x + block_y * num_blocks_x] = m_slice_encoder_blocks[slice_index](block_x, block_y).m_endpoint_index; |
159 | } |
160 | |
161 | int_vec old_to_new_endpoint_indices; |
162 | r.reoptimize_remapped_endpoints(new_block_endpoints, old_to_new_endpoint_indices, true); |
163 | |
164 | create_endpoint_palette(); |
165 | |
166 | for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++) |
167 | { |
168 | //const uint32_t first_block_index = m_slices[slice_index].m_first_block_index; |
169 | |
170 | //const uint32_t width = m_slices[slice_index].m_width; |
171 | //const uint32_t height = m_slices[slice_index].m_height; |
172 | const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x; |
173 | const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y; |
174 | |
175 | for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) |
176 | { |
177 | for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) |
178 | { |
179 | //const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x; |
180 | |
181 | encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y); |
182 | |
183 | m.m_endpoint_index = old_to_new_endpoint_indices[m.m_endpoint_index]; |
184 | } // block_x |
185 | } // block_y |
186 | } // slice_index |
187 | |
188 | for (uint32_t i = 0; i < all_endpoint_indices.size(); i++) |
189 | all_endpoint_indices[i] = old_to_new_endpoint_indices[all_endpoint_indices[i]]; |
190 | |
191 | } //if (total_block_endpoints_remapped) |
192 | |
193 | // Sort endpoint codebook |
194 | palette_index_reorderer reorderer; |
195 | reorderer.init((uint32_t)all_endpoint_indices.size(), &all_endpoint_indices[0], r.get_total_endpoint_clusters(), nullptr, nullptr, 0); |
196 | m_endpoint_remap_table_old_to_new = reorderer.get_remap_table(); |
197 | } |
198 | |
199 | // For endpoints, old_to_new[] may not be bijective! |
200 | // Some "old" entries may be unused and don't get remapped into the "new" array. |
201 | |
202 | m_old_endpoint_was_used.clear(); |
203 | m_old_endpoint_was_used.resize(r.get_total_endpoint_clusters()); |
204 | uint32_t first_old_entry_index = UINT32_MAX; |
205 | |
206 | for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++) |
207 | { |
208 | const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x, num_blocks_y = m_slices[slice_index].m_num_blocks_y; |
209 | for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) |
210 | { |
211 | for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) |
212 | { |
213 | encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y); |
214 | const uint32_t old_endpoint_index = m.m_endpoint_index; |
215 | |
216 | m_old_endpoint_was_used[old_endpoint_index] = true; |
217 | first_old_entry_index = basisu::minimum(first_old_entry_index, old_endpoint_index); |
218 | } // block_x |
219 | } // block_y |
220 | } // slice_index |
221 | |
222 | debug_printf("basisu_backend::reoptimize_and_sort_endpoints_codebook: First old entry index: %u\n" , first_old_entry_index); |
223 | |
224 | m_new_endpoint_was_used.clear(); |
225 | m_new_endpoint_was_used.resize(r.get_total_endpoint_clusters()); |
226 | |
227 | m_endpoint_remap_table_new_to_old.clear(); |
228 | m_endpoint_remap_table_new_to_old.resize(r.get_total_endpoint_clusters()); |
229 | |
230 | // Set unused entries in the new array to point to the first used entry in the old array. |
231 | m_endpoint_remap_table_new_to_old.set_all(first_old_entry_index); |
232 | |
233 | for (uint32_t old_index = 0; old_index < m_endpoint_remap_table_old_to_new.size(); old_index++) |
234 | { |
235 | if (m_old_endpoint_was_used[old_index]) |
236 | { |
237 | const uint32_t new_index = m_endpoint_remap_table_old_to_new[old_index]; |
238 | |
239 | m_new_endpoint_was_used[new_index] = true; |
240 | |
241 | m_endpoint_remap_table_new_to_old[new_index] = old_index; |
242 | } |
243 | } |
244 | } |
245 | |
246 | void basisu_backend::sort_selector_codebook() |
247 | { |
248 | basisu_frontend& r = *m_pFront_end; |
249 | |
250 | m_selector_remap_table_new_to_old.resize(r.get_total_selector_clusters()); |
251 | |
252 | if ((m_params.m_compression_level == 0) || (m_params.m_used_global_codebooks)) |
253 | { |
254 | for (uint32_t i = 0; i < r.get_total_selector_clusters(); i++) |
255 | m_selector_remap_table_new_to_old[i] = i; |
256 | } |
257 | else |
258 | { |
259 | m_selector_remap_table_new_to_old[0] = 0; |
260 | uint32_t prev_selector_index = 0; |
261 | |
262 | int_vec remaining_selectors; |
263 | remaining_selectors.reserve(r.get_total_selector_clusters() - 1); |
264 | for (uint32_t i = 1; i < r.get_total_selector_clusters(); i++) |
265 | remaining_selectors.push_back(i); |
266 | |
267 | uint_vec selector_palette_bytes(m_selector_palette.size()); |
268 | for (uint32_t i = 0; i < m_selector_palette.size(); i++) |
269 | selector_palette_bytes[i] = m_selector_palette[i].get_byte(0) | (m_selector_palette[i].get_byte(1) << 8) | (m_selector_palette[i].get_byte(2) << 16) | (m_selector_palette[i].get_byte(3) << 24); |
270 | |
			// Greedy nearest-neighbor ordering (a cheap approximation of a traveling salesman tour):
			// repeatedly append the remaining selector whose packed selector bits have the smallest
			// Hamming distance to the previously emitted entry, so adjacent codebook entries differ
			// in as few selector bits as possible.
272 | for (uint32_t i = 1; i < r.get_total_selector_clusters(); i++) |
273 | { |
274 | uint32_t best_hamming_dist = 100; |
275 | uint32_t best_index = 0; |
276 | |
277 | #if BASISU_FASTER_SELECTOR_REORDERING |
278 | const uint32_t step = (remaining_selectors.size() > 16) ? 16 : 1; |
279 | for (uint32_t j = 0; j < remaining_selectors.size(); j += step) |
280 | #else |
281 | for (uint32_t j = 0; j < remaining_selectors.size(); j++) |
282 | #endif |
283 | { |
284 | int selector_index = remaining_selectors[j]; |
285 | |
286 | uint32_t k = selector_palette_bytes[prev_selector_index] ^ selector_palette_bytes[selector_index]; |
287 | uint32_t hamming_dist = g_hamming_dist[k & 0xFF] + g_hamming_dist[(k >> 8) & 0xFF] + g_hamming_dist[(k >> 16) & 0xFF] + g_hamming_dist[k >> 24]; |
288 | |
289 | if (hamming_dist < best_hamming_dist) |
290 | { |
291 | best_hamming_dist = hamming_dist; |
292 | best_index = j; |
293 | if (best_hamming_dist <= 1) |
294 | break; |
295 | } |
296 | } |
297 | |
298 | prev_selector_index = remaining_selectors[best_index]; |
299 | m_selector_remap_table_new_to_old[i] = prev_selector_index; |
300 | |
301 | remaining_selectors[best_index] = remaining_selectors.back(); |
302 | remaining_selectors.resize(remaining_selectors.size() - 1); |
303 | } |
304 | } |
305 | |
306 | m_selector_remap_table_old_to_new.resize(r.get_total_selector_clusters()); |
307 | for (uint32_t i = 0; i < m_selector_remap_table_new_to_old.size(); i++) |
308 | m_selector_remap_table_old_to_new[m_selector_remap_table_new_to_old[i]] = i; |
309 | } |
310 | int basisu_backend::find_video_frame(int slice_index, int delta) |
311 | { |
312 | for (uint32_t s = 0; s < m_slices.size(); s++) |
313 | { |
314 | if ((int)m_slices[s].m_source_file_index != ((int)m_slices[slice_index].m_source_file_index + delta)) |
315 | continue; |
316 | if (m_slices[s].m_mip_index != m_slices[slice_index].m_mip_index) |
317 | continue; |
318 | |
319 | // Being super paranoid here. |
320 | if (m_slices[s].m_num_blocks_x != (m_slices[slice_index].m_num_blocks_x)) |
321 | continue; |
322 | if (m_slices[s].m_num_blocks_y != (m_slices[slice_index].m_num_blocks_y)) |
323 | continue; |
324 | if (m_slices[s].m_alpha != (m_slices[slice_index].m_alpha)) |
325 | continue; |
326 | return s; |
327 | } |
328 | |
329 | return -1; |
330 | } |
331 | |
332 | void basisu_backend::check_for_valid_cr_blocks() |
333 | { |
334 | basisu_frontend& r = *m_pFront_end; |
335 | const bool is_video = r.get_params().m_tex_type == basist::cBASISTexTypeVideoFrames; |
336 | |
337 | if (!is_video) |
338 | return; |
339 | |
340 | debug_printf("basisu_backend::check_for_valid_cr_blocks\n" ); |
341 | |
342 | uint32_t total_crs = 0; |
343 | uint32_t total_invalid_crs = 0; |
344 | |
345 | for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++) |
346 | { |
347 | const bool is_iframe = m_slices[slice_index].m_iframe; |
348 | //const uint32_t first_block_index = m_slices[slice_index].m_first_block_index; |
349 | |
350 | //const uint32_t width = m_slices[slice_index].m_width; |
351 | //const uint32_t height = m_slices[slice_index].m_height; |
352 | const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x; |
353 | const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y; |
354 | const int prev_frame_slice_index = find_video_frame(slice_index, -1); |
355 | |
356 | // If we don't have a previous frame, and we're not an i-frame, something is wrong. |
357 | if ((prev_frame_slice_index < 0) && (!is_iframe)) |
358 | { |
359 | BASISU_BACKEND_VERIFY(0); |
360 | } |
361 | |
362 | if ((is_iframe) || (prev_frame_slice_index < 0)) |
363 | { |
364 | // Ensure no blocks use CR's |
365 | for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) |
366 | { |
367 | for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) |
368 | { |
369 | encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y); |
370 | BASISU_BACKEND_VERIFY(m.m_endpoint_predictor != basist::CR_ENDPOINT_PRED_INDEX); |
371 | } |
372 | } |
373 | } |
374 | else |
375 | { |
376 | // For blocks that use CR's, make sure the endpoints/selectors haven't really changed. |
377 | for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) |
378 | { |
379 | for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) |
380 | { |
381 | encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y); |
382 | |
383 | if (m.m_endpoint_predictor == basist::CR_ENDPOINT_PRED_INDEX) |
384 | { |
385 | total_crs++; |
386 | |
387 | encoder_block& prev_m = m_slice_encoder_blocks[prev_frame_slice_index](block_x, block_y); |
388 | |
389 | if ((m.m_endpoint_index != prev_m.m_endpoint_index) || (m.m_selector_index != prev_m.m_selector_index)) |
390 | { |
391 | total_invalid_crs++; |
392 | } |
393 | } |
394 | } // block_x |
395 | } // block_y |
396 | |
			} // !is_iframe
398 | |
399 | } // slice_index |
400 | |
401 | debug_printf("Total CR's: %u, Total invalid CR's: %u\n" , total_crs, total_invalid_crs); |
402 | |
403 | BASISU_BACKEND_VERIFY(total_invalid_crs == 0); |
404 | } |
405 | |
406 | void basisu_backend::create_encoder_blocks() |
407 | { |
408 | debug_printf("basisu_backend::create_encoder_blocks\n" ); |
409 | |
410 | interval_timer tm; |
411 | tm.start(); |
412 | |
413 | basisu_frontend& r = *m_pFront_end; |
414 | const bool is_video = r.get_params().m_tex_type == basist::cBASISTexTypeVideoFrames; |
415 | |
416 | m_slice_encoder_blocks.resize(m_slices.size()); |
417 | |
418 | uint32_t total_endpoint_pred_missed = 0, total_endpoint_pred_hits = 0, total_block_endpoints_remapped = 0; |
419 | |
420 | uint_vec all_endpoint_indices; |
421 | all_endpoint_indices.reserve(get_total_blocks()); |
422 | |
423 | for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++) |
424 | { |
425 | const int prev_frame_slice_index = is_video ? find_video_frame(slice_index, -1) : -1; |
426 | const bool is_iframe = m_slices[slice_index].m_iframe; |
427 | const uint32_t first_block_index = m_slices[slice_index].m_first_block_index; |
428 | |
429 | //const uint32_t width = m_slices[slice_index].m_width; |
430 | //const uint32_t height = m_slices[slice_index].m_height; |
431 | const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x; |
432 | const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y; |
433 | |
434 | m_slice_encoder_blocks[slice_index].resize(num_blocks_x, num_blocks_y); |
435 | |
436 | for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) |
437 | { |
438 | for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) |
439 | { |
440 | const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x; |
441 | |
442 | encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y); |
443 | |
444 | m.m_endpoint_index = r.get_subblock_endpoint_cluster_index(block_index, 0); |
445 | BASISU_BACKEND_VERIFY(r.get_subblock_endpoint_cluster_index(block_index, 0) == r.get_subblock_endpoint_cluster_index(block_index, 1)); |
446 | |
447 | m.m_selector_index = r.get_block_selector_cluster_index(block_index); |
448 | |
449 | m.m_endpoint_predictor = basist::NO_ENDPOINT_PRED_INDEX; |
450 | |
451 | const uint32_t block_endpoint = m.m_endpoint_index; |
452 | |
453 | uint32_t best_endpoint_pred = UINT32_MAX; |
454 | |
455 | for (uint32_t endpoint_pred = 0; endpoint_pred < basist::NUM_ENDPOINT_PREDS; endpoint_pred++) |
456 | { |
457 | if ((is_video) && (endpoint_pred == basist::CR_ENDPOINT_PRED_INDEX)) |
458 | { |
459 | if ((prev_frame_slice_index != -1) && (!is_iframe)) |
460 | { |
461 | const uint32_t cur_endpoint = m_slice_encoder_blocks[slice_index](block_x, block_y).m_endpoint_index; |
462 | const uint32_t cur_selector = m_slice_encoder_blocks[slice_index](block_x, block_y).m_selector_index; |
463 | const uint32_t prev_endpoint = m_slice_encoder_blocks[prev_frame_slice_index](block_x, block_y).m_endpoint_index; |
464 | const uint32_t prev_selector = m_slice_encoder_blocks[prev_frame_slice_index](block_x, block_y).m_selector_index; |
465 | if ((cur_endpoint == prev_endpoint) && (cur_selector == prev_selector)) |
466 | { |
467 | best_endpoint_pred = basist::CR_ENDPOINT_PRED_INDEX; |
468 | m_slice_encoder_blocks[prev_frame_slice_index](block_x, block_y).m_is_cr_target = true; |
469 | } |
470 | } |
471 | } |
472 | else |
473 | { |
474 | int pred_block_x = block_x + g_endpoint_preds[endpoint_pred].m_dx; |
475 | if ((pred_block_x < 0) || (pred_block_x >= (int)num_blocks_x)) |
476 | continue; |
477 | |
478 | int pred_block_y = block_y + g_endpoint_preds[endpoint_pred].m_dy; |
479 | if ((pred_block_y < 0) || (pred_block_y >= (int)num_blocks_y)) |
480 | continue; |
481 | |
482 | uint32_t pred_endpoint = m_slice_encoder_blocks[slice_index](pred_block_x, pred_block_y).m_endpoint_index; |
483 | |
484 | if (pred_endpoint == block_endpoint) |
485 | { |
486 | if (endpoint_pred < best_endpoint_pred) |
487 | { |
488 | best_endpoint_pred = endpoint_pred; |
489 | } |
490 | } |
491 | } |
492 | |
493 | } // endpoint_pred |
494 | |
495 | if (best_endpoint_pred != UINT32_MAX) |
496 | { |
497 | m.m_endpoint_predictor = best_endpoint_pred; |
498 | |
499 | total_endpoint_pred_hits++; |
500 | } |
501 | else if (m_params.m_endpoint_rdo_quality_thresh > 0.0f) |
502 | { |
503 | const pixel_block& src_pixels = r.get_source_pixel_block(block_index); |
504 | |
505 | etc_block etc_blk(r.get_output_block(block_index)); |
506 | |
507 | uint64_t cur_err = etc_blk.evaluate_etc1_error(src_pixels.get_ptr(), r.get_params().m_perceptual); |
508 | |
509 | if (cur_err) |
510 | { |
511 | const uint64_t thresh_err = (uint64_t)(cur_err * maximum(1.0f, m_params.m_endpoint_rdo_quality_thresh)); |
512 | |
513 | etc_block trial_etc_block(etc_blk); |
514 | |
515 | uint64_t best_err = UINT64_MAX; |
516 | uint32_t best_endpoint_index = 0; |
517 | |
518 | best_endpoint_pred = UINT32_MAX; |
519 | |
520 | for (uint32_t endpoint_pred = 0; endpoint_pred < basist::NUM_ENDPOINT_PREDS; endpoint_pred++) |
521 | { |
522 | if ((is_video) && (endpoint_pred == basist::CR_ENDPOINT_PRED_INDEX)) |
523 | continue; |
524 | |
525 | int pred_block_x = block_x + g_endpoint_preds[endpoint_pred].m_dx; |
526 | if ((pred_block_x < 0) || (pred_block_x >= (int)num_blocks_x)) |
527 | continue; |
528 | |
529 | int pred_block_y = block_y + g_endpoint_preds[endpoint_pred].m_dy; |
530 | if ((pred_block_y < 0) || (pred_block_y >= (int)num_blocks_y)) |
531 | continue; |
532 | |
533 | uint32_t pred_endpoint_index = m_slice_encoder_blocks[slice_index](pred_block_x, pred_block_y).m_endpoint_index; |
534 | |
535 | uint32_t pred_inten = r.get_endpoint_cluster_inten_table(pred_endpoint_index, false); |
536 | color_rgba pred_color = r.get_endpoint_cluster_unscaled_color(pred_endpoint_index, false); |
537 | |
538 | trial_etc_block.set_block_color5(pred_color, pred_color); |
539 | trial_etc_block.set_inten_table(0, pred_inten); |
540 | trial_etc_block.set_inten_table(1, pred_inten); |
541 | |
542 | color_rgba trial_colors[16]; |
543 | unpack_etc1(trial_etc_block, trial_colors); |
544 | |
545 | uint64_t trial_err = 0; |
546 | if (r.get_params().m_perceptual) |
547 | { |
548 | for (uint32_t p = 0; p < 16; p++) |
549 | { |
550 | trial_err += color_distance(true, src_pixels.get_ptr()[p], trial_colors[p], false); |
551 | if (trial_err > thresh_err) |
552 | break; |
553 | } |
554 | } |
555 | else |
556 | { |
557 | for (uint32_t p = 0; p < 16; p++) |
558 | { |
559 | trial_err += color_distance(false, src_pixels.get_ptr()[p], trial_colors[p], false); |
560 | if (trial_err > thresh_err) |
561 | break; |
562 | } |
563 | } |
564 | |
565 | if (trial_err <= thresh_err) |
566 | { |
567 | if ((trial_err < best_err) || ((trial_err == best_err) && (endpoint_pred < best_endpoint_pred))) |
568 | { |
569 | best_endpoint_pred = endpoint_pred; |
570 | best_err = trial_err; |
571 | best_endpoint_index = pred_endpoint_index; |
572 | } |
573 | } |
574 | } // endpoint_pred |
575 | |
576 | if (best_endpoint_pred != UINT32_MAX) |
577 | { |
578 | m.m_endpoint_index = best_endpoint_index; |
579 | m.m_endpoint_predictor = best_endpoint_pred; |
580 | |
581 | total_endpoint_pred_hits++; |
582 | total_block_endpoints_remapped++; |
583 | } |
584 | else |
585 | { |
586 | total_endpoint_pred_missed++; |
587 | } |
588 | } |
589 | } |
590 | else |
591 | { |
592 | total_endpoint_pred_missed++; |
593 | } |
594 | |
595 | if (m.m_endpoint_predictor == basist::NO_ENDPOINT_PRED_INDEX) |
596 | { |
597 | all_endpoint_indices.push_back(m.m_endpoint_index); |
598 | } |
599 | |
600 | } // block_x |
601 | |
602 | } // block_y |
603 | |
604 | } // slice |
605 | |
606 | debug_printf("total_endpoint_pred_missed: %u (%3.2f%%) total_endpoint_pred_hit: %u (%3.2f%%), total_block_endpoints_remapped: %u (%3.2f%%)\n" , |
607 | total_endpoint_pred_missed, total_endpoint_pred_missed * 100.0f / get_total_blocks(), |
608 | total_endpoint_pred_hits, total_endpoint_pred_hits * 100.0f / get_total_blocks(), |
609 | total_block_endpoints_remapped, total_block_endpoints_remapped * 100.0f / get_total_blocks()); |
610 | |
611 | reoptimize_and_sort_endpoints_codebook(total_block_endpoints_remapped, all_endpoint_indices); |
612 | |
613 | sort_selector_codebook(); |
614 | check_for_valid_cr_blocks(); |
615 | |
616 | debug_printf("Elapsed time: %3.3f secs\n" , tm.get_elapsed_secs()); |
617 | } |
618 | |
619 | void basisu_backend::compute_slice_crcs() |
620 | { |
621 | for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++) |
622 | { |
623 | //const uint32_t first_block_index = m_slices[slice_index].m_first_block_index; |
624 | const uint32_t width = m_slices[slice_index].m_width; |
625 | const uint32_t height = m_slices[slice_index].m_height; |
626 | const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x; |
627 | const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y; |
628 | |
629 | gpu_image gi; |
630 | gi.init(texture_format::cETC1, width, height); |
631 | |
632 | for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) |
633 | { |
634 | for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) |
635 | { |
636 | //const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x; |
637 | |
638 | encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y); |
639 | |
640 | { |
641 | etc_block& output_block = *(etc_block*)gi.get_block_ptr(block_x, block_y); |
642 | |
643 | output_block.set_diff_bit(true); |
						// Setting the flip bit to false to be compatible with the Khronos Data Format Specification (KDFS).
645 | //output_block.set_flip_bit(true); |
646 | output_block.set_flip_bit(false); |
647 | |
648 | const uint32_t endpoint_index = m.m_endpoint_index; |
649 | |
650 | output_block.set_block_color5_etc1s(m_endpoint_palette[endpoint_index].m_color5); |
651 | output_block.set_inten_tables_etc1s(m_endpoint_palette[endpoint_index].m_inten5); |
652 | |
653 | const uint32_t selector_idx = m.m_selector_index; |
654 | |
655 | const etc1_selector_palette_entry& selectors = m_selector_palette[selector_idx]; |
656 | for (uint32_t sy = 0; sy < 4; sy++) |
657 | for (uint32_t sx = 0; sx < 4; sx++) |
658 | output_block.set_selector(sx, sy, selectors(sx, sy)); |
659 | } |
660 | |
661 | } // block_x |
662 | } // block_y |
663 | |
664 | m_output.m_slice_image_crcs[slice_index] = basist::crc16(gi.get_ptr(), gi.get_size_in_bytes(), 0); |
665 | |
666 | if (m_params.m_debug_images) |
667 | { |
668 | image gi_unpacked; |
669 | gi.unpack(gi_unpacked); |
670 | |
671 | char buf[256]; |
672 | #ifdef _WIN32 |
				sprintf_s(buf, sizeof(buf), "basisu_backend_slice_%u.png", slice_index);
#else
				snprintf(buf, sizeof(buf), "basisu_backend_slice_%u.png", slice_index);
676 | #endif |
677 | save_png(buf, gi_unpacked); |
678 | } |
679 | |
680 | } // slice_index |
681 | } |
682 | |
683 | //uint32_t g_color_delta_hist[255 * 3 + 1]; |
684 | //uint32_t g_color_delta_bad_hist[255 * 3 + 1]; |
685 | |
686 | // TODO: Split this into multiple methods. |
687 | bool basisu_backend::encode_image() |
688 | { |
689 | basisu_frontend& r = *m_pFront_end; |
690 | const bool is_video = r.get_params().m_tex_type == basist::cBASISTexTypeVideoFrames; |
691 | |
692 | uint32_t total_used_selector_history_buf = 0; |
693 | uint32_t total_selector_indices_remapped = 0; |
694 | |
695 | basist::approx_move_to_front selector_history_buf(basist::MAX_SELECTOR_HISTORY_BUF_SIZE); |
696 | histogram selector_history_buf_histogram(basist::MAX_SELECTOR_HISTORY_BUF_SIZE); |
697 | histogram selector_histogram(r.get_total_selector_clusters() + basist::MAX_SELECTOR_HISTORY_BUF_SIZE + 1); |
698 | histogram selector_history_buf_rle_histogram(1 << basist::SELECTOR_HISTORY_BUF_RLE_COUNT_BITS); |
699 | |
700 | basisu::vector<uint_vec> selector_syms(m_slices.size()); |
701 | |
702 | const uint32_t SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX = r.get_total_selector_clusters(); |
703 | const uint32_t SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX = SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX + basist::MAX_SELECTOR_HISTORY_BUF_SIZE; |
704 | |
705 | m_output.m_slice_image_crcs.resize(m_slices.size()); |
706 | |
707 | histogram delta_endpoint_histogram(r.get_total_endpoint_clusters()); |
708 | |
709 | histogram endpoint_pred_histogram(basist::ENDPOINT_PRED_TOTAL_SYMBOLS); |
710 | basisu::vector<uint_vec> endpoint_pred_syms(m_slices.size()); |
711 | |
712 | uint32_t total_endpoint_indices_remapped = 0; |
713 | |
714 | uint_vec block_endpoint_indices, block_selector_indices; |
715 | |
716 | interval_timer tm; |
717 | tm.start(); |
718 | |
719 | const int COLOR_DELTA_THRESH = 8; |
720 | const int SEL_DIFF_THRESHOLD = 11; |
721 | |
722 | for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++) |
723 | { |
724 | //const int prev_frame_slice_index = is_video ? find_video_frame(slice_index, -1) : -1; |
725 | //const int next_frame_slice_index = is_video ? find_video_frame(slice_index, 1) : -1; |
726 | const uint32_t first_block_index = m_slices[slice_index].m_first_block_index; |
727 | //const uint32_t width = m_slices[slice_index].m_width; |
728 | //const uint32_t height = m_slices[slice_index].m_height; |
729 | const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x; |
730 | const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y; |
731 | |
732 | selector_history_buf.reset(); |
733 | |
734 | int selector_history_buf_rle_count = 0; |
735 | |
736 | int prev_endpoint_pred_sym_bits = -1, endpoint_pred_repeat_count = 0; |
737 | |
738 | uint32_t prev_endpoint_index = 0; |
739 | |
740 | vector2D<uint8_t> block_endpoints_are_referenced(num_blocks_x, num_blocks_y); |
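
				// First pass over the slice: mark every block whose endpoint index is referenced by a
				// neighbor's spatial predictor (or, for video, whose endpoints/selectors are reused by a
				// CR block in the next frame). The endpoint RDO remapping below must leave these blocks'
				// endpoint indices untouched.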
741 | |
742 | for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) |
743 | { |
744 | for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) |
745 | { |
746 | //const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x; |
747 | |
748 | encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y); |
749 | |
750 | if (m.m_endpoint_predictor == 0) |
751 | block_endpoints_are_referenced(block_x - 1, block_y) = true; |
752 | else if (m.m_endpoint_predictor == 1) |
753 | block_endpoints_are_referenced(block_x, block_y - 1) = true; |
754 | else if (m.m_endpoint_predictor == 2) |
755 | { |
756 | if (!is_video) |
757 | block_endpoints_are_referenced(block_x - 1, block_y - 1) = true; |
758 | } |
759 | if (is_video) |
760 | { |
761 | if (m.m_is_cr_target) |
762 | block_endpoints_are_referenced(block_x, block_y) = true; |
763 | } |
764 | |
765 | } // block_x |
766 | } // block_y |
767 | |
768 | for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) |
769 | { |
770 | for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) |
771 | { |
772 | const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x; |
773 | |
774 | encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y); |
775 | |
776 | if (((block_x & 1) == 0) && ((block_y & 1) == 0)) |
777 | { |
778 | uint32_t endpoint_pred_cur_sym_bits = 0; |
779 | |
780 | for (uint32_t y = 0; y < 2; y++) |
781 | { |
782 | for (uint32_t x = 0; x < 2; x++) |
783 | { |
784 | const uint32_t bx = block_x + x; |
785 | const uint32_t by = block_y + y; |
786 | |
787 | uint32_t pred = basist::NO_ENDPOINT_PRED_INDEX; |
788 | if ((bx < num_blocks_x) && (by < num_blocks_y)) |
789 | pred = m_slice_encoder_blocks[slice_index](bx, by).m_endpoint_predictor; |
790 | |
791 | endpoint_pred_cur_sym_bits |= (pred << (x * 2 + y * 4)); |
792 | } |
793 | } |
794 | |
795 | if ((int)endpoint_pred_cur_sym_bits == prev_endpoint_pred_sym_bits) |
796 | { |
797 | endpoint_pred_repeat_count++; |
798 | } |
799 | else |
800 | { |
801 | if (endpoint_pred_repeat_count > 0) |
802 | { |
803 | if (endpoint_pred_repeat_count > (int)basist::ENDPOINT_PRED_MIN_REPEAT_COUNT) |
804 | { |
805 | endpoint_pred_histogram.inc(basist::ENDPOINT_PRED_REPEAT_LAST_SYMBOL); |
806 | endpoint_pred_syms[slice_index].push_back(basist::ENDPOINT_PRED_REPEAT_LAST_SYMBOL); |
807 | |
808 | endpoint_pred_syms[slice_index].push_back(endpoint_pred_repeat_count); |
809 | } |
810 | else |
811 | { |
812 | for (int j = 0; j < endpoint_pred_repeat_count; j++) |
813 | { |
814 | endpoint_pred_histogram.inc(prev_endpoint_pred_sym_bits); |
815 | endpoint_pred_syms[slice_index].push_back(prev_endpoint_pred_sym_bits); |
816 | } |
817 | } |
818 | |
819 | endpoint_pred_repeat_count = 0; |
820 | } |
821 | |
822 | endpoint_pred_histogram.inc(endpoint_pred_cur_sym_bits); |
823 | endpoint_pred_syms[slice_index].push_back(endpoint_pred_cur_sym_bits); |
824 | |
825 | prev_endpoint_pred_sym_bits = endpoint_pred_cur_sym_bits; |
826 | } |
827 | } |
828 | |
829 | int new_endpoint_index = m_endpoint_remap_table_old_to_new[m.m_endpoint_index]; |
830 | |
831 | if (m.m_endpoint_predictor == basist::NO_ENDPOINT_PRED_INDEX) |
832 | { |
833 | int endpoint_delta = new_endpoint_index - prev_endpoint_index; |
834 | |
835 | if ((m_params.m_endpoint_rdo_quality_thresh > 1.0f) && (iabs(endpoint_delta) > 1) && (!block_endpoints_are_referenced(block_x, block_y))) |
836 | { |
837 | const pixel_block& src_pixels = r.get_source_pixel_block(block_index); |
838 | |
839 | etc_block etc_blk(r.get_output_block(block_index)); |
840 | |
841 | const uint64_t cur_err = etc_blk.evaluate_etc1_error(src_pixels.get_ptr(), r.get_params().m_perceptual); |
842 | const uint32_t cur_inten5 = etc_blk.get_inten_table(0); |
843 | |
844 | const etc1_endpoint_palette_entry& cur_endpoints = m_endpoint_palette[m.m_endpoint_index]; |
845 | |
846 | if (cur_err) |
847 | { |
848 | const float endpoint_remap_thresh = maximum(1.0f, m_params.m_endpoint_rdo_quality_thresh); |
849 | const uint64_t thresh_err = (uint64_t)(cur_err * endpoint_remap_thresh); |
850 | |
851 | //const int MAX_ENDPOINT_SEARCH_DIST = (m_params.m_compression_level >= 2) ? 64 : 32; |
852 | const int MAX_ENDPOINT_SEARCH_DIST = (m_params.m_compression_level >= 2) ? 64 : 16; |
853 | |
854 | if (!g_cpu_supports_sse41) |
855 | { |
856 | const uint64_t initial_best_trial_err = UINT64_MAX; |
857 | uint64_t best_trial_err = initial_best_trial_err; |
858 | int best_trial_idx = 0; |
859 | |
860 | etc_block trial_etc_blk(etc_blk); |
861 | |
862 | const int search_dist = minimum<int>(iabs(endpoint_delta) - 1, MAX_ENDPOINT_SEARCH_DIST); |
863 | for (int d = -search_dist; d < search_dist; d++) |
864 | { |
865 | int trial_idx = prev_endpoint_index + d; |
866 | if (trial_idx < 0) |
867 | trial_idx += (int)r.get_total_endpoint_clusters(); |
868 | else if (trial_idx >= (int)r.get_total_endpoint_clusters()) |
869 | trial_idx -= (int)r.get_total_endpoint_clusters(); |
870 | |
871 | if (trial_idx == new_endpoint_index) |
872 | continue; |
873 | |
874 | // Skip it if this new endpoint palette entry is actually never used. |
875 | if (!m_new_endpoint_was_used[trial_idx]) |
876 | continue; |
877 | |
878 | const etc1_endpoint_palette_entry& p = m_endpoint_palette[m_endpoint_remap_table_new_to_old[trial_idx]]; |
879 | |
880 | if (m_params.m_compression_level <= 1) |
881 | { |
882 | if (p.m_inten5 > cur_inten5) |
883 | continue; |
884 | |
885 | int delta_r = iabs(cur_endpoints.m_color5.r - p.m_color5.r); |
886 | int delta_g = iabs(cur_endpoints.m_color5.g - p.m_color5.g); |
887 | int delta_b = iabs(cur_endpoints.m_color5.b - p.m_color5.b); |
888 | int color_delta = delta_r + delta_g + delta_b; |
889 | |
890 | if (color_delta > COLOR_DELTA_THRESH) |
891 | continue; |
892 | } |
893 | |
894 | trial_etc_blk.set_block_color5_etc1s(p.m_color5); |
895 | trial_etc_blk.set_inten_tables_etc1s(p.m_inten5); |
896 | |
897 | uint64_t trial_err = trial_etc_blk.evaluate_etc1_error(src_pixels.get_ptr(), r.get_params().m_perceptual); |
898 | |
899 | if ((trial_err < best_trial_err) && (trial_err <= thresh_err)) |
900 | { |
901 | best_trial_err = trial_err; |
902 | best_trial_idx = trial_idx; |
903 | } |
904 | } |
905 | |
906 | if (best_trial_err != initial_best_trial_err) |
907 | { |
908 | m.m_endpoint_index = m_endpoint_remap_table_new_to_old[best_trial_idx]; |
909 | |
910 | new_endpoint_index = best_trial_idx; |
911 | |
912 | endpoint_delta = new_endpoint_index - prev_endpoint_index; |
913 | |
914 | total_endpoint_indices_remapped++; |
915 | } |
916 | } |
917 | else |
918 | { |
919 | #if BASISU_SUPPORT_SSE |
920 | uint8_t block_selectors[16]; |
921 | for (uint32_t i = 0; i < 16; i++) |
922 | block_selectors[i] = (uint8_t)etc_blk.get_selector(i & 3, i >> 2); |
923 | |
924 | const int64_t initial_best_trial_err = INT64_MAX; |
925 | int64_t best_trial_err = initial_best_trial_err; |
926 | int best_trial_idx = 0; |
927 | |
928 | const int search_dist = minimum<int>(iabs(endpoint_delta) - 1, MAX_ENDPOINT_SEARCH_DIST); |
929 | for (int d = -search_dist; d < search_dist; d++) |
930 | { |
931 | int trial_idx = prev_endpoint_index + d; |
932 | if (trial_idx < 0) |
933 | trial_idx += (int)r.get_total_endpoint_clusters(); |
934 | else if (trial_idx >= (int)r.get_total_endpoint_clusters()) |
935 | trial_idx -= (int)r.get_total_endpoint_clusters(); |
936 | |
937 | if (trial_idx == new_endpoint_index) |
938 | continue; |
939 | |
940 | // Skip it if this new endpoint palette entry is actually never used. |
941 | if (!m_new_endpoint_was_used[trial_idx]) |
942 | continue; |
943 | |
944 | const etc1_endpoint_palette_entry& p = m_endpoint_palette[m_endpoint_remap_table_new_to_old[trial_idx]]; |
945 | |
946 | if (m_params.m_compression_level <= 1) |
947 | { |
948 | if (p.m_inten5 > cur_inten5) |
949 | continue; |
950 | |
951 | int delta_r = iabs(cur_endpoints.m_color5.r - p.m_color5.r); |
952 | int delta_g = iabs(cur_endpoints.m_color5.g - p.m_color5.g); |
953 | int delta_b = iabs(cur_endpoints.m_color5.b - p.m_color5.b); |
954 | int color_delta = delta_r + delta_g + delta_b; |
955 | |
956 | if (color_delta > COLOR_DELTA_THRESH) |
957 | continue; |
958 | } |
959 | |
960 | color_rgba block_colors[4]; |
961 | etc_block::get_block_colors_etc1s(block_colors, p.m_color5, p.m_inten5); |
962 | |
963 | int64_t trial_err; |
964 | if (r.get_params().m_perceptual) |
965 | { |
966 | perceptual_distance_rgb_4_N_sse41(&trial_err, block_selectors, block_colors, src_pixels.get_ptr(), 16, best_trial_err); |
967 | } |
968 | else |
969 | { |
970 | linear_distance_rgb_4_N_sse41(&trial_err, block_selectors, block_colors, src_pixels.get_ptr(), 16, best_trial_err); |
971 | } |
972 | |
973 | //if (trial_err > thresh_err) |
974 | // g_color_delta_bad_hist[color_delta]++; |
975 | |
976 | if ((trial_err < best_trial_err) && (trial_err <= (int64_t)thresh_err)) |
977 | { |
978 | best_trial_err = trial_err; |
979 | best_trial_idx = trial_idx; |
980 | } |
981 | } |
982 | |
983 | if (best_trial_err != initial_best_trial_err) |
984 | { |
985 | m.m_endpoint_index = m_endpoint_remap_table_new_to_old[best_trial_idx]; |
986 | |
987 | new_endpoint_index = best_trial_idx; |
988 | |
989 | endpoint_delta = new_endpoint_index - prev_endpoint_index; |
990 | |
991 | total_endpoint_indices_remapped++; |
992 | } |
993 | #endif // BASISU_SUPPORT_SSE |
994 | } // if (!g_cpu_supports_sse41) |
995 | |
996 | } // if (cur_err) |
997 | |
998 | } // if ((m_params.m_endpoint_rdo_quality_thresh > 1.0f) && (iabs(endpoint_delta) > 1) && (!block_endpoints_are_referenced(block_x, block_y))) |
999 | |
1000 | if (endpoint_delta < 0) |
1001 | endpoint_delta += (int)r.get_total_endpoint_clusters(); |
1002 | |
1003 | delta_endpoint_histogram.inc(endpoint_delta); |
1004 | |
1005 | } // if (m.m_endpoint_predictor == basist::NO_ENDPOINT_PRED_INDEX) |
1006 | |
1007 | block_endpoint_indices.push_back(m_endpoint_remap_table_new_to_old[new_endpoint_index]); |
1008 | |
1009 | prev_endpoint_index = new_endpoint_index; |
1010 | |
1011 | if ((!is_video) || (m.m_endpoint_predictor != basist::CR_ENDPOINT_PRED_INDEX)) |
1012 | { |
1013 | int new_selector_index = m_selector_remap_table_old_to_new[m.m_selector_index]; |
1014 | |
1015 | const float selector_remap_thresh = maximum(1.0f, m_params.m_selector_rdo_quality_thresh); //2.5f; |
1016 | |
1017 | int selector_history_buf_index = -1; |
1018 | |
1019 | // At low comp levels this hurts compression a tiny amount, but is significantly faster so it's a good tradeoff. |
1020 | if ((m.m_is_cr_target) || (m_params.m_compression_level <= 1)) |
1021 | { |
1022 | for (uint32_t j = 0; j < selector_history_buf.size(); j++) |
1023 | { |
1024 | const int trial_idx = selector_history_buf[j]; |
1025 | if (trial_idx == new_selector_index) |
1026 | { |
1027 | total_used_selector_history_buf++; |
1028 | selector_history_buf_index = j; |
1029 | selector_history_buf_histogram.inc(j); |
1030 | break; |
1031 | } |
1032 | } |
1033 | } |
1034 | |
1035 | // If the block is a CR target we can't override its selectors. |
1036 | if ((!m.m_is_cr_target) && (selector_history_buf_index == -1)) |
1037 | { |
1038 | const pixel_block& src_pixels = r.get_source_pixel_block(block_index); |
1039 | |
1040 | etc_block etc_blk = r.get_output_block(block_index); |
1041 | |
1042 | // This is new code - the initial release just used the endpoints from the frontend, which isn't correct/accurate. |
1043 | const etc1_endpoint_palette_entry& q = m_endpoint_palette[m_endpoint_remap_table_new_to_old[new_endpoint_index]]; |
1044 | etc_blk.set_block_color5_etc1s(q.m_color5); |
1045 | etc_blk.set_inten_tables_etc1s(q.m_inten5); |
1046 | |
1047 | color_rgba block_colors[4]; |
1048 | etc_blk.get_block_colors(block_colors, 0); |
1049 | |
1050 | const uint8_t* pCur_selectors = &m_selector_palette[m.m_selector_index][0]; |
1051 | |
1052 | uint64_t cur_err = 0; |
1053 | if (r.get_params().m_perceptual) |
1054 | { |
1055 | for (uint32_t p = 0; p < 16; p++) |
1056 | cur_err += color_distance(true, src_pixels.get_ptr()[p], block_colors[pCur_selectors[p]], false); |
1057 | } |
1058 | else |
1059 | { |
1060 | for (uint32_t p = 0; p < 16; p++) |
1061 | cur_err += color_distance(false, src_pixels.get_ptr()[p], block_colors[pCur_selectors[p]], false); |
1062 | } |
1063 | |
1064 | const uint64_t limit_err = (uint64_t)ceilf(cur_err * selector_remap_thresh); |
1065 | |
1066 | // Even if cur_err==limit_err, we still want to scan the history buffer because there may be equivalent entries that are cheaper to code. |
1067 | |
1068 | uint64_t best_trial_err = UINT64_MAX; |
1069 | int best_trial_idx = 0; |
1070 | uint32_t best_trial_history_buf_idx = 0; |
1071 | |
1072 | for (uint32_t j = 0; j < selector_history_buf.size(); j++) |
1073 | { |
1074 | const int trial_idx = selector_history_buf[j]; |
1075 | |
1076 | const uint8_t* pSelectors = &m_selector_palette[m_selector_remap_table_new_to_old[trial_idx]][0]; |
1077 | |
1078 | if (m_params.m_compression_level <= 1) |
1079 | { |
1080 | // Predict if evaluating the full color error would cause an early out, by summing the abs err of the selector indices. |
1081 | int sel_diff = 0; |
1082 | for (uint32_t p = 0; p < 16; p += 4) |
1083 | { |
1084 | sel_diff += iabs(pCur_selectors[p + 0] - pSelectors[p + 0]); |
1085 | sel_diff += iabs(pCur_selectors[p + 1] - pSelectors[p + 1]); |
1086 | sel_diff += iabs(pCur_selectors[p + 2] - pSelectors[p + 2]); |
1087 | sel_diff += iabs(pCur_selectors[p + 3] - pSelectors[p + 3]); |
1088 | if (sel_diff >= SEL_DIFF_THRESHOLD) |
1089 | break; |
1090 | } |
1091 | if (sel_diff >= SEL_DIFF_THRESHOLD) |
1092 | continue; |
1093 | } |
1094 | |
1095 | const uint64_t thresh_err = minimum(limit_err, best_trial_err); |
1096 | uint64_t trial_err = 0; |
1097 | |
1098 | // This tends to early out quickly, so SSE has a hard time competing. |
1099 | if (r.get_params().m_perceptual) |
1100 | { |
1101 | for (uint32_t p = 0; p < 16; p++) |
1102 | { |
1103 | uint32_t sel = pSelectors[p]; |
1104 | trial_err += color_distance(true, src_pixels.get_ptr()[p], block_colors[sel], false); |
1105 | if (trial_err > thresh_err) |
1106 | break; |
1107 | } |
1108 | } |
1109 | else |
1110 | { |
1111 | for (uint32_t p = 0; p < 16; p++) |
1112 | { |
1113 | uint32_t sel = pSelectors[p]; |
1114 | trial_err += color_distance(false, src_pixels.get_ptr()[p], block_colors[sel], false); |
1115 | if (trial_err > thresh_err) |
1116 | break; |
1117 | } |
1118 | } |
1119 | |
1120 | if ((trial_err < best_trial_err) && (trial_err <= thresh_err)) |
1121 | { |
1122 | assert(trial_err <= limit_err); |
1123 | |
1124 | best_trial_err = trial_err; |
1125 | best_trial_idx = trial_idx; |
1126 | best_trial_history_buf_idx = j; |
1127 | } |
1128 | } |
1129 | |
1130 | if (best_trial_err != UINT64_MAX) |
1131 | { |
1132 | if (new_selector_index != best_trial_idx) |
1133 | total_selector_indices_remapped++; |
1134 | |
1135 | new_selector_index = best_trial_idx; |
1136 | |
1137 | total_used_selector_history_buf++; |
1138 | |
1139 | selector_history_buf_index = best_trial_history_buf_idx; |
1140 | |
1141 | selector_history_buf_histogram.inc(best_trial_history_buf_idx); |
1142 | } |
1143 | |
						} // if ((!m.m_is_cr_target) && (selector_history_buf_index == -1))
1145 | |
1146 | m.m_selector_index = m_selector_remap_table_new_to_old[new_selector_index]; |
1147 | |
1148 | |
1149 | if ((selector_history_buf_rle_count) && (selector_history_buf_index != 0)) |
1150 | { |
1151 | if (selector_history_buf_rle_count >= (int)basist::SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH) |
1152 | { |
1153 | selector_syms[slice_index].push_back(SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX); |
1154 | selector_syms[slice_index].push_back(selector_history_buf_rle_count); |
1155 | |
1156 | int run_sym = selector_history_buf_rle_count - basist::SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH; |
1157 | if (run_sym >= ((int)basist::SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL - 1)) |
1158 | selector_history_buf_rle_histogram.inc(basist::SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL - 1); |
1159 | else |
1160 | selector_history_buf_rle_histogram.inc(run_sym); |
1161 | |
1162 | selector_histogram.inc(SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX); |
1163 | } |
1164 | else |
1165 | { |
1166 | for (int k = 0; k < selector_history_buf_rle_count; k++) |
1167 | { |
1168 | uint32_t sym_index = SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX + 0; |
1169 | |
1170 | selector_syms[slice_index].push_back(sym_index); |
1171 | |
1172 | selector_histogram.inc(sym_index); |
1173 | } |
1174 | } |
1175 | |
1176 | selector_history_buf_rle_count = 0; |
1177 | } |
1178 | |
1179 | if (selector_history_buf_index >= 0) |
1180 | { |
1181 | if (selector_history_buf_index == 0) |
1182 | selector_history_buf_rle_count++; |
1183 | else |
1184 | { |
1185 | uint32_t history_buf_sym = SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX + selector_history_buf_index; |
1186 | |
1187 | selector_syms[slice_index].push_back(history_buf_sym); |
1188 | |
1189 | selector_histogram.inc(history_buf_sym); |
1190 | } |
1191 | } |
1192 | else |
1193 | { |
1194 | selector_syms[slice_index].push_back(new_selector_index); |
1195 | |
1196 | selector_histogram.inc(new_selector_index); |
1197 | } |
1198 | |
1199 | m.m_selector_history_buf_index = selector_history_buf_index; |
1200 | |
1201 | if (selector_history_buf_index < 0) |
1202 | selector_history_buf.add(new_selector_index); |
1203 | else if (selector_history_buf.size()) |
1204 | selector_history_buf.use(selector_history_buf_index); |
1205 | } |
1206 | block_selector_indices.push_back(m.m_selector_index); |
1207 | |
1208 | } // block_x |
1209 | |
1210 | } // block_y |
1211 | |
1212 | if (endpoint_pred_repeat_count > 0) |
1213 | { |
1214 | if (endpoint_pred_repeat_count > (int)basist::ENDPOINT_PRED_MIN_REPEAT_COUNT) |
1215 | { |
1216 | endpoint_pred_histogram.inc(basist::ENDPOINT_PRED_REPEAT_LAST_SYMBOL); |
1217 | endpoint_pred_syms[slice_index].push_back(basist::ENDPOINT_PRED_REPEAT_LAST_SYMBOL); |
1218 | |
1219 | endpoint_pred_syms[slice_index].push_back(endpoint_pred_repeat_count); |
1220 | } |
1221 | else |
1222 | { |
1223 | for (int j = 0; j < endpoint_pred_repeat_count; j++) |
1224 | { |
1225 | endpoint_pred_histogram.inc(prev_endpoint_pred_sym_bits); |
1226 | endpoint_pred_syms[slice_index].push_back(prev_endpoint_pred_sym_bits); |
1227 | } |
1228 | } |
1229 | |
1230 | endpoint_pred_repeat_count = 0; |
1231 | } |
1232 | |
1233 | if (selector_history_buf_rle_count) |
1234 | { |
1235 | if (selector_history_buf_rle_count >= (int)basist::SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH) |
1236 | { |
1237 | selector_syms[slice_index].push_back(SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX); |
1238 | selector_syms[slice_index].push_back(selector_history_buf_rle_count); |
1239 | |
1240 | int run_sym = selector_history_buf_rle_count - basist::SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH; |
1241 | if (run_sym >= ((int)basist::SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL - 1)) |
1242 | selector_history_buf_rle_histogram.inc(basist::SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL - 1); |
1243 | else |
1244 | selector_history_buf_rle_histogram.inc(run_sym); |
1245 | |
1246 | selector_histogram.inc(SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX); |
1247 | } |
1248 | else |
1249 | { |
1250 | for (int i = 0; i < selector_history_buf_rle_count; i++) |
1251 | { |
1252 | uint32_t sym_index = SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX + 0; |
1253 | |
1254 | selector_syms[slice_index].push_back(sym_index); |
1255 | |
1256 | selector_histogram.inc(sym_index); |
1257 | } |
1258 | } |
1259 | |
1260 | selector_history_buf_rle_count = 0; |
1261 | } |
1262 | |
1263 | } // slice_index |
1264 | |
1265 | //for (int i = 0; i <= 255 * 3; i++) |
1266 | //{ |
1267 | // printf("%u, %u, %f\n", g_color_delta_bad_hist[i], g_color_delta_hist[i], g_color_delta_hist[i] ? g_color_delta_bad_hist[i] / (float)g_color_delta_hist[i] : 0); |
1268 | //} |
1269 | |
1270 | double total_prep_time = tm.get_elapsed_secs(); |
1271 | debug_printf("basisu_backend::encode_image: Total prep time: %3.2f\n" , total_prep_time); |
1272 | |
1273 | debug_printf("Endpoint pred RDO total endpoint indices remapped: %u %3.2f%%\n" , |
1274 | total_endpoint_indices_remapped, total_endpoint_indices_remapped * 100.0f / get_total_blocks()); |
1275 | |
1276 | debug_printf("Selector history RDO total selector indices remapped: %u %3.2f%%, Used history buf: %u %3.2f%%\n" , |
1277 | total_selector_indices_remapped, total_selector_indices_remapped * 100.0f / get_total_blocks(), |
1278 | total_used_selector_history_buf, total_used_selector_history_buf * 100.0f / get_total_blocks()); |
1279 | |
1280 | //if ((total_endpoint_indices_remapped) && (m_params.m_compression_level > 0)) |
1281 | if ((total_endpoint_indices_remapped) && (m_params.m_compression_level > 1) && (!m_params.m_used_global_codebooks)) |
1282 | { |
1283 | int_vec unused; |
1284 | r.reoptimize_remapped_endpoints(block_endpoint_indices, unused, false, &block_selector_indices); |
1285 | |
1286 | create_endpoint_palette(); |
1287 | } |
1288 | |
1289 | check_for_valid_cr_blocks(); |
1290 | compute_slice_crcs(); |
1291 | |
1292 | double endpoint_pred_entropy = endpoint_pred_histogram.get_entropy() / endpoint_pred_histogram.get_total(); |
1293 | double delta_endpoint_entropy = delta_endpoint_histogram.get_entropy() / delta_endpoint_histogram.get_total(); |
1294 | double selector_entropy = selector_histogram.get_entropy() / selector_histogram.get_total(); |
1295 | |
1296 | debug_printf("Histogram entropy: EndpointPred: %3.3f DeltaEndpoint: %3.3f DeltaSelector: %3.3f\n" , endpoint_pred_entropy, delta_endpoint_entropy, selector_entropy); |
1297 | |
1298 | if (!endpoint_pred_histogram.get_total()) |
1299 | endpoint_pred_histogram.inc(0); |
1300 | huffman_encoding_table endpoint_pred_model; |
1301 | if (!endpoint_pred_model.init(endpoint_pred_histogram, 16)) |
1302 | { |
1303 | error_printf("endpoint_pred_model.init() failed!" ); |
1304 | return false; |
1305 | } |
1306 | |
1307 | if (!delta_endpoint_histogram.get_total()) |
1308 | delta_endpoint_histogram.inc(0); |
1309 | huffman_encoding_table delta_endpoint_model; |
1310 | if (!delta_endpoint_model.init(delta_endpoint_histogram, 16)) |
1311 | { |
1312 | error_printf("delta_endpoint_model.init() failed!" ); |
1313 | return false; |
1314 | } |
1315 | if (!selector_histogram.get_total()) |
1316 | selector_histogram.inc(0); |
1317 | |
1318 | huffman_encoding_table selector_model; |
1319 | if (!selector_model.init(selector_histogram, 16)) |
1320 | { |
1321 | error_printf("selector_model.init() failed!" ); |
1322 | return false; |
1323 | } |
1324 | |
1325 | if (!selector_history_buf_rle_histogram.get_total()) |
1326 | selector_history_buf_rle_histogram.inc(0); |
1327 | |
1328 | huffman_encoding_table selector_history_buf_rle_model; |
1329 | if (!selector_history_buf_rle_model.init(selector_history_buf_rle_histogram, 16)) |
1330 | { |
1331 | error_printf("selector_history_buf_rle_model.init() failed!" ); |
1332 | return false; |
1333 | } |
1334 | |
1335 | bitwise_coder coder; |
1336 | coder.init(1024 * 1024 * 4); |
1337 | |
1338 | uint32_t endpoint_pred_model_bits = coder.emit_huffman_table(endpoint_pred_model); |
1339 | uint32_t delta_endpoint_bits = coder.emit_huffman_table(delta_endpoint_model); |
1340 | uint32_t selector_model_bits = coder.emit_huffman_table(selector_model); |
1341 | uint32_t selector_history_buf_run_sym_bits = coder.emit_huffman_table(selector_history_buf_rle_model); |
1342 | |
1343 | coder.put_bits(basist::MAX_SELECTOR_HISTORY_BUF_SIZE, 13); |
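		// Write the selector history buffer size (13 bits) so the transcoder can size its buffer to match.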
1344 | |
1345 | debug_printf("Model sizes: EndpointPred: %u bits %u bytes (%3.3f bpp) DeltaEndpoint: %u bits %u bytes (%3.3f bpp) Selector: %u bits %u bytes (%3.3f bpp) SelectorHistBufRLE: %u bits %u bytes (%3.3f bpp)\n" , |
1346 | endpoint_pred_model_bits, (endpoint_pred_model_bits + 7) / 8, endpoint_pred_model_bits / float(get_total_input_texels()), |
1347 | delta_endpoint_bits, (delta_endpoint_bits + 7) / 8, delta_endpoint_bits / float(get_total_input_texels()), |
1348 | selector_model_bits, (selector_model_bits + 7) / 8, selector_model_bits / float(get_total_input_texels()), |
1349 | selector_history_buf_run_sym_bits, (selector_history_buf_run_sym_bits + 7) / 8, selector_history_buf_run_sym_bits / float(get_total_input_texels())); |
1350 | |
1351 | coder.flush(); |
1352 | |
1353 | m_output.m_slice_image_tables = coder.get_bytes(); |
1354 | |
1355 | uint32_t total_endpoint_pred_bits = 0, total_delta_endpoint_bits = 0, total_selector_bits = 0; |
1356 | |
1357 | uint32_t total_image_bytes = 0; |
1358 | |
1359 | m_output.m_slice_image_data.resize(m_slices.size()); |
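		// Second pass: walk the blocks in the same order as before, consuming the recorded
		// endpoint-pred and selector symbol streams and emitting the Huffman-coded bits per slice.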
1360 | |
1361 | for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++) |
1362 | { |
1363 | //const uint32_t width = m_slices[slice_index].m_width; |
1364 | //const uint32_t height = m_slices[slice_index].m_height; |
1365 | const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x; |
1366 | const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y; |
1367 | |
1368 | coder.init(1024 * 1024 * 4); |
1369 | |
1370 | uint32_t cur_selector_sym_ofs = 0; |
1371 | uint32_t selector_rle_count = 0; |
1372 | |
1373 | int endpoint_pred_repeat_count = 0; |
1374 | uint32_t cur_endpoint_pred_sym_ofs = 0; |
1375 | // uint32_t prev_endpoint_pred_sym = 0; |
1376 | uint32_t prev_endpoint_index = 0; |
1377 | |
1378 | for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) |
1379 | { |
1380 | for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) |
1381 | { |
1382 | const encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y); |
1383 | |
1384 | if (((block_x & 1) == 0) && ((block_y & 1) == 0)) |
1385 | { |
1386 | if (endpoint_pred_repeat_count > 0) |
1387 | { |
1388 | endpoint_pred_repeat_count--; |
1389 | } |
1390 | else |
1391 | { |
1392 | uint32_t sym = endpoint_pred_syms[slice_index][cur_endpoint_pred_sym_ofs++]; |
1393 | |
1394 | if (sym == basist::ENDPOINT_PRED_REPEAT_LAST_SYMBOL) |
1395 | { |
1396 | total_endpoint_pred_bits += coder.put_code(sym, endpoint_pred_model); |
1397 | |
1398 | endpoint_pred_repeat_count = endpoint_pred_syms[slice_index][cur_endpoint_pred_sym_ofs++]; |
1399 | assert(endpoint_pred_repeat_count >= (int)basist::ENDPOINT_PRED_MIN_REPEAT_COUNT); |
1400 | |
1401 | total_endpoint_pred_bits += coder.put_vlc(endpoint_pred_repeat_count - basist::ENDPOINT_PRED_MIN_REPEAT_COUNT, basist::ENDPOINT_PRED_COUNT_VLC_BITS); |
1402 | |
1403 | endpoint_pred_repeat_count--; |
1404 | } |
1405 | else |
1406 | { |
1407 | total_endpoint_pred_bits += coder.put_code(sym, endpoint_pred_model); |
1408 | |
1409 | //prev_endpoint_pred_sym = sym; |
1410 | } |
1411 | } |
1412 | } |
1413 | |
1414 | const int new_endpoint_index = m_endpoint_remap_table_old_to_new[m.m_endpoint_index]; |
1415 | |
1416 | if (m.m_endpoint_predictor == basist::NO_ENDPOINT_PRED_INDEX) |
1417 | { |
1418 | int endpoint_delta = new_endpoint_index - prev_endpoint_index; |
1419 | if (endpoint_delta < 0) |
1420 | endpoint_delta += (int)r.get_total_endpoint_clusters(); |
1421 | |
1422 | total_delta_endpoint_bits += coder.put_code(endpoint_delta, delta_endpoint_model); |
1423 | } |
1424 | |
1425 | prev_endpoint_index = new_endpoint_index; |
1426 | |
1427 | if ((!is_video) || (m.m_endpoint_predictor != basist::CR_ENDPOINT_PRED_INDEX)) |
1428 | { |
1429 | if (!selector_rle_count) |
1430 | { |
1431 | uint32_t selector_sym_index = selector_syms[slice_index][cur_selector_sym_ofs++]; |
1432 | |
1433 | if (selector_sym_index == SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX) |
1434 | selector_rle_count = selector_syms[slice_index][cur_selector_sym_ofs++]; |
1435 | |
1436 | total_selector_bits += coder.put_code(selector_sym_index, selector_model); |
1437 | |
1438 | if (selector_sym_index == SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX) |
1439 | { |
1440 | int run_sym = selector_rle_count - basist::SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH; |
1441 | if (run_sym >= ((int)basist::SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL - 1)) |
1442 | { |
1443 | total_selector_bits += coder.put_code(basist::SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL - 1, selector_history_buf_rle_model); |
1444 | |
1445 | uint32_t n = selector_rle_count - basist::SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH; |
1446 | total_selector_bits += coder.put_vlc(n, 7); |
1447 | } |
1448 | else |
1449 | total_selector_bits += coder.put_code(run_sym, selector_history_buf_rle_model); |
1450 | } |
1451 | } |
1452 | |
1453 | if (selector_rle_count) |
1454 | selector_rle_count--; |
1455 | } |
1456 | |
1457 | } // block_x |
1458 | |
1459 | } // block_y |
1460 | |
1461 | BASISU_BACKEND_VERIFY(cur_endpoint_pred_sym_ofs == endpoint_pred_syms[slice_index].size()); |
1462 | BASISU_BACKEND_VERIFY(cur_selector_sym_ofs == selector_syms[slice_index].size()); |
1463 | |
1464 | coder.flush(); |
1465 | |
1466 | m_output.m_slice_image_data[slice_index] = coder.get_bytes(); |
1467 | |
1468 | total_image_bytes += (uint32_t)coder.get_bytes().size(); |
1469 | |
debug_printf("Slice %u compressed size: %u bytes, %3.3f bits per slice texel\n", slice_index, (uint32_t)m_output.m_slice_image_data[slice_index].size(), m_output.m_slice_image_data[slice_index].size() * 8.0f / (m_slices[slice_index].m_orig_width * m_slices[slice_index].m_orig_height));
1471 | |
1472 | } // slice_index |
1473 | |
1474 | const double total_texels = static_cast<double>(get_total_input_texels()); |
1475 | const double total_blocks = static_cast<double>(get_total_blocks()); |
1476 | |
debug_printf("Total endpoint pred bits: %u bytes: %u bits/texel: %3.3f bits/block: %3.3f\n", total_endpoint_pred_bits, (total_endpoint_pred_bits + 7) / 8, total_endpoint_pred_bits / total_texels, total_endpoint_pred_bits / total_blocks);
debug_printf("Total delta endpoint bits: %u bytes: %u bits/texel: %3.3f bits/block: %3.3f\n", total_delta_endpoint_bits, (total_delta_endpoint_bits + 7) / 8, total_delta_endpoint_bits / total_texels, total_delta_endpoint_bits / total_blocks);
debug_printf("Total selector bits: %u bytes: %u bits/texel: %3.3f bits/block: %3.3f\n", total_selector_bits, (total_selector_bits + 7) / 8, total_selector_bits / total_texels, total_selector_bits / total_blocks);

debug_printf("Total table bytes: %u, %3.3f bits/texel\n", (uint32_t)m_output.m_slice_image_tables.size(), m_output.m_slice_image_tables.size() * 8.0f / total_texels);
debug_printf("Total image bytes: %u, %3.3f bits/texel\n", total_image_bytes, total_image_bytes * 8.0f / total_texels);
1483 | |
1484 | return true; |
1485 | } |
1486 | |
1487 | bool basisu_backend::encode_endpoint_palette() |
1488 | { |
1489 | const basisu_frontend& r = *m_pFront_end; |
1490 | |
1491 | // The endpoint indices may have been changed by the backend's RDO step, so go and figure out which ones are actually used again. |
1492 | bool_vec old_endpoint_was_used(r.get_total_endpoint_clusters()); |
1493 | uint32_t first_old_entry_index = UINT32_MAX; |
1494 | |
1495 | for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++) |
1496 | { |
1497 | const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x, num_blocks_y = m_slices[slice_index].m_num_blocks_y; |
1498 | for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) |
1499 | { |
1500 | for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) |
1501 | { |
1502 | encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y); |
1503 | const uint32_t old_endpoint_index = m.m_endpoint_index; |
1504 | |
1505 | old_endpoint_was_used[old_endpoint_index] = true; |
1506 | first_old_entry_index = basisu::minimum(first_old_entry_index, old_endpoint_index); |
1507 | } // block_x |
1508 | } // block_y |
1509 | } // slice_index |
1510 | |
debug_printf("basisu_backend::encode_endpoint_palette: first_old_entry_index: %u\n", first_old_entry_index);
1512 | |
1513 | // Maps NEW to OLD endpoints |
1514 | uint_vec endpoint_remap_table_new_to_old(r.get_total_endpoint_clusters()); |
1515 | endpoint_remap_table_new_to_old.set_all(first_old_entry_index); |
1516 | |
1517 | bool_vec new_endpoint_was_used(r.get_total_endpoint_clusters()); |
1518 | |
1519 | for (uint32_t old_endpoint_index = 0; old_endpoint_index < m_endpoint_remap_table_old_to_new.size(); old_endpoint_index++) |
1520 | { |
1521 | if (old_endpoint_was_used[old_endpoint_index]) |
1522 | { |
1523 | const uint32_t new_endpoint_index = m_endpoint_remap_table_old_to_new[old_endpoint_index]; |
1524 | |
1525 | new_endpoint_was_used[new_endpoint_index] = true; |
1526 | |
1527 | endpoint_remap_table_new_to_old[new_endpoint_index] = old_endpoint_index; |
1528 | } |
1529 | } |
1530 | |
1531 | // TODO: Some new endpoint palette entries may actually be unused and aren't worth coding. Fix that. |
1532 | |
1533 | uint32_t total_unused_new_entries = 0; |
1534 | for (uint32_t i = 0; i < new_endpoint_was_used.size(); i++) |
1535 | if (!new_endpoint_was_used[i]) |
1536 | total_unused_new_entries++; |
debug_printf("basisu_backend::encode_endpoint_palette: total_unused_new_entries: %u out of %u\n", total_unused_new_entries, (uint32_t)new_endpoint_was_used.size());
1538 | |
1539 | bool is_grayscale = true; |
1540 | for (uint32_t old_endpoint_index = 0; old_endpoint_index < (uint32_t)m_endpoint_palette.size(); old_endpoint_index++) |
1541 | { |
1542 | int r5 = m_endpoint_palette[old_endpoint_index].m_color5[0]; |
1543 | int g5 = m_endpoint_palette[old_endpoint_index].m_color5[1]; |
1544 | int b5 = m_endpoint_palette[old_endpoint_index].m_color5[2]; |
1545 | if ((r5 != g5) || (r5 != b5)) |
1546 | { |
1547 | is_grayscale = false; |
1548 | break; |
1549 | } |
1550 | } |
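// The endpoint palette is written in new (remapped) index order. Each 5-bit color
// component is coded as a wrapped delta against the same component of the previously
// written entry, e.g. prev = 30, cur = 2 gives (2 - 30) & 31 = 4. Three Huffman models
// are used, selected by which range the previous component fell into (the
// COLOR5_PAL*_PREV_HI thresholds below); the intensity table index is delta coded mod 8.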
1551 | |
1552 | histogram color5_delta_hist0(32); // prev 0-9, delta is -9 to 31 |
1553 | histogram color5_delta_hist1(32); // prev 10-21, delta is -21 to 21 |
1554 | histogram color5_delta_hist2(32); // prev 22-31, delta is -31 to 9 |
1555 | histogram inten_delta_hist(8); |
1556 | |
1557 | color_rgba prev_color5(16, 16, 16, 0); |
1558 | uint32_t prev_inten = 0; |
1559 | |
1560 | for (uint32_t new_endpoint_index = 0; new_endpoint_index < r.get_total_endpoint_clusters(); new_endpoint_index++) |
1561 | { |
1562 | const uint32_t old_endpoint_index = endpoint_remap_table_new_to_old[new_endpoint_index]; |
1563 | |
1564 | int delta_inten = m_endpoint_palette[old_endpoint_index].m_inten5 - prev_inten; |
1565 | inten_delta_hist.inc(delta_inten & 7); |
1566 | prev_inten = m_endpoint_palette[old_endpoint_index].m_inten5; |
1567 | |
1568 | for (uint32_t i = 0; i < (is_grayscale ? 1U : 3U); i++) |
1569 | { |
1570 | const int delta = (m_endpoint_palette[old_endpoint_index].m_color5[i] - prev_color5[i]) & 31; |
1571 | |
1572 | if (prev_color5[i] <= basist::COLOR5_PAL0_PREV_HI) |
1573 | color5_delta_hist0.inc(delta); |
1574 | else if (prev_color5[i] <= basist::COLOR5_PAL1_PREV_HI) |
1575 | color5_delta_hist1.inc(delta); |
1576 | else |
1577 | color5_delta_hist2.inc(delta); |
1578 | |
1579 | prev_color5[i] = m_endpoint_palette[old_endpoint_index].m_color5[i]; |
1580 | } |
1581 | } |
1582 | |
1583 | if (!color5_delta_hist0.get_total()) color5_delta_hist0.inc(0); |
1584 | if (!color5_delta_hist1.get_total()) color5_delta_hist1.inc(0); |
1585 | if (!color5_delta_hist2.get_total()) color5_delta_hist2.inc(0); |
1586 | |
1587 | huffman_encoding_table color5_delta_model0, color5_delta_model1, color5_delta_model2, inten_delta_model; |
1588 | if (!color5_delta_model0.init(color5_delta_hist0, 16)) |
1589 | { |
error_printf("color5_delta_model0.init() failed!");
1591 | return false; |
1592 | } |
1593 | |
1594 | if (!color5_delta_model1.init(color5_delta_hist1, 16)) |
1595 | { |
error_printf("color5_delta_model1.init() failed!");
1597 | return false; |
1598 | } |
1599 | |
1600 | if (!color5_delta_model2.init(color5_delta_hist2, 16)) |
1601 | { |
error_printf("color5_delta_model2.init() failed!");
1603 | return false; |
1604 | } |
1605 | |
1606 | if (!inten_delta_model.init(inten_delta_hist, 16)) |
1607 | { |
error_printf("inten_delta_model.init() failed!");
1609 | return false; |
1610 | } |
1611 | |
1612 | bitwise_coder coder; |
1613 | |
1614 | coder.init(8192); |
1615 | |
1616 | coder.emit_huffman_table(color5_delta_model0); |
1617 | coder.emit_huffman_table(color5_delta_model1); |
1618 | coder.emit_huffman_table(color5_delta_model2); |
1619 | coder.emit_huffman_table(inten_delta_model); |
1620 | |
1621 | coder.put_bits(is_grayscale, 1); |
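// When every palette entry is grayscale only the first component is transmitted per
// entry; the decoder is presumably expected to replicate it to the other two channels.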
1622 | |
1623 | prev_color5.set(16, 16, 16, 0); |
1624 | prev_inten = 0; |
1625 | |
1626 | for (uint32_t new_endpoint_index = 0; new_endpoint_index < r.get_total_endpoint_clusters(); new_endpoint_index++) |
1627 | { |
1628 | const uint32_t old_endpoint_index = endpoint_remap_table_new_to_old[new_endpoint_index]; |
1629 | |
1630 | int delta_inten = (m_endpoint_palette[old_endpoint_index].m_inten5 - prev_inten) & 7; |
1631 | coder.put_code(delta_inten, inten_delta_model); |
1632 | prev_inten = m_endpoint_palette[old_endpoint_index].m_inten5; |
1633 | |
1634 | for (uint32_t i = 0; i < (is_grayscale ? 1U : 3U); i++) |
1635 | { |
1636 | const int delta = (m_endpoint_palette[old_endpoint_index].m_color5[i] - prev_color5[i]) & 31; |
1637 | |
1638 | if (prev_color5[i] <= basist::COLOR5_PAL0_PREV_HI) |
1639 | coder.put_code(delta, color5_delta_model0); |
1640 | else if (prev_color5[i] <= basist::COLOR5_PAL1_PREV_HI) |
1641 | coder.put_code(delta, color5_delta_model1); |
1642 | else |
1643 | coder.put_code(delta, color5_delta_model2); |
1644 | |
1645 | prev_color5[i] = m_endpoint_palette[old_endpoint_index].m_color5[i]; |
1646 | } |
1647 | |
} // new_endpoint_index
1649 | |
1650 | coder.flush(); |
1651 | |
1652 | m_output.m_endpoint_palette = coder.get_bytes(); |
1653 | |
debug_printf("Endpoint codebook size: %u bits %u bytes, Bits per entry: %3.1f, Avg bits/texel: %3.3f\n",
8 * (uint32_t)m_output.m_endpoint_palette.size(), (uint32_t)m_output.m_endpoint_palette.size(), m_output.m_endpoint_palette.size() * 8.0f / r.get_total_endpoint_clusters(), m_output.m_endpoint_palette.size() * 8.0f / get_total_input_texels());
1656 | |
1657 | return true; |
1658 | } |
1659 | |
1660 | bool basisu_backend::encode_selector_palette() |
1661 | { |
1662 | const basisu_frontend& r = *m_pFront_end; |
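// Each selector palette entry packs a 4x4 block of 2-bit selectors into 4 bytes.
// Entries are written in new (remapped) order; every byte is predicted by XOR'ing it
// with the corresponding byte of the previously written entry, and the XOR residuals
// are Huffman coded. The very first entry is sent as raw bytes.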
1663 | |
1664 | histogram delta_selector_pal_histogram(256); |
1665 | |
1666 | for (uint32_t q = 0; q < r.get_total_selector_clusters(); q++) |
1667 | { |
1668 | if (!q) |
1669 | continue; |
1670 | |
1671 | const etc1_selector_palette_entry& cur = m_selector_palette[m_selector_remap_table_new_to_old[q]]; |
1672 | const etc1_selector_palette_entry predictor(m_selector_palette[m_selector_remap_table_new_to_old[q - 1]]); |
1673 | |
1674 | for (uint32_t j = 0; j < 4; j++) |
1675 | delta_selector_pal_histogram.inc(cur.get_byte(j) ^ predictor.get_byte(j)); |
1676 | } |
1677 | |
1678 | if (!delta_selector_pal_histogram.get_total()) |
1679 | delta_selector_pal_histogram.inc(0); |
1680 | |
1681 | huffman_encoding_table delta_selector_pal_model; |
1682 | if (!delta_selector_pal_model.init(delta_selector_pal_histogram, 16)) |
1683 | { |
error_printf("delta_selector_pal_model.init() failed!");
1685 | return false; |
1686 | } |
1687 | |
1688 | bitwise_coder coder; |
1689 | coder.init(1024 * 1024); |
1690 | |
coder.put_bits(0, 1); // 0: not using a global codebook
coder.put_bits(0, 1); // 0: not using hybrid codebooks

coder.put_bits(0, 1); // 0: palette is delta/Huffman coded, not raw bytes
1695 | |
1696 | coder.emit_huffman_table(delta_selector_pal_model); |
1697 | |
1698 | for (uint32_t q = 0; q < r.get_total_selector_clusters(); q++) |
1699 | { |
1700 | if (!q) |
1701 | { |
1702 | for (uint32_t j = 0; j < 4; j++) |
1703 | coder.put_bits(m_selector_palette[m_selector_remap_table_new_to_old[q]].get_byte(j), 8); |
1704 | continue; |
1705 | } |
1706 | |
1707 | const etc1_selector_palette_entry& cur = m_selector_palette[m_selector_remap_table_new_to_old[q]]; |
1708 | const etc1_selector_palette_entry predictor(m_selector_palette[m_selector_remap_table_new_to_old[q - 1]]); |
1709 | |
1710 | for (uint32_t j = 0; j < 4; j++) |
1711 | coder.put_code(cur.get_byte(j) ^ predictor.get_byte(j), delta_selector_pal_model); |
1712 | } |
1713 | |
1714 | coder.flush(); |
1715 | |
1716 | m_output.m_selector_palette = coder.get_bytes(); |
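// Fallback: if the Huffman-coded palette did not beat the trivial encoding of
// 4 raw bytes per entry, re-emit the palette as raw bytes instead.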
1717 | |
1718 | if (m_output.m_selector_palette.size() >= r.get_total_selector_clusters() * 4) |
1719 | { |
1720 | coder.init(1024 * 1024); |
1721 | |
coder.put_bits(0, 1); // 0: not using a global codebook
coder.put_bits(0, 1); // 0: not using hybrid codebooks

coder.put_bits(1, 1); // 1: palette stored as raw bytes
1726 | |
1727 | for (uint32_t q = 0; q < r.get_total_selector_clusters(); q++) |
1728 | { |
1729 | const uint32_t i = m_selector_remap_table_new_to_old[q]; |
1730 | |
1731 | for (uint32_t j = 0; j < 4; j++) |
1732 | coder.put_bits(m_selector_palette[i].get_byte(j), 8); |
1733 | } |
1734 | |
1735 | coder.flush(); |
1736 | |
1737 | m_output.m_selector_palette = coder.get_bytes(); |
1738 | } |
1739 | |
debug_printf("Selector codebook bits: %u bytes: %u, Bits per entry: %3.1f, Avg bits/texel: %3.3f\n",
(uint32_t)m_output.m_selector_palette.size() * 8, (uint32_t)m_output.m_selector_palette.size(),
m_output.m_selector_palette.size() * 8.0f / r.get_total_selector_clusters(), m_output.m_selector_palette.size() * 8.0f / get_total_input_texels());
1743 | |
1744 | return true; |
1745 | } |
1746 | |
1747 | uint32_t basisu_backend::encode() |
1748 | { |
1749 | //const bool is_video = m_pFront_end->get_params().m_tex_type == basist::cBASISTexTypeVideoFrames; |
1750 | m_output.m_slice_desc = m_slices; |
1751 | m_output.m_etc1s = m_params.m_etc1s; |
1752 | m_output.m_uses_global_codebooks = m_params.m_used_global_codebooks; |
1753 | m_output.m_srgb = m_pFront_end->get_params().m_perceptual; |
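// The steps below fill in the four compressed sections that make up the backend's
// output: the per-slice Huffman tables, the per-slice block data, and the endpoint and
// selector codebooks. The caller later assembles these sections into the final output stream.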
1754 | |
1755 | create_endpoint_palette(); |
1756 | create_selector_palette(); |
1757 | |
1758 | create_encoder_blocks(); |
1759 | |
1760 | if (!encode_image()) |
1761 | return 0; |
1762 | |
1763 | if (!encode_endpoint_palette()) |
1764 | return 0; |
1765 | |
1766 | if (!encode_selector_palette()) |
1767 | return 0; |
1768 | |
1769 | uint32_t total_compressed_bytes = (uint32_t)(m_output.m_slice_image_tables.size() + m_output.m_endpoint_palette.size() + m_output.m_selector_palette.size()); |
1770 | for (uint32_t i = 0; i < m_output.m_slice_image_data.size(); i++) |
1771 | total_compressed_bytes += (uint32_t)m_output.m_slice_image_data[i].size(); |
1772 | |
debug_printf("Wrote %u bytes, %3.3f bits/texel\n", total_compressed_bytes, total_compressed_bytes * 8.0f / get_total_input_texels());
1774 | |
1775 | return total_compressed_bytes; |
1776 | } |
1777 | |
1778 | } // namespace basisu |
1779 | |