1 | // SPDX-License-Identifier: Apache-2.0 |
2 | // ---------------------------------------------------------------------------- |
3 | // Copyright 2011-2022 Arm Limited |
4 | // |
5 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
6 | // use this file except in compliance with the License. You may obtain a copy |
7 | // of the License at: |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, software |
12 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
13 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
14 | // License for the specific language governing permissions and limitations |
15 | // under the License. |
16 | // ---------------------------------------------------------------------------- |
17 | |
18 | #if !defined(ASTCENC_DECOMPRESS_ONLY) |
19 | |
20 | /** |
21 | * @brief Functions for finding best endpoint format. |
22 | * |
23 | * We assume there are two independent sources of error in any given partition: |
24 | * |
25 | * - Encoding choice errors |
26 | * - Quantization errors |
27 | * |
28 | * Encoding choice errors are caused by encoder decisions. For example: |
29 | * |
30 | * - Using luminance instead of separate RGB components. |
31 | * - Using a constant 1.0 alpha instead of storing an alpha component. |
32 | * - Using RGB+scale instead of storing two full RGB endpoints. |
33 | * |
34 | * Quantization errors occur due to the limited precision we use for storage. These errors generally |
35 | * scale with quantization level, but are not actually independent of color encoding. In particular: |
36 | * |
37 | * - If we can use offset encoding then quantization error is halved. |
38 | * - If we can use blue-contraction then quantization error for RG is halved. |
39 | * - If we use HDR endpoints the quantization error is higher. |
40 | * |
41 | * Apart from these effects, we assume the error is proportional to the quantization step size. |
42 | */ |
43 | |
44 | |
45 | #include "astcenc_internal.h" |
46 | #include "astcenc_vecmathlib.h" |
47 | |
48 | #include <assert.h> |
49 | |
50 | /** |
51 | * @brief Compute the errors of the endpoint line options for one partition. |
52 | * |
53 | * Uncorrelated data assumes storing completely independent RGBA channels for each endpoint. Same |
54 | * chroma data assumes storing RGBA endpoints which pass though the origin (LDR only). RGBL data |
55 | * assumes storing RGB + lumashift (HDR only). Luminance error assumes storing RGB channels as a |
56 | * single value. |
57 | * |
58 | * |
59 | * @param pi The partition info data. |
60 | * @param partition_index The partition index to compule the error for. |
61 | * @param blk The image block. |
62 | * @param uncor_pline The endpoint line assuming uncorrelated endpoints. |
63 | * @param[out] uncor_err The computed error for the uncorrelated endpoint line. |
64 | * @param samec_pline The endpoint line assuming the same chroma for both endpoints. |
65 | * @param[out] samec_err The computed error for the uncorrelated endpoint line. |
66 | * @param rgbl_pline The endpoint line assuming RGB + lumashift data. |
67 | * @param[out] rgbl_err The computed error for the RGB + lumashift endpoint line. |
68 | * @param l_pline The endpoint line assuming luminance data. |
69 | * @param[out] l_err The computed error for the luminance endpoint line. |
70 | * @param[out] a_drop_err The computed error for dropping the alpha component. |
71 | */ |
72 | static void compute_error_squared_rgb_single_partition( |
73 | const partition_info& pi, |
74 | int partition_index, |
75 | const image_block& blk, |
76 | const processed_line3& uncor_pline, |
77 | float& uncor_err, |
78 | const processed_line3& samec_pline, |
79 | float& samec_err, |
80 | const processed_line3& rgbl_pline, |
81 | float& rgbl_err, |
82 | const processed_line3& l_pline, |
83 | float& l_err, |
84 | float& a_drop_err |
85 | ) { |
86 | vfloat4 ews = blk.channel_weight; |
87 | |
88 | unsigned int texel_count = pi.partition_texel_count[partition_index]; |
89 | const uint8_t* texel_indexes = pi.texels_of_partition[partition_index]; |
90 | promise(texel_count > 0); |
91 | |
92 | vfloatacc a_drop_errv = vfloatacc::zero(); |
93 | vfloat default_a(blk.get_default_alpha()); |
94 | |
95 | vfloatacc uncor_errv = vfloatacc::zero(); |
96 | vfloat uncor_bs0(uncor_pline.bs.lane<0>()); |
97 | vfloat uncor_bs1(uncor_pline.bs.lane<1>()); |
98 | vfloat uncor_bs2(uncor_pline.bs.lane<2>()); |
99 | |
100 | vfloat uncor_amod0(uncor_pline.amod.lane<0>()); |
101 | vfloat uncor_amod1(uncor_pline.amod.lane<1>()); |
102 | vfloat uncor_amod2(uncor_pline.amod.lane<2>()); |
103 | |
104 | vfloatacc samec_errv = vfloatacc::zero(); |
105 | vfloat samec_bs0(samec_pline.bs.lane<0>()); |
106 | vfloat samec_bs1(samec_pline.bs.lane<1>()); |
107 | vfloat samec_bs2(samec_pline.bs.lane<2>()); |
108 | |
109 | vfloatacc rgbl_errv = vfloatacc::zero(); |
110 | vfloat rgbl_bs0(rgbl_pline.bs.lane<0>()); |
111 | vfloat rgbl_bs1(rgbl_pline.bs.lane<1>()); |
112 | vfloat rgbl_bs2(rgbl_pline.bs.lane<2>()); |
113 | |
114 | vfloat rgbl_amod0(rgbl_pline.amod.lane<0>()); |
115 | vfloat rgbl_amod1(rgbl_pline.amod.lane<1>()); |
116 | vfloat rgbl_amod2(rgbl_pline.amod.lane<2>()); |
117 | |
118 | vfloatacc l_errv = vfloatacc::zero(); |
119 | vfloat l_bs0(l_pline.bs.lane<0>()); |
120 | vfloat l_bs1(l_pline.bs.lane<1>()); |
121 | vfloat l_bs2(l_pline.bs.lane<2>()); |
122 | |
123 | vint lane_ids = vint::lane_id(); |
124 | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
125 | { |
126 | vint tix(texel_indexes + i); |
127 | |
128 | vmask mask = lane_ids < vint(texel_count); |
129 | lane_ids += vint(ASTCENC_SIMD_WIDTH); |
130 | |
131 | // Compute the error that arises from just ditching alpha |
132 | vfloat data_a = gatherf(blk.data_a, tix); |
133 | vfloat alpha_diff = data_a - default_a; |
134 | alpha_diff = alpha_diff * alpha_diff; |
135 | |
136 | haccumulate(a_drop_errv, alpha_diff, mask); |
137 | |
138 | vfloat data_r = gatherf(blk.data_r, tix); |
139 | vfloat data_g = gatherf(blk.data_g, tix); |
140 | vfloat data_b = gatherf(blk.data_b, tix); |
141 | |
142 | // Compute uncorrelated error |
143 | vfloat param = data_r * uncor_bs0 |
144 | + data_g * uncor_bs1 |
145 | + data_b * uncor_bs2; |
146 | |
147 | vfloat dist0 = (uncor_amod0 + param * uncor_bs0) - data_r; |
148 | vfloat dist1 = (uncor_amod1 + param * uncor_bs1) - data_g; |
149 | vfloat dist2 = (uncor_amod2 + param * uncor_bs2) - data_b; |
150 | |
151 | vfloat error = dist0 * dist0 * ews.lane<0>() |
152 | + dist1 * dist1 * ews.lane<1>() |
153 | + dist2 * dist2 * ews.lane<2>(); |
154 | |
155 | haccumulate(uncor_errv, error, mask); |
156 | |
157 | // Compute same chroma error - no "amod", its always zero |
158 | param = data_r * samec_bs0 |
159 | + data_g * samec_bs1 |
160 | + data_b * samec_bs2; |
161 | |
162 | dist0 = (param * samec_bs0) - data_r; |
163 | dist1 = (param * samec_bs1) - data_g; |
164 | dist2 = (param * samec_bs2) - data_b; |
165 | |
166 | error = dist0 * dist0 * ews.lane<0>() |
167 | + dist1 * dist1 * ews.lane<1>() |
168 | + dist2 * dist2 * ews.lane<2>(); |
169 | |
170 | haccumulate(samec_errv, error, mask); |
171 | |
172 | // Compute rgbl error |
173 | param = data_r * rgbl_bs0 |
174 | + data_g * rgbl_bs1 |
175 | + data_b * rgbl_bs2; |
176 | |
177 | dist0 = (rgbl_amod0 + param * rgbl_bs0) - data_r; |
178 | dist1 = (rgbl_amod1 + param * rgbl_bs1) - data_g; |
179 | dist2 = (rgbl_amod2 + param * rgbl_bs2) - data_b; |
180 | |
181 | error = dist0 * dist0 * ews.lane<0>() |
182 | + dist1 * dist1 * ews.lane<1>() |
183 | + dist2 * dist2 * ews.lane<2>(); |
184 | |
185 | haccumulate(rgbl_errv, error, mask); |
186 | |
187 | // Compute luma error - no "amod", its always zero |
188 | param = data_r * l_bs0 |
189 | + data_g * l_bs1 |
190 | + data_b * l_bs2; |
191 | |
192 | dist0 = (param * l_bs0) - data_r; |
193 | dist1 = (param * l_bs1) - data_g; |
194 | dist2 = (param * l_bs2) - data_b; |
195 | |
196 | error = dist0 * dist0 * ews.lane<0>() |
197 | + dist1 * dist1 * ews.lane<1>() |
198 | + dist2 * dist2 * ews.lane<2>(); |
199 | |
200 | haccumulate(l_errv, error, mask); |
201 | } |
202 | |
203 | a_drop_err = hadd_s(a_drop_errv) * ews.lane<3>(); |
204 | uncor_err = hadd_s(uncor_errv); |
205 | samec_err = hadd_s(samec_errv); |
206 | rgbl_err = hadd_s(rgbl_errv); |
207 | l_err = hadd_s(l_errv); |
208 | } |
209 | |
210 | /** |
211 | * @brief For a given set of input colors and partitioning determine endpoint encode errors. |
212 | * |
213 | * This function determines the color error that results from RGB-scale encoding (LDR only), |
214 | * RGB-lumashift encoding (HDR only), luminance-encoding, and alpha drop. Also determines whether |
215 | * the endpoints are eligible for offset encoding or blue-contraction |
216 | * |
217 | * @param blk The image block. |
218 | * @param pi The partition info data. |
219 | * @param ep The idealized endpoints. |
220 | * @param[out] eci The resulting encoding choice error metrics. |
221 | */ |
222 | static void compute_encoding_choice_errors( |
223 | const image_block& blk, |
224 | const partition_info& pi, |
225 | const endpoints& ep, |
226 | encoding_choice_errors eci[BLOCK_MAX_PARTITIONS]) |
227 | { |
228 | int partition_count = pi.partition_count; |
229 | promise(partition_count > 0); |
230 | |
231 | partition_metrics pms[BLOCK_MAX_PARTITIONS]; |
232 | |
233 | compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); |
234 | |
235 | for (int i = 0; i < partition_count; i++) |
236 | { |
237 | partition_metrics& pm = pms[i]; |
238 | |
239 | line3 uncor_rgb_lines; |
240 | line3 samec_rgb_lines; // for LDR-RGB-scale |
241 | line3 rgb_luma_lines; // for HDR-RGB-scale |
242 | |
243 | processed_line3 uncor_rgb_plines; |
244 | processed_line3 samec_rgb_plines; |
245 | processed_line3 rgb_luma_plines; |
246 | processed_line3 luminance_plines; |
247 | |
248 | float uncorr_rgb_error; |
249 | float samechroma_rgb_error; |
250 | float rgb_luma_error; |
251 | float luminance_rgb_error; |
252 | float alpha_drop_error; |
253 | |
254 | uncor_rgb_lines.a = pm.avg; |
255 | uncor_rgb_lines.b = normalize_safe(pm.dir, unit3()); |
256 | |
257 | samec_rgb_lines.a = vfloat4::zero(); |
258 | samec_rgb_lines.b = normalize_safe(pm.avg, unit3()); |
259 | |
260 | rgb_luma_lines.a = pm.avg; |
261 | rgb_luma_lines.b = unit3(); |
262 | |
263 | uncor_rgb_plines.amod = uncor_rgb_lines.a - uncor_rgb_lines.b * dot3(uncor_rgb_lines.a, uncor_rgb_lines.b); |
264 | uncor_rgb_plines.bs = uncor_rgb_lines.b; |
265 | |
266 | // Same chroma always goes though zero, so this is simpler than the others |
267 | samec_rgb_plines.amod = vfloat4::zero(); |
268 | samec_rgb_plines.bs = samec_rgb_lines.b; |
269 | |
270 | rgb_luma_plines.amod = rgb_luma_lines.a - rgb_luma_lines.b * dot3(rgb_luma_lines.a, rgb_luma_lines.b); |
271 | rgb_luma_plines.bs = rgb_luma_lines.b; |
272 | |
273 | // Luminance always goes though zero, so this is simpler than the others |
274 | luminance_plines.amod = vfloat4::zero(); |
275 | luminance_plines.bs = unit3(); |
276 | |
277 | compute_error_squared_rgb_single_partition( |
278 | pi, i, blk, |
279 | uncor_rgb_plines, uncorr_rgb_error, |
280 | samec_rgb_plines, samechroma_rgb_error, |
281 | rgb_luma_plines, rgb_luma_error, |
282 | luminance_plines, luminance_rgb_error, |
283 | alpha_drop_error); |
284 | |
285 | // Determine if we can offset encode RGB lanes |
286 | vfloat4 endpt0 = ep.endpt0[i]; |
287 | vfloat4 endpt1 = ep.endpt1[i]; |
288 | vfloat4 endpt_diff = abs(endpt1 - endpt0); |
289 | vmask4 endpt_can_offset = endpt_diff < vfloat4(0.12f * 65535.0f); |
290 | bool can_offset_encode = (mask(endpt_can_offset) & 0x7) == 0x7; |
291 | |
292 | // Store out the settings |
293 | eci[i].rgb_scale_error = (samechroma_rgb_error - uncorr_rgb_error) * 0.7f; // empirical |
294 | eci[i].rgb_luma_error = (rgb_luma_error - uncorr_rgb_error) * 1.5f; // wild guess |
295 | eci[i].luminance_error = (luminance_rgb_error - uncorr_rgb_error) * 3.0f; // empirical |
296 | eci[i].alpha_drop_error = alpha_drop_error * 3.0f; |
297 | eci[i].can_offset_encode = can_offset_encode; |
298 | eci[i].can_blue_contract = !blk.is_luminance(); |
299 | } |
300 | } |
301 | |
302 | /** |
303 | * @brief For a given partition compute the error for every endpoint integer count and quant level. |
304 | * |
305 | * @param encode_hdr_rgb @c true if using HDR for RGB, @c false for LDR. |
306 | * @param encode_hdr_alpha @c true if using HDR for alpha, @c false for LDR. |
307 | * @param partition_index The partition index. |
308 | * @param pi The partition info. |
309 | * @param eci The encoding choice error metrics. |
310 | * @param ep The idealized endpoints. |
311 | * @param error_weight The resulting encoding choice error metrics. |
312 | * @param[out] best_error The best error for each integer count and quant level. |
313 | * @param[out] format_of_choice The preferred endpoint format for each integer count and quant level. |
314 | */ |
315 | static void compute_color_error_for_every_integer_count_and_quant_level( |
316 | bool encode_hdr_rgb, |
317 | bool encode_hdr_alpha, |
318 | int partition_index, |
319 | const partition_info& pi, |
320 | const encoding_choice_errors& eci, |
321 | const endpoints& ep, |
322 | vfloat4 error_weight, |
323 | float best_error[21][4], |
324 | uint8_t format_of_choice[21][4] |
325 | ) { |
326 | int partition_size = pi.partition_texel_count[partition_index]; |
327 | |
328 | static const float baseline_quant_error[21 - QUANT_6] { |
329 | (65536.0f * 65536.0f / 18.0f) / (5 * 5), |
330 | (65536.0f * 65536.0f / 18.0f) / (7 * 7), |
331 | (65536.0f * 65536.0f / 18.0f) / (9 * 9), |
332 | (65536.0f * 65536.0f / 18.0f) / (11 * 11), |
333 | (65536.0f * 65536.0f / 18.0f) / (15 * 15), |
334 | (65536.0f * 65536.0f / 18.0f) / (19 * 19), |
335 | (65536.0f * 65536.0f / 18.0f) / (23 * 23), |
336 | (65536.0f * 65536.0f / 18.0f) / (31 * 31), |
337 | (65536.0f * 65536.0f / 18.0f) / (39 * 39), |
338 | (65536.0f * 65536.0f / 18.0f) / (47 * 47), |
339 | (65536.0f * 65536.0f / 18.0f) / (63 * 63), |
340 | (65536.0f * 65536.0f / 18.0f) / (79 * 79), |
341 | (65536.0f * 65536.0f / 18.0f) / (95 * 95), |
342 | (65536.0f * 65536.0f / 18.0f) / (127 * 127), |
343 | (65536.0f * 65536.0f / 18.0f) / (159 * 159), |
344 | (65536.0f * 65536.0f / 18.0f) / (191 * 191), |
345 | (65536.0f * 65536.0f / 18.0f) / (255 * 255) |
346 | }; |
347 | |
348 | vfloat4 ep0 = ep.endpt0[partition_index]; |
349 | vfloat4 ep1 = ep.endpt1[partition_index]; |
350 | |
351 | float ep1_min = hmin_rgb_s(ep1); |
352 | ep1_min = astc::max(ep1_min, 0.0f); |
353 | |
354 | float error_weight_rgbsum = hadd_rgb_s(error_weight); |
355 | |
356 | float range_upper_limit_rgb = encode_hdr_rgb ? 61440.0f : 65535.0f; |
357 | float range_upper_limit_alpha = encode_hdr_alpha ? 61440.0f : 65535.0f; |
358 | |
359 | // It is possible to get endpoint colors significantly outside [0,upper-limit] even if the |
360 | // input data are safely contained in [0,upper-limit]; we need to add an error term for this |
361 | vfloat4 offset(range_upper_limit_rgb, range_upper_limit_rgb, range_upper_limit_rgb, range_upper_limit_alpha); |
362 | vfloat4 ep0_range_error_high = max(ep0 - offset, 0.0f); |
363 | vfloat4 ep1_range_error_high = max(ep1 - offset, 0.0f); |
364 | |
365 | vfloat4 ep0_range_error_low = min(ep0, 0.0f); |
366 | vfloat4 ep1_range_error_low = min(ep1, 0.0f); |
367 | |
368 | vfloat4 sum_range_error = |
369 | (ep0_range_error_low * ep0_range_error_low) + |
370 | (ep1_range_error_low * ep1_range_error_low) + |
371 | (ep0_range_error_high * ep0_range_error_high) + |
372 | (ep1_range_error_high * ep1_range_error_high); |
373 | |
374 | float rgb_range_error = dot3_s(sum_range_error, error_weight) |
375 | * 0.5f * static_cast<float>(partition_size); |
376 | float alpha_range_error = sum_range_error.lane<3>() * error_weight.lane<3>() |
377 | * 0.5f * static_cast<float>(partition_size); |
378 | |
379 | if (encode_hdr_rgb) |
380 | { |
381 | |
382 | // Collect some statistics |
383 | float af, cf; |
384 | if (ep1.lane<0>() > ep1.lane<1>() && ep1.lane<0>() > ep1.lane<2>()) |
385 | { |
386 | af = ep1.lane<0>(); |
387 | cf = ep1.lane<0>() - ep0.lane<0>(); |
388 | } |
389 | else if (ep1.lane<1>() > ep1.lane<2>()) |
390 | { |
391 | af = ep1.lane<1>(); |
392 | cf = ep1.lane<1>() - ep0.lane<1>(); |
393 | } |
394 | else |
395 | { |
396 | af = ep1.lane<2>(); |
397 | cf = ep1.lane<2>() - ep0.lane<2>(); |
398 | } |
399 | |
400 | // Estimate of color-component spread in high endpoint color |
401 | float bf = af - ep1_min; |
402 | vfloat4 prd = (ep1 - vfloat4(cf)).swz<0, 1, 2>(); |
403 | vfloat4 pdif = prd - ep0.swz<0, 1, 2>(); |
404 | // Estimate of color-component spread in low endpoint color |
405 | float df = hmax_s(abs(pdif)); |
406 | |
407 | int b = static_cast<int>(bf); |
408 | int c = static_cast<int>(cf); |
409 | int d = static_cast<int>(df); |
410 | |
411 | // Determine which one of the 6 submodes is likely to be used in case of an RGBO-mode |
412 | int rgbo_mode = 5; // 7 bits per component |
413 | // mode 4: 8 7 6 |
414 | if (b < 32768 && c < 16384) |
415 | { |
416 | rgbo_mode = 4; |
417 | } |
418 | |
419 | // mode 3: 9 6 7 |
420 | if (b < 8192 && c < 16384) |
421 | { |
422 | rgbo_mode = 3; |
423 | } |
424 | |
425 | // mode 2: 10 5 8 |
426 | if (b < 2048 && c < 16384) |
427 | { |
428 | rgbo_mode = 2; |
429 | } |
430 | |
431 | // mode 1: 11 6 5 |
432 | if (b < 2048 && c < 1024) |
433 | { |
434 | rgbo_mode = 1; |
435 | } |
436 | |
437 | // mode 0: 11 5 7 |
438 | if (b < 1024 && c < 4096) |
439 | { |
440 | rgbo_mode = 0; |
441 | } |
442 | |
443 | // Determine which one of the 9 submodes is likely to be used in case of an RGB-mode. |
444 | int rgb_mode = 8; // 8 bits per component, except 7 bits for blue |
445 | |
446 | // mode 0: 9 7 6 7 |
447 | if (b < 16384 && c < 8192 && d < 8192) |
448 | { |
449 | rgb_mode = 0; |
450 | } |
451 | |
452 | // mode 1: 9 8 6 6 |
453 | if (b < 32768 && c < 8192 && d < 4096) |
454 | { |
455 | rgb_mode = 1; |
456 | } |
457 | |
458 | // mode 2: 10 6 7 7 |
459 | if (b < 4096 && c < 8192 && d < 4096) |
460 | { |
461 | rgb_mode = 2; |
462 | } |
463 | |
464 | // mode 3: 10 7 7 6 |
465 | if (b < 8192 && c < 8192 && d < 2048) |
466 | { |
467 | rgb_mode = 3; |
468 | } |
469 | |
470 | // mode 4: 11 8 6 5 |
471 | if (b < 8192 && c < 2048 && d < 512) |
472 | { |
473 | rgb_mode = 4; |
474 | } |
475 | |
476 | // mode 5: 11 6 8 6 |
477 | if (b < 2048 && c < 8192 && d < 1024) |
478 | { |
479 | rgb_mode = 5; |
480 | } |
481 | |
482 | // mode 6: 12 7 7 5 |
483 | if (b < 2048 && c < 2048 && d < 256) |
484 | { |
485 | rgb_mode = 6; |
486 | } |
487 | |
488 | // mode 7: 12 6 7 6 |
489 | if (b < 1024 && c < 2048 && d < 512) |
490 | { |
491 | rgb_mode = 7; |
492 | } |
493 | |
494 | static const float rgbo_error_scales[6] { 4.0f, 4.0f, 16.0f, 64.0f, 256.0f, 1024.0f }; |
495 | static const float rgb_error_scales[9] { 64.0f, 64.0f, 16.0f, 16.0f, 4.0f, 4.0f, 1.0f, 1.0f, 384.0f }; |
496 | |
497 | float mode7mult = rgbo_error_scales[rgbo_mode] * 0.0015f; // Empirically determined .... |
498 | float mode11mult = rgb_error_scales[rgb_mode] * 0.010f; // Empirically determined .... |
499 | |
500 | |
501 | float lum_high = hadd_rgb_s(ep1) * (1.0f / 3.0f); |
502 | float lum_low = hadd_rgb_s(ep0) * (1.0f / 3.0f); |
503 | float lumdif = lum_high - lum_low; |
504 | float mode23mult = lumdif < 960 ? 4.0f : lumdif < 3968 ? 16.0f : 128.0f; |
505 | |
506 | mode23mult *= 0.0005f; // Empirically determined .... |
507 | |
508 | // Pick among the available HDR endpoint modes |
509 | for (int i = QUANT_2; i < QUANT_16; i++) |
510 | { |
511 | best_error[i][3] = ERROR_CALC_DEFAULT; |
512 | best_error[i][2] = ERROR_CALC_DEFAULT; |
513 | best_error[i][1] = ERROR_CALC_DEFAULT; |
514 | best_error[i][0] = ERROR_CALC_DEFAULT; |
515 | |
516 | format_of_choice[i][3] = static_cast<uint8_t>(encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA); |
517 | format_of_choice[i][2] = FMT_HDR_RGB; |
518 | format_of_choice[i][1] = FMT_HDR_RGB_SCALE; |
519 | format_of_choice[i][0] = FMT_HDR_LUMINANCE_LARGE_RANGE; |
520 | } |
521 | |
522 | for (int i = QUANT_16; i <= QUANT_256; i++) |
523 | { |
524 | // The base_quant_error should depend on the scale-factor that would be used during |
525 | // actual encode of the color value |
526 | |
527 | float base_quant_error = baseline_quant_error[i - QUANT_6] * static_cast<float>(partition_size); |
528 | float rgb_quantization_error = error_weight_rgbsum * base_quant_error * 2.0f; |
529 | float alpha_quantization_error = error_weight.lane<3>() * base_quant_error * 2.0f; |
530 | float rgba_quantization_error = rgb_quantization_error + alpha_quantization_error; |
531 | |
532 | // For 8 integers, we have two encodings: one with HDR A and another one with LDR A |
533 | |
534 | float full_hdr_rgba_error = rgba_quantization_error + rgb_range_error + alpha_range_error; |
535 | best_error[i][3] = full_hdr_rgba_error; |
536 | format_of_choice[i][3] = static_cast<uint8_t>(encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA); |
537 | |
538 | // For 6 integers, we have one HDR-RGB encoding |
539 | float full_hdr_rgb_error = (rgb_quantization_error * mode11mult) + rgb_range_error + eci.alpha_drop_error; |
540 | best_error[i][2] = full_hdr_rgb_error; |
541 | format_of_choice[i][2] = FMT_HDR_RGB; |
542 | |
543 | // For 4 integers, we have one HDR-RGB-Scale encoding |
544 | float hdr_rgb_scale_error = (rgb_quantization_error * mode7mult) + rgb_range_error + eci.alpha_drop_error + eci.rgb_luma_error; |
545 | |
546 | best_error[i][1] = hdr_rgb_scale_error; |
547 | format_of_choice[i][1] = FMT_HDR_RGB_SCALE; |
548 | |
549 | // For 2 integers, we assume luminance-with-large-range |
550 | float hdr_luminance_error = (rgb_quantization_error * mode23mult) + rgb_range_error + eci.alpha_drop_error + eci.luminance_error; |
551 | best_error[i][0] = hdr_luminance_error; |
552 | format_of_choice[i][0] = FMT_HDR_LUMINANCE_LARGE_RANGE; |
553 | } |
554 | } |
555 | else |
556 | { |
557 | for (int i = QUANT_2; i < QUANT_6; i++) |
558 | { |
559 | best_error[i][3] = ERROR_CALC_DEFAULT; |
560 | best_error[i][2] = ERROR_CALC_DEFAULT; |
561 | best_error[i][1] = ERROR_CALC_DEFAULT; |
562 | best_error[i][0] = ERROR_CALC_DEFAULT; |
563 | |
564 | format_of_choice[i][3] = FMT_RGBA; |
565 | format_of_choice[i][2] = FMT_RGB; |
566 | format_of_choice[i][1] = FMT_RGB_SCALE; |
567 | format_of_choice[i][0] = FMT_LUMINANCE; |
568 | } |
569 | |
570 | float base_quant_error_rgb = error_weight_rgbsum * static_cast<float>(partition_size); |
571 | float base_quant_error_a = error_weight.lane<3>() * static_cast<float>(partition_size); |
572 | float base_quant_error_rgba = base_quant_error_rgb + base_quant_error_a; |
573 | |
574 | float error_scale_bc_rgba = eci.can_blue_contract ? 0.625f : 1.0f; |
575 | float error_scale_oe_rgba = eci.can_offset_encode ? 0.5f : 1.0f; |
576 | |
577 | float error_scale_bc_rgb = eci.can_blue_contract ? 0.5f : 1.0f; |
578 | float error_scale_oe_rgb = eci.can_offset_encode ? 0.25f : 1.0f; |
579 | |
580 | // Pick among the available LDR endpoint modes |
581 | for (int i = QUANT_6; i <= QUANT_256; i++) |
582 | { |
583 | // Offset encoding not possible at higher quant levels |
584 | if (i >= QUANT_192) |
585 | { |
586 | error_scale_oe_rgba = 1.0f; |
587 | error_scale_oe_rgb = 1.0f; |
588 | } |
589 | |
590 | float base_quant_error = baseline_quant_error[i - QUANT_6]; |
591 | float quant_error_rgb = base_quant_error_rgb * base_quant_error; |
592 | float quant_error_rgba = base_quant_error_rgba * base_quant_error; |
593 | |
594 | // 8 integers can encode as RGBA+RGBA |
595 | float full_ldr_rgba_error = quant_error_rgba |
596 | * error_scale_bc_rgba |
597 | * error_scale_oe_rgba |
598 | + rgb_range_error |
599 | + alpha_range_error; |
600 | |
601 | best_error[i][3] = full_ldr_rgba_error; |
602 | format_of_choice[i][3] = FMT_RGBA; |
603 | |
604 | // 6 integers can encode as RGB+RGB or RGBS+AA |
605 | float full_ldr_rgb_error = quant_error_rgb |
606 | * error_scale_bc_rgb |
607 | * error_scale_oe_rgb |
608 | + rgb_range_error |
609 | + eci.alpha_drop_error; |
610 | |
611 | float rgbs_alpha_error = quant_error_rgba |
612 | + eci.rgb_scale_error |
613 | + rgb_range_error |
614 | + alpha_range_error; |
615 | |
616 | if (rgbs_alpha_error < full_ldr_rgb_error) |
617 | { |
618 | best_error[i][2] = rgbs_alpha_error; |
619 | format_of_choice[i][2] = FMT_RGB_SCALE_ALPHA; |
620 | } |
621 | else |
622 | { |
623 | best_error[i][2] = full_ldr_rgb_error; |
624 | format_of_choice[i][2] = FMT_RGB; |
625 | } |
626 | |
627 | // 4 integers can encode as RGBS or LA+LA |
628 | float ldr_rgbs_error = quant_error_rgb |
629 | + rgb_range_error |
630 | + eci.alpha_drop_error |
631 | + eci.rgb_scale_error; |
632 | |
633 | float lum_alpha_error = quant_error_rgba |
634 | + rgb_range_error |
635 | + alpha_range_error |
636 | + eci.luminance_error; |
637 | |
638 | if (ldr_rgbs_error < lum_alpha_error) |
639 | { |
640 | best_error[i][1] = ldr_rgbs_error; |
641 | format_of_choice[i][1] = FMT_RGB_SCALE; |
642 | } |
643 | else |
644 | { |
645 | best_error[i][1] = lum_alpha_error; |
646 | format_of_choice[i][1] = FMT_LUMINANCE_ALPHA; |
647 | } |
648 | |
649 | // 2 integers can encode as L+L |
650 | float luminance_error = quant_error_rgb |
651 | + rgb_range_error |
652 | + eci.alpha_drop_error |
653 | + eci.luminance_error; |
654 | |
655 | best_error[i][0] = luminance_error; |
656 | format_of_choice[i][0] = FMT_LUMINANCE; |
657 | } |
658 | } |
659 | } |
660 | |
661 | /** |
662 | * @brief For one partition compute the best format and quantization for a given bit count. |
663 | * |
664 | * @param best_combined_error The best error for each quant level and integer count. |
665 | * @param best_combined_format The best format for each quant level and integer count. |
666 | * @param bits_available The number of bits available for encoding. |
667 | * @param[out] best_quant_level The output best color quant level. |
668 | * @param[out] best_format The output best color format. |
669 | * |
670 | * @return The output error for the best pairing. |
671 | */ |
672 | static float one_partition_find_best_combination_for_bitcount( |
673 | const float best_combined_error[21][4], |
674 | const uint8_t best_combined_format[21][4], |
675 | int bits_available, |
676 | uint8_t& best_quant_level, |
677 | uint8_t& best_format |
678 | ) { |
679 | int best_integer_count = 0; |
680 | float best_integer_count_error = ERROR_CALC_DEFAULT; |
681 | |
682 | for (int integer_count = 1; integer_count <= 4; integer_count++) |
683 | { |
684 | // Compute the quantization level for a given number of integers and a given number of bits |
685 | int quant_level = quant_mode_table[integer_count][bits_available]; |
686 | |
687 | // Don't have enough bits to represent a given endpoint format at all! |
688 | if (quant_level < QUANT_6) |
689 | { |
690 | continue; |
691 | } |
692 | |
693 | float integer_count_error = best_combined_error[quant_level][integer_count - 1]; |
694 | if (integer_count_error < best_integer_count_error) |
695 | { |
696 | best_integer_count_error = integer_count_error; |
697 | best_integer_count = integer_count - 1; |
698 | } |
699 | } |
700 | |
701 | int ql = quant_mode_table[best_integer_count + 1][bits_available]; |
702 | |
703 | best_quant_level = static_cast<uint8_t>(ql); |
704 | best_format = FMT_LUMINANCE; |
705 | |
706 | if (ql >= QUANT_6) |
707 | { |
708 | best_format = best_combined_format[ql][best_integer_count]; |
709 | } |
710 | |
711 | return best_integer_count_error; |
712 | } |
713 | |
714 | /** |
715 | * @brief For 2 partitions compute the best format combinations for every pair of quant mode and integer count. |
716 | * |
717 | * @param best_error The best error for a single endpoint quant level and integer count. |
718 | * @param best_format The best format for a single endpoint quant level and integer count. |
719 | * @param[out] best_combined_error The best combined error pairings for the 2 partitions. |
720 | * @param[out] best_combined_format The best combined format pairings for the 2 partitions. |
721 | */ |
722 | static void two_partitions_find_best_combination_for_every_quantization_and_integer_count( |
723 | const float best_error[2][21][4], // indexed by (partition, quant-level, integer-pair-count-minus-1) |
724 | const uint8_t best_format[2][21][4], |
725 | float best_combined_error[21][7], // indexed by (quant-level, integer-pair-count-minus-2) |
726 | uint8_t best_combined_format[21][7][2] |
727 | ) { |
728 | for (int i = QUANT_2; i <= QUANT_256; i++) |
729 | { |
730 | for (int j = 0; j < 7; j++) |
731 | { |
732 | best_combined_error[i][j] = ERROR_CALC_DEFAULT; |
733 | } |
734 | } |
735 | |
736 | for (int quant = QUANT_6; quant <= QUANT_256; quant++) |
737 | { |
738 | for (int i = 0; i < 4; i++) // integer-count for first endpoint-pair |
739 | { |
740 | for (int j = 0; j < 4; j++) // integer-count for second endpoint-pair |
741 | { |
742 | int low2 = astc::min(i, j); |
743 | int high2 = astc::max(i, j); |
744 | if ((high2 - low2) > 1) |
745 | { |
746 | continue; |
747 | } |
748 | |
749 | int intcnt = i + j; |
750 | float errorterm = astc::min(best_error[0][quant][i] + best_error[1][quant][j], 1e10f); |
751 | if (errorterm <= best_combined_error[quant][intcnt]) |
752 | { |
753 | best_combined_error[quant][intcnt] = errorterm; |
754 | best_combined_format[quant][intcnt][0] = best_format[0][quant][i]; |
755 | best_combined_format[quant][intcnt][1] = best_format[1][quant][j]; |
756 | } |
757 | } |
758 | } |
759 | } |
760 | } |
761 | |
762 | /** |
763 | * @brief For 2 partitions compute the best format and quantization for a given bit count. |
764 | * |
765 | * @param best_combined_error The best error for each quant level and integer count. |
766 | * @param best_combined_format The best format for each quant level and integer count. |
767 | * @param bits_available The number of bits available for encoding. |
768 | * @param[out] best_quant_level The output best color quant level. |
769 | * @param[out] best_quant_level_mod The output best color quant level assuming two more bits are available. |
770 | * @param[out] best_formats The output best color formats. |
771 | * |
772 | * @return The output error for the best pairing. |
773 | */ |
774 | static float two_partitions_find_best_combination_for_bitcount( |
775 | float best_combined_error[21][7], |
776 | uint8_t best_combined_format[21][7][2], |
777 | int bits_available, |
778 | uint8_t& best_quant_level, |
779 | uint8_t& best_quant_level_mod, |
780 | uint8_t* best_formats |
781 | ) { |
782 | int best_integer_count = 0; |
783 | float best_integer_count_error = ERROR_CALC_DEFAULT; |
784 | |
785 | for (int integer_count = 2; integer_count <= 8; integer_count++) |
786 | { |
787 | // Compute the quantization level for a given number of integers and a given number of bits |
788 | int quant_level = quant_mode_table[integer_count][bits_available]; |
789 | |
790 | // Don't have enough bits to represent a given endpoint format at all! |
791 | if (quant_level < QUANT_6) |
792 | { |
793 | break; |
794 | } |
795 | |
796 | float integer_count_error = best_combined_error[quant_level][integer_count - 2]; |
797 | if (integer_count_error < best_integer_count_error) |
798 | { |
799 | best_integer_count_error = integer_count_error; |
800 | best_integer_count = integer_count; |
801 | } |
802 | } |
803 | |
804 | int ql = quant_mode_table[best_integer_count][bits_available]; |
805 | int ql_mod = quant_mode_table[best_integer_count][bits_available + 2]; |
806 | |
807 | best_quant_level = static_cast<uint8_t>(ql); |
808 | best_quant_level_mod = static_cast<uint8_t>(ql_mod); |
809 | |
810 | if (ql >= QUANT_6) |
811 | { |
812 | for (int i = 0; i < 2; i++) |
813 | { |
814 | best_formats[i] = best_combined_format[ql][best_integer_count - 2][i]; |
815 | } |
816 | } |
817 | else |
818 | { |
819 | for (int i = 0; i < 2; i++) |
820 | { |
821 | best_formats[i] = FMT_LUMINANCE; |
822 | } |
823 | } |
824 | |
825 | return best_integer_count_error; |
826 | } |
827 | |
828 | /** |
829 | * @brief For 3 partitions compute the best format combinations for every pair of quant mode and integer count. |
830 | * |
831 | * @param best_error The best error for a single endpoint quant level and integer count. |
832 | * @param best_format The best format for a single endpoint quant level and integer count. |
833 | * @param[out] best_combined_error The best combined error pairings for the 3 partitions. |
834 | * @param[out] best_combined_format The best combined format pairings for the 3 partitions. |
835 | */ |
836 | static void three_partitions_find_best_combination_for_every_quantization_and_integer_count( |
837 | const float best_error[3][21][4], // indexed by (partition, quant-level, integer-count) |
838 | const uint8_t best_format[3][21][4], |
839 | float best_combined_error[21][10], |
840 | uint8_t best_combined_format[21][10][3] |
841 | ) { |
842 | for (int i = QUANT_2; i <= QUANT_256; i++) |
843 | { |
844 | for (int j = 0; j < 10; j++) |
845 | { |
846 | best_combined_error[i][j] = ERROR_CALC_DEFAULT; |
847 | } |
848 | } |
849 | |
850 | for (int quant = QUANT_6; quant <= QUANT_256; quant++) |
851 | { |
852 | for (int i = 0; i < 4; i++) // integer-count for first endpoint-pair |
853 | { |
854 | for (int j = 0; j < 4; j++) // integer-count for second endpoint-pair |
855 | { |
856 | int low2 = astc::min(i, j); |
857 | int high2 = astc::max(i, j); |
858 | if ((high2 - low2) > 1) |
859 | { |
860 | continue; |
861 | } |
862 | |
863 | for (int k = 0; k < 4; k++) // integer-count for third endpoint-pair |
864 | { |
865 | int low3 = astc::min(k, low2); |
866 | int high3 = astc::max(k, high2); |
867 | if ((high3 - low3) > 1) |
868 | { |
869 | continue; |
870 | } |
871 | |
872 | int intcnt = i + j + k; |
873 | float errorterm = astc::min(best_error[0][quant][i] + best_error[1][quant][j] + best_error[2][quant][k], 1e10f); |
874 | if (errorterm <= best_combined_error[quant][intcnt]) |
875 | { |
876 | best_combined_error[quant][intcnt] = errorterm; |
877 | best_combined_format[quant][intcnt][0] = best_format[0][quant][i]; |
878 | best_combined_format[quant][intcnt][1] = best_format[1][quant][j]; |
879 | best_combined_format[quant][intcnt][2] = best_format[2][quant][k]; |
880 | } |
881 | } |
882 | } |
883 | } |
884 | } |
885 | } |
886 | |
887 | /** |
888 | * @brief For 3 partitions compute the best format and quantization for a given bit count. |
889 | * |
890 | * @param best_combined_error The best error for each quant level and integer count. |
891 | * @param best_combined_format The best format for each quant level and integer count. |
892 | * @param bits_available The number of bits available for encoding. |
893 | * @param[out] best_quant_level The output best color quant level. |
894 | * @param[out] best_quant_level_mod The output best color quant level assuming two more bits are available. |
895 | * @param[out] best_formats The output best color formats. |
896 | * |
897 | * @return The output error for the best pairing. |
898 | */ |
899 | static float three_partitions_find_best_combination_for_bitcount( |
900 | const float best_combined_error[21][10], |
901 | const uint8_t best_combined_format[21][10][3], |
902 | int bits_available, |
903 | uint8_t& best_quant_level, |
904 | uint8_t& best_quant_level_mod, |
905 | uint8_t* best_formats |
906 | ) { |
907 | int best_integer_count = 0; |
908 | float best_integer_count_error = ERROR_CALC_DEFAULT; |
909 | |
910 | for (int integer_count = 3; integer_count <= 9; integer_count++) |
911 | { |
912 | // Compute the quantization level for a given number of integers and a given number of bits |
913 | int quant_level = quant_mode_table[integer_count][bits_available]; |
914 | |
915 | // Don't have enough bits to represent a given endpoint format at all! |
916 | if (quant_level < QUANT_6) |
917 | { |
918 | break; |
919 | } |
920 | |
921 | float integer_count_error = best_combined_error[quant_level][integer_count - 3]; |
922 | if (integer_count_error < best_integer_count_error) |
923 | { |
924 | best_integer_count_error = integer_count_error; |
925 | best_integer_count = integer_count; |
926 | } |
927 | } |
928 | |
929 | int ql = quant_mode_table[best_integer_count][bits_available]; |
930 | int ql_mod = quant_mode_table[best_integer_count][bits_available + 5]; |
931 | |
932 | best_quant_level = static_cast<uint8_t>(ql); |
933 | best_quant_level_mod = static_cast<uint8_t>(ql_mod); |
934 | |
935 | if (ql >= QUANT_6) |
936 | { |
937 | for (int i = 0; i < 3; i++) |
938 | { |
939 | best_formats[i] = best_combined_format[ql][best_integer_count - 3][i]; |
940 | } |
941 | } |
942 | else |
943 | { |
944 | for (int i = 0; i < 3; i++) |
945 | { |
946 | best_formats[i] = FMT_LUMINANCE; |
947 | } |
948 | } |
949 | |
950 | return best_integer_count_error; |
951 | } |
952 | |
953 | /** |
954 | * @brief For 4 partitions compute the best format combinations for every pair of quant mode and integer count. |
955 | * |
956 | * @param best_error The best error for a single endpoint quant level and integer count. |
957 | * @param best_format The best format for a single endpoint quant level and integer count. |
958 | * @param[out] best_combined_error The best combined error pairings for the 4 partitions. |
959 | * @param[out] best_combined_format The best combined format pairings for the 4 partitions. |
960 | */ |
961 | static void four_partitions_find_best_combination_for_every_quantization_and_integer_count( |
962 | const float best_error[4][21][4], // indexed by (partition, quant-level, integer-count) |
963 | const uint8_t best_format[4][21][4], |
964 | float best_combined_error[21][13], |
965 | uint8_t best_combined_format[21][13][4] |
966 | ) { |
967 | for (int i = QUANT_2; i <= QUANT_256; i++) |
968 | { |
969 | for (int j = 0; j < 13; j++) |
970 | { |
971 | best_combined_error[i][j] = ERROR_CALC_DEFAULT; |
972 | } |
973 | } |
974 | |
975 | for (int quant = QUANT_6; quant <= QUANT_256; quant++) |
976 | { |
977 | for (int i = 0; i < 4; i++) // integer-count for first endpoint-pair |
978 | { |
979 | for (int j = 0; j < 4; j++) // integer-count for second endpoint-pair |
980 | { |
981 | int low2 = astc::min(i, j); |
982 | int high2 = astc::max(i, j); |
983 | if ((high2 - low2) > 1) |
984 | { |
985 | continue; |
986 | } |
987 | |
988 | for (int k = 0; k < 4; k++) // integer-count for third endpoint-pair |
989 | { |
990 | int low3 = astc::min(k, low2); |
991 | int high3 = astc::max(k, high2); |
992 | if ((high3 - low3) > 1) |
993 | { |
994 | continue; |
995 | } |
996 | |
997 | for (int l = 0; l < 4; l++) // integer-count for fourth endpoint-pair |
998 | { |
999 | int low4 = astc::min(l, low3); |
1000 | int high4 = astc::max(l, high3); |
1001 | if ((high4 - low4) > 1) |
1002 | { |
1003 | continue; |
1004 | } |
1005 | |
1006 | int intcnt = i + j + k + l; |
1007 | float errorterm = astc::min(best_error[0][quant][i] + best_error[1][quant][j] + best_error[2][quant][k] + best_error[3][quant][l], 1e10f); |
1008 | if (errorterm <= best_combined_error[quant][intcnt]) |
1009 | { |
1010 | best_combined_error[quant][intcnt] = errorterm; |
1011 | best_combined_format[quant][intcnt][0] = best_format[0][quant][i]; |
1012 | best_combined_format[quant][intcnt][1] = best_format[1][quant][j]; |
1013 | best_combined_format[quant][intcnt][2] = best_format[2][quant][k]; |
1014 | best_combined_format[quant][intcnt][3] = best_format[3][quant][l]; |
1015 | } |
1016 | } |
1017 | } |
1018 | } |
1019 | } |
1020 | } |
1021 | } |
1022 | |
1023 | /** |
1024 | * @brief For 4 partitions compute the best format and quantization for a given bit count. |
1025 | * |
1026 | * @param best_combined_error The best error for each quant level and integer count. |
1027 | * @param best_combined_format The best format for each quant level and integer count. |
1028 | * @param bits_available The number of bits available for encoding. |
1029 | * @param[out] best_quant_level The output best color quant level. |
1030 | * @param[out] best_quant_level_mod The output best color quant level assuming two more bits are available. |
1031 | * @param[out] best_formats The output best color formats. |
1032 | * |
1033 | * @return best_error The output error for the best pairing. |
1034 | */ |
1035 | static float four_partitions_find_best_combination_for_bitcount( |
1036 | const float best_combined_error[21][13], |
1037 | const uint8_t best_combined_format[21][13][4], |
1038 | int bits_available, |
1039 | uint8_t& best_quant_level, |
1040 | uint8_t& best_quant_level_mod, |
1041 | uint8_t* best_formats |
1042 | ) { |
1043 | int best_integer_count = 0; |
1044 | float best_integer_count_error = ERROR_CALC_DEFAULT; |
1045 | |
1046 | for (int integer_count = 4; integer_count <= 9; integer_count++) |
1047 | { |
1048 | // Compute the quantization level for a given number of integers and a given number of bits |
1049 | int quant_level = quant_mode_table[integer_count][bits_available]; |
1050 | |
1051 | // Don't have enough bits to represent a given endpoint format at all! |
1052 | if (quant_level < QUANT_6) |
1053 | { |
1054 | break; |
1055 | } |
1056 | |
1057 | float integer_count_error = best_combined_error[quant_level][integer_count - 4]; |
1058 | if (integer_count_error < best_integer_count_error) |
1059 | { |
1060 | best_integer_count_error = integer_count_error; |
1061 | best_integer_count = integer_count; |
1062 | } |
1063 | } |
1064 | |
1065 | int ql = quant_mode_table[best_integer_count][bits_available]; |
1066 | int ql_mod = quant_mode_table[best_integer_count][bits_available + 8]; |
1067 | |
1068 | best_quant_level = static_cast<uint8_t>(ql); |
1069 | best_quant_level_mod = static_cast<uint8_t>(ql_mod); |
1070 | |
1071 | if (ql >= QUANT_6) |
1072 | { |
1073 | for (int i = 0; i < 4; i++) |
1074 | { |
1075 | best_formats[i] = best_combined_format[ql][best_integer_count - 4][i]; |
1076 | } |
1077 | } |
1078 | else |
1079 | { |
1080 | for (int i = 0; i < 4; i++) |
1081 | { |
1082 | best_formats[i] = FMT_LUMINANCE; |
1083 | } |
1084 | } |
1085 | |
1086 | return best_integer_count_error; |
1087 | } |
1088 | |
1089 | /* See header for documentation. */ |
1090 | unsigned int compute_ideal_endpoint_formats( |
1091 | const partition_info& pi, |
1092 | const image_block& blk, |
1093 | const endpoints& ep, |
1094 | // bitcounts and errors computed for the various quantization methods |
1095 | const int8_t* qwt_bitcounts, |
1096 | const float* qwt_errors, |
1097 | unsigned int tune_candidate_limit, |
1098 | unsigned int start_block_mode, |
1099 | unsigned int end_block_mode, |
1100 | // output data |
1101 | uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS], |
1102 | int block_mode[TUNE_MAX_TRIAL_CANDIDATES], |
1103 | quant_method quant_level[TUNE_MAX_TRIAL_CANDIDATES], |
1104 | quant_method quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES], |
1105 | compression_working_buffers& tmpbuf |
1106 | ) { |
1107 | int partition_count = pi.partition_count; |
1108 | |
1109 | promise(partition_count > 0); |
1110 | |
1111 | bool encode_hdr_rgb = static_cast<bool>(blk.rgb_lns[0]); |
1112 | bool encode_hdr_alpha = static_cast<bool>(blk.alpha_lns[0]); |
1113 | |
1114 | // Compute the errors that result from various encoding choices (such as using luminance instead |
1115 | // of RGB, discarding Alpha, using RGB-scale in place of two separate RGB endpoints and so on) |
1116 | encoding_choice_errors eci[BLOCK_MAX_PARTITIONS]; |
1117 | compute_encoding_choice_errors(blk, pi, ep, eci); |
1118 | |
1119 | float best_error[BLOCK_MAX_PARTITIONS][21][4]; |
1120 | uint8_t format_of_choice[BLOCK_MAX_PARTITIONS][21][4]; |
1121 | for (int i = 0; i < partition_count; i++) |
1122 | { |
1123 | compute_color_error_for_every_integer_count_and_quant_level( |
1124 | encode_hdr_rgb, encode_hdr_alpha, i, |
1125 | pi, eci[i], ep, blk.channel_weight, best_error[i], |
1126 | format_of_choice[i]); |
1127 | } |
1128 | |
1129 | float* errors_of_best_combination = tmpbuf.errors_of_best_combination; |
1130 | uint8_t* best_quant_levels = tmpbuf.best_quant_levels; |
1131 | uint8_t* best_quant_levels_mod = tmpbuf.best_quant_levels_mod; |
1132 | uint8_t (&best_ep_formats)[WEIGHTS_MAX_BLOCK_MODES][BLOCK_MAX_PARTITIONS] = tmpbuf.best_ep_formats; |
1133 | |
1134 | // Ensure that the first iteration understep contains data that will never be picked |
1135 | vfloat clear_error(ERROR_CALC_DEFAULT); |
1136 | vint clear_quant(0); |
1137 | |
1138 | unsigned int packed_start_block_mode = round_down_to_simd_multiple_vla(start_block_mode); |
1139 | storea(clear_error, errors_of_best_combination + packed_start_block_mode); |
1140 | store_nbytes(clear_quant, best_quant_levels + packed_start_block_mode); |
1141 | store_nbytes(clear_quant, best_quant_levels_mod + packed_start_block_mode); |
1142 | |
1143 | // Ensure that last iteration overstep contains data that will never be picked |
1144 | unsigned int packed_end_block_mode = round_down_to_simd_multiple_vla(end_block_mode - 1); |
1145 | storea(clear_error, errors_of_best_combination + packed_end_block_mode); |
1146 | store_nbytes(clear_quant, best_quant_levels + packed_end_block_mode); |
1147 | store_nbytes(clear_quant, best_quant_levels_mod + packed_end_block_mode); |
1148 | |
1149 | // Track a scalar best to avoid expensive search at least once ... |
1150 | float error_of_best_combination = ERROR_CALC_DEFAULT; |
1151 | int index_of_best_combination = -1; |
1152 | |
1153 | // The block contains 1 partition |
1154 | if (partition_count == 1) |
1155 | { |
1156 | for (unsigned int i = start_block_mode; i < end_block_mode; i++) |
1157 | { |
1158 | if (qwt_errors[i] >= ERROR_CALC_DEFAULT) |
1159 | { |
1160 | errors_of_best_combination[i] = ERROR_CALC_DEFAULT; |
1161 | continue; |
1162 | } |
1163 | |
1164 | float error_of_best = one_partition_find_best_combination_for_bitcount( |
1165 | best_error[0], format_of_choice[0], qwt_bitcounts[i], |
1166 | best_quant_levels[i], best_ep_formats[i][0]); |
1167 | |
1168 | float total_error = error_of_best + qwt_errors[i]; |
1169 | errors_of_best_combination[i] = total_error; |
1170 | best_quant_levels_mod[i] = best_quant_levels[i]; |
1171 | |
1172 | if (total_error < error_of_best_combination) |
1173 | { |
1174 | error_of_best_combination = total_error; |
1175 | index_of_best_combination = i; |
1176 | } |
1177 | } |
1178 | } |
1179 | // The block contains 2 partitions |
1180 | else if (partition_count == 2) |
1181 | { |
1182 | float combined_best_error[21][7]; |
1183 | uint8_t formats_of_choice[21][7][2]; |
1184 | |
1185 | two_partitions_find_best_combination_for_every_quantization_and_integer_count( |
1186 | best_error, format_of_choice, combined_best_error, formats_of_choice); |
1187 | |
1188 | assert(start_block_mode == 0); |
1189 | for (unsigned int i = 0; i < end_block_mode; i++) |
1190 | { |
1191 | if (qwt_errors[i] >= ERROR_CALC_DEFAULT) |
1192 | { |
1193 | errors_of_best_combination[i] = ERROR_CALC_DEFAULT; |
1194 | continue; |
1195 | } |
1196 | |
1197 | float error_of_best = two_partitions_find_best_combination_for_bitcount( |
1198 | combined_best_error, formats_of_choice, qwt_bitcounts[i], |
1199 | best_quant_levels[i], best_quant_levels_mod[i], |
1200 | best_ep_formats[i]); |
1201 | |
1202 | float total_error = error_of_best + qwt_errors[i]; |
1203 | errors_of_best_combination[i] = total_error; |
1204 | |
1205 | if (total_error < error_of_best_combination) |
1206 | { |
1207 | error_of_best_combination = total_error; |
1208 | index_of_best_combination = i; |
1209 | } |
1210 | } |
1211 | } |
1212 | // The block contains 3 partitions |
1213 | else if (partition_count == 3) |
1214 | { |
1215 | float combined_best_error[21][10]; |
1216 | uint8_t formats_of_choice[21][10][3]; |
1217 | |
1218 | three_partitions_find_best_combination_for_every_quantization_and_integer_count( |
1219 | best_error, format_of_choice, combined_best_error, formats_of_choice); |
1220 | |
1221 | assert(start_block_mode == 0); |
1222 | for (unsigned int i = 0; i < end_block_mode; i++) |
1223 | { |
1224 | if (qwt_errors[i] >= ERROR_CALC_DEFAULT) |
1225 | { |
1226 | errors_of_best_combination[i] = ERROR_CALC_DEFAULT; |
1227 | continue; |
1228 | } |
1229 | |
1230 | float error_of_best = three_partitions_find_best_combination_for_bitcount( |
1231 | combined_best_error, formats_of_choice, qwt_bitcounts[i], |
1232 | best_quant_levels[i], best_quant_levels_mod[i], |
1233 | best_ep_formats[i]); |
1234 | |
1235 | float total_error = error_of_best + qwt_errors[i]; |
1236 | errors_of_best_combination[i] = total_error; |
1237 | |
1238 | if (total_error < error_of_best_combination) |
1239 | { |
1240 | error_of_best_combination = total_error; |
1241 | index_of_best_combination = i; |
1242 | } |
1243 | } |
1244 | } |
1245 | // The block contains 4 partitions |
1246 | else // if (partition_count == 4) |
1247 | { |
1248 | assert(partition_count == 4); |
1249 | float combined_best_error[21][13]; |
1250 | uint8_t formats_of_choice[21][13][4]; |
1251 | |
1252 | four_partitions_find_best_combination_for_every_quantization_and_integer_count( |
1253 | best_error, format_of_choice, combined_best_error, formats_of_choice); |
1254 | |
1255 | assert(start_block_mode == 0); |
1256 | for (unsigned int i = 0; i < end_block_mode; i++) |
1257 | { |
1258 | if (qwt_errors[i] >= ERROR_CALC_DEFAULT) |
1259 | { |
1260 | errors_of_best_combination[i] = ERROR_CALC_DEFAULT; |
1261 | continue; |
1262 | } |
1263 | |
1264 | float error_of_best = four_partitions_find_best_combination_for_bitcount( |
1265 | combined_best_error, formats_of_choice, qwt_bitcounts[i], |
1266 | best_quant_levels[i], best_quant_levels_mod[i], |
1267 | best_ep_formats[i]); |
1268 | |
1269 | float total_error = error_of_best + qwt_errors[i]; |
1270 | errors_of_best_combination[i] = total_error; |
1271 | |
1272 | if (total_error < error_of_best_combination) |
1273 | { |
1274 | error_of_best_combination = total_error; |
1275 | index_of_best_combination = i; |
1276 | } |
1277 | } |
1278 | } |
1279 | |
1280 | int best_error_weights[TUNE_MAX_TRIAL_CANDIDATES]; |
1281 | |
1282 | // Fast path the first result and avoid the list search for trial 0 |
1283 | best_error_weights[0] = index_of_best_combination; |
1284 | if (index_of_best_combination >= 0) |
1285 | { |
1286 | errors_of_best_combination[index_of_best_combination] = ERROR_CALC_DEFAULT; |
1287 | } |
1288 | |
1289 | // Search the remaining results and pick the best candidate modes for trial 1+ |
1290 | for (unsigned int i = 1; i < tune_candidate_limit; i++) |
1291 | { |
1292 | vint vbest_error_index(-1); |
1293 | vfloat vbest_ep_error(ERROR_CALC_DEFAULT); |
1294 | |
1295 | start_block_mode = round_down_to_simd_multiple_vla(start_block_mode); |
1296 | vint lane_ids = vint::lane_id() + vint(start_block_mode); |
1297 | for (unsigned int j = start_block_mode; j < end_block_mode; j += ASTCENC_SIMD_WIDTH) |
1298 | { |
1299 | vfloat err = vfloat(errors_of_best_combination + j); |
1300 | vmask mask = err < vbest_ep_error; |
1301 | vbest_ep_error = select(vbest_ep_error, err, mask); |
1302 | vbest_error_index = select(vbest_error_index, lane_ids, mask); |
1303 | lane_ids += vint(ASTCENC_SIMD_WIDTH); |
1304 | } |
1305 | |
1306 | // Pick best mode from the SIMD result, using lowest matching index to ensure invariance |
1307 | vmask lanes_min_error = vbest_ep_error == hmin(vbest_ep_error); |
1308 | vbest_error_index = select(vint(0x7FFFFFFF), vbest_error_index, lanes_min_error); |
1309 | vbest_error_index = hmin(vbest_error_index); |
1310 | int best_error_index = vbest_error_index.lane<0>(); |
1311 | |
1312 | best_error_weights[i] = best_error_index; |
1313 | |
1314 | // Max the error for this candidate so we don't pick it again |
1315 | if (best_error_index >= 0) |
1316 | { |
1317 | errors_of_best_combination[best_error_index] = ERROR_CALC_DEFAULT; |
1318 | } |
1319 | // Early-out if no more candidates are valid |
1320 | else |
1321 | { |
1322 | break; |
1323 | } |
1324 | } |
1325 | |
1326 | for (unsigned int i = 0; i < tune_candidate_limit; i++) |
1327 | { |
1328 | if (best_error_weights[i] < 0) |
1329 | { |
1330 | return i; |
1331 | } |
1332 | |
1333 | block_mode[i] = best_error_weights[i]; |
1334 | |
1335 | quant_level[i] = static_cast<quant_method>(best_quant_levels[best_error_weights[i]]); |
1336 | quant_level_mod[i] = static_cast<quant_method>(best_quant_levels_mod[best_error_weights[i]]); |
1337 | |
1338 | assert(quant_level[i] >= QUANT_6 && quant_level[i] <= QUANT_256); |
1339 | assert(quant_level_mod[i] >= QUANT_6 && quant_level_mod[i] <= QUANT_256); |
1340 | |
1341 | for (int j = 0; j < partition_count; j++) |
1342 | { |
1343 | partition_format_specifiers[i][j] = best_ep_formats[best_error_weights[i]][j]; |
1344 | } |
1345 | } |
1346 | |
1347 | return tune_candidate_limit; |
1348 | } |
1349 | |
1350 | #endif |
1351 | |