// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

#if !defined(ASTCENC_DECOMPRESS_ONLY)

/**
 * @brief Functions to compress a symbolic block.
 */

#include "astcenc_internal.h"
#include "astcenc_diagnostic_trace.h"

#include <cassert>

/**
 * @brief Merge two planes of endpoints into a single vector.
 *
 * @param ep_plane1 The endpoints for plane 1.
 * @param ep_plane2 The endpoints for plane 2.
 * @param component_plane2 The color component for plane 2.
 * @param[out] result The merged output.
 */
static void merge_endpoints(
	const endpoints& ep_plane1,
	const endpoints& ep_plane2,
	unsigned int component_plane2,
	endpoints& result
) {
	unsigned int partition_count = ep_plane1.partition_count;
	assert(partition_count == 1);

	vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2);

	result.partition_count = partition_count;
	result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask);
	result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask);
}

/**
 * @brief Attempt to improve weights given a chosen configuration.
 *
 * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
 * partition and per plane) and attempt to improve image quality by moving each weight up by one or
 * down by one quantization step.
 *
 * This is a specialized function which only supports operating on undecimated weight grids,
 * therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation
 * is needed less often.
 *
 * @param decode_mode The decode mode (LDR, HDR).
 * @param bsd The block size information.
 * @param blk The image block color data to compress.
 * @param[in,out] scb The symbolic compressed block to improve.
 *
 * @return @c true if any weights were changed, @c false otherwise.
 */
static bool realign_weights_undecimated(
	astcenc_profile decode_mode,
	const block_size_descriptor& bsd,
	const image_block& blk,
	symbolic_compressed_block& scb
) {
	// Get the partition descriptor
	unsigned int partition_count = scb.partition_count;
	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);

	// Get the quantization table
	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
	unsigned int weight_quant_level = bm.quant_mode;
	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];

	unsigned int max_plane = bm.is_dual_plane;
	int plane2_component = scb.plane2_component;
	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
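	// For single-plane blocks plane2_component is -1, so this mask is all-false and the
	// plane 2 select on the endpoint deltas below becomes a no-op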

	// Decode the color endpoints
	bool rgb_hdr;
	bool alpha_hdr;
	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
	vfloat4 offset[BLOCK_MAX_PARTITIONS];

	promise(partition_count > 0);

	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
	{
		unpack_color_endpoints(decode_mode,
			scb.color_formats[pa_idx],
			scb.color_values[pa_idx],
			rgb_hdr, alpha_hdr,
			endpnt0[pa_idx],
			endpnt1[pa_idx]);
	}

	uint8_t* dec_weights_uquant = scb.weights;
	bool adjustments = false;

	// For each plane and partition ...
	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
	{
		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
		{
			// Compute the endpoint delta for all components in current plane
			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
			epd = select(epd, vint4::zero(), plane_mask);

			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
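			// The delta is scaled by 1/64 because unquantized ASTC weights span [0, 64];
			// interpolated colors are then formed below as endpoint0 + weight * offset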
		}

		// For each weight compute previous, current, and next errors
		promise(bsd.texel_count > 0);
		for (unsigned int texel = 0; texel < bsd.texel_count; texel++)
		{
			int uqw = dec_weights_uquant[texel];

			uint32_t prev_and_next = qat.prev_next_values[uqw];
			int uqw_down = prev_and_next & 0xFF;
			int uqw_up = (prev_and_next >> 8) & 0xFF;
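			// The transfer table packs, for each unquantized value, the nearest representable
			// weight one quantization step down (byte 0) and one step up (byte 1)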

			// Interpolate the colors to create the diffs
			float weight_base = static_cast<float>(uqw);
			float weight_down = static_cast<float>(uqw_down - uqw);
			float weight_up = static_cast<float>(uqw_up - uqw);

			unsigned int partition = pi.partition_of_texel[texel];
			vfloat4 color_offset = offset[partition];
			vfloat4 color_base = endpnt0f[partition];

			vfloat4 color = color_base + color_offset * weight_base;
			vfloat4 orig_color = blk.texel(texel);
			vfloat4 error_weight = blk.channel_weight;

			vfloat4 color_diff = color - orig_color;
			vfloat4 color_diff_down = color_diff + color_offset * weight_down;
			vfloat4 color_diff_up = color_diff + color_offset * weight_up;

			float error_base = dot_s(color_diff * color_diff, error_weight);
			float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
			float error_up = dot_s(color_diff_up * color_diff_up, error_weight);

			// Check if the prev or next error is better, and if so use it
			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
			{
				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
				adjustments = true;
			}
			else if ((error_down < error_base) && (uqw > 0))
			{
				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
				adjustments = true;
			}
		}

		// Prepare iteration for plane 2
		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
		plane_mask = ~plane_mask;
	}

	return adjustments;
}

/**
 * @brief Attempt to improve weights given a chosen configuration.
 *
 * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
 * partition and per plane) and attempt to improve image quality by moving each weight up by one or
 * down by one quantization step.
 *
 * @param decode_mode The decode mode (LDR, HDR).
 * @param bsd The block size information.
 * @param blk The image block color data to compress.
 * @param[in,out] scb The symbolic compressed block to improve.
 *
 * @return @c true if any weights were changed, @c false otherwise.
 */
static bool realign_weights_decimated(
	astcenc_profile decode_mode,
	const block_size_descriptor& bsd,
	const image_block& blk,
	symbolic_compressed_block& scb
) {
	// Get the partition descriptor
	unsigned int partition_count = scb.partition_count;
	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);

	// Get the quantization table
	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
	unsigned int weight_quant_level = bm.quant_mode;
	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];

	// Get the decimation table
	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
	unsigned int weight_count = di.weight_count;
	assert(weight_count != bsd.texel_count);

	unsigned int max_plane = bm.is_dual_plane;
	int plane2_component = scb.plane2_component;
	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);

	// Decode the color endpoints
	bool rgb_hdr;
	bool alpha_hdr;
	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
	vfloat4 offset[BLOCK_MAX_PARTITIONS];

	promise(partition_count > 0);
	promise(weight_count > 0);

	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
	{
		unpack_color_endpoints(decode_mode,
			scb.color_formats[pa_idx],
			scb.color_values[pa_idx],
			rgb_hdr, alpha_hdr,
			endpnt0[pa_idx],
			endpnt1[pa_idx]);
	}

	uint8_t* dec_weights_uquant = scb.weights;
	bool adjustments = false;

	// For each plane and partition ...
	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
	{
		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
		{
			// Compute the endpoint delta for all components in current plane
			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
			epd = select(epd, vint4::zero(), plane_mask);

			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
		}

		// Create an unquantized weight grid for this decimation level
		alignas(ASTCENC_VECALIGN) float uq_weightsf[BLOCK_MAX_WEIGHTS];
		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
		{
			vint unquant_value(dec_weights_uquant + we_idx);
			vfloat unquant_valuef = int_to_float(unquant_value);
			storea(unquant_valuef, uq_weightsf + we_idx);
		}

		// For each weight compute previous, current, and next errors
		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
		{
			int uqw = dec_weights_uquant[we_idx];
			uint32_t prev_and_next = qat.prev_next_values[uqw];

			float uqw_base = uq_weightsf[we_idx];
			float uqw_down = static_cast<float>(prev_and_next & 0xFF);
			float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF);

			float uqw_diff_down = uqw_down - uqw_base;
			float uqw_diff_up = uqw_up - uqw_base;

			vfloat4 error_basev = vfloat4::zero();
			vfloat4 error_downv = vfloat4::zero();
			vfloat4 error_upv = vfloat4::zero();

			// Interpolate the colors to create the diffs
			unsigned int texels_to_evaluate = di.weight_texel_count[we_idx];
			promise(texels_to_evaluate > 0);
			for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++)
			{
				unsigned int texel = di.weight_texels_tr[te_idx][we_idx];

				float tw_base = di.texel_contrib_for_weight[te_idx][we_idx];

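				// Reconstruct this texel's effective weight by blending the (up to) four
				// stored weights that contribute to it, using the transposed contribution tables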
				float weight_base = (uq_weightsf[di.texel_weights_tr[0][texel]] * di.texel_weight_contribs_float_tr[0][texel]
					+ uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel])
					+ (uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel]
					+ uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]);

				// Ideally this would be integer rounded, but the IQ gain isn't worth the overhead
				// float weight = astc::flt_rd(weight_base + 0.5f);
				// float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight;
				// float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight;
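				// Without the rounding the weight_base terms cancel; the expressions are kept in
				// this form so they stay structurally parallel to the rounded variants above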
				float weight_down = weight_base + uqw_diff_down * tw_base - weight_base;
				float weight_up = weight_base + uqw_diff_up * tw_base - weight_base;

				unsigned int partition = pi.partition_of_texel[texel];
				vfloat4 color_offset = offset[partition];
				vfloat4 color_base = endpnt0f[partition];

				vfloat4 color = color_base + color_offset * weight_base;
				vfloat4 orig_color = blk.texel(texel);

				vfloat4 color_diff = color - orig_color;
				vfloat4 color_down_diff = color_diff + color_offset * weight_down;
				vfloat4 color_up_diff = color_diff + color_offset * weight_up;

				error_basev += color_diff * color_diff;
				error_downv += color_down_diff * color_down_diff;
				error_upv += color_up_diff * color_up_diff;
			}

			vfloat4 error_weight = blk.channel_weight;
			float error_base = hadd_s(error_basev * error_weight);
			float error_down = hadd_s(error_downv * error_weight);
			float error_up = hadd_s(error_upv * error_weight);

			// Check if the prev or next error is better, and if so use it
			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
			{
				uq_weightsf[we_idx] = uqw_up;
				dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up);
				adjustments = true;
			}
			else if ((error_down < error_base) && (uqw > 0))
			{
				uq_weightsf[we_idx] = uqw_down;
				dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down);
				adjustments = true;
			}
		}

		// Prepare iteration for plane 2
		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
		plane_mask = ~plane_mask;
	}

	return adjustments;
}

/**
 * @brief Compress a block using a chosen partitioning and 1 plane of weights.
 *
 * @param config The compressor configuration.
 * @param bsd The block size information.
 * @param blk The image block color data to compress.
 * @param only_always True if we only use "always" percentile block modes.
 * @param tune_errorval_threshold The error value threshold.
 * @param partition_count The partition count.
 * @param partition_index The partition index if @c partition_count is 2-4.
 * @param[out] scb The symbolic compressed block output.
 * @param[out] tmpbuf Preallocated scratch buffers for the compressor.
 * @param quant_limit The maximum allowed weight quantization level.
 *
 * @return The best error value found across the trialed candidate encodings.
 */
static float compress_symbolic_block_for_partition_1plane(
	const astcenc_config& config,
	const block_size_descriptor& bsd,
	const image_block& blk,
	bool only_always,
	float tune_errorval_threshold,
	unsigned int partition_count,
	unsigned int partition_index,
	symbolic_compressed_block& scb,
	compression_working_buffers& tmpbuf,
	int quant_limit
) {
	promise(partition_count > 0);
	promise(config.tune_candidate_limit > 0);
	promise(config.tune_refinement_limit > 0);

	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);

	auto compute_difference = &compute_symbolic_block_difference_1plane;
	if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM))
	{
		compute_difference = &compute_symbolic_block_difference_1plane_1partition;
	}

	const auto& pi = bsd.get_partition_info(partition_count, partition_index);

	// Compute ideal weights and endpoint colors, with no quantization or decimation
	endpoints_and_weights& ei = tmpbuf.ei1;
	compute_ideal_colors_and_weights_1plane(blk, pi, ei);

	// Compute ideal weights and endpoint colors for every decimation
	float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
	uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;

	// For each decimation mode, compute an ideal set of weights with no quantization
	unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
		: bsd.decimation_mode_count_selected;
	promise(max_decimation_modes > 0);
	for (unsigned int i = 0; i < max_decimation_modes; i++)
	{
		const auto& dm = bsd.get_decimation_mode(i);
		if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
		{
			continue;
		}

		const auto& di = bsd.get_decimation_info(i);

		compute_ideal_weights_for_decimation(
			ei,
			di,
			dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
	}

	// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
	// weight pair, compute the smallest weight that will result in a color value greater than 1
	vfloat4 min_ep(10.0f);
	for (unsigned int i = 0; i < partition_count; i++)
	{
		vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]);
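		// Per channel, this is the weight at which the interpolated color reaches 1.0; the mask
		// below keeps only channels that cross above a weight of 0.5 and tracks the minimum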

		vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep);
		min_ep = select(min_ep, ep, use_ep);
	}

	float min_wt_cutoff = hmin_s(min_ep);

	// For each mode, use the angular method to compute a shift
	compute_angular_endpoints_1plane(
		only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);

	float* weight_low_value = tmpbuf.weight_low_value1;
	float* weight_high_value = tmpbuf.weight_high_value1;
	int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
	float* qwt_errors = tmpbuf.qwt_errors;

	// For each mode (which specifies a decimation and a quantization):
	// * Compute number of bits needed for the quantized weights
	// * Generate an optimized set of quantized weights
	// * Compute quantization errors for the mode

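	// Bits left over for color endpoint data for 1-4 partitions, before the per-mode weight
	// bits are subtracted below; multi-partition entries also pay for the partition index field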
	static const int8_t free_bits_for_partition_count[4] {
		115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS
	};

	unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
		: bsd.block_mode_count_1plane_selected;
	promise(max_block_modes > 0);
	for (unsigned int i = 0; i < max_block_modes; i++)
	{
		const block_mode& bm = bsd.block_modes[i];

		if (bm.quant_mode > max_weight_quant)
		{
			qwt_errors[i] = 1e38f;
			continue;
		}

		assert(!bm.is_dual_plane);
		int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits;
		if (bitcount <= 0)
		{
			qwt_errors[i] = 1e38f;
			continue;
		}

		if (weight_high_value[i] > 1.02f * min_wt_cutoff)
		{
			weight_high_value[i] = 1.0f;
		}

		int decimation_mode = bm.decimation_mode;
		const auto& di = bsd.get_decimation_info(decimation_mode);

		qwt_bitcounts[i] = static_cast<int8_t>(bitcount);

		alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];

		// Generate the optimized set of weights for the weight mode
		compute_quantized_weights_for_decimation(
			di,
			weight_low_value[i], weight_high_value[i],
			dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
			dec_weights_uquantf,
			dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
			bm.get_weight_quant_mode());

		// Compute weight quantization errors for the block mode
		qwt_errors[i] = compute_error_of_weight_set_1plane(
			ei,
			di,
			dec_weights_uquantf);
	}

	// Decide the optimal combination of color endpoint encodings and weight encodings
	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
	int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];

	quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];

	unsigned int candidate_count = compute_ideal_endpoint_formats(
		pi, blk, ei.ep, qwt_bitcounts, qwt_errors,
		config.tune_candidate_limit, 0, max_block_modes,
		partition_format_specifiers, block_mode_index,
		color_quant_level, color_quant_level_mod, tmpbuf);

	// Iterate over the N believed-to-be-best modes to find out which one is actually best
	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
	float best_errorval_in_scb = scb.errorval;

	for (unsigned int i = 0; i < candidate_count; i++)
	{
		TRACE_NODE(node0, "candidate");

		const int bm_packed_index = block_mode_index[i];
		assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected));
		const block_mode& qw_bm = bsd.block_modes[bm_packed_index];

		int decimation_mode = qw_bm.decimation_mode;
		const auto& di = bsd.get_decimation_info(decimation_mode);
		promise(di.weight_count > 0);

		trace_add_data("weight_x", di.weight_x);
		trace_add_data("weight_y", di.weight_y);
		trace_add_data("weight_z", di.weight_z);
		trace_add_data("weight_quant", qw_bm.quant_mode);

		// Recompute the ideal color endpoints before storing them
		vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS];
		vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS];

		symbolic_compressed_block workscb;
		endpoints workep = ei.ep;

		uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;

		for (unsigned int j = 0; j < di.weight_count; j++)
		{
			workscb.weights[j] = u8_weight_src[j];
		}

		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
		{
			recompute_ideal_colors_1plane(
				blk, pi, di, workscb.weights,
				workep, rgbs_colors, rgbo_colors);

			// Quantize the chosen color, tracking if worth trying the mod value
			bool all_same = color_quant_level[i] != color_quant_level_mod[i];
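			// The mod path is only worth trying if its quant level actually differs from the
			// base one; it additionally requires every partition to share a color format,
			// which is checked as the formats are packed below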
			for (unsigned int j = 0; j < partition_count; j++)
			{
				workscb.color_formats[j] = pack_color_endpoints(
					workep.endpt0[j],
					workep.endpt1[j],
					rgbs_colors[j],
					rgbo_colors[j],
					partition_format_specifiers[i][j],
					workscb.color_values[j],
					color_quant_level[i]);

				all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];
			}

			// If all the color endpoint modes are the same, we get a few more bits to store colors;
			// let's see if we can take advantage of this: requantize all the colors and see if the
			// endpoint modes remain the same.
			workscb.color_formats_matched = 0;
			if (partition_count >= 2 && all_same)
			{
				uint8_t colorvals[BLOCK_MAX_PARTITIONS][8];
				uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };
				bool all_same_mod = true;
				for (unsigned int j = 0; j < partition_count; j++)
				{
					color_formats_mod[j] = pack_color_endpoints(
						workep.endpt0[j],
						workep.endpt1[j],
						rgbs_colors[j],
						rgbo_colors[j],
						partition_format_specifiers[i][j],
						colorvals[j],
						color_quant_level_mod[i]);

					// Early out as soon as it's no longer possible to use mod
					if (color_formats_mod[j] != color_formats_mod[0])
					{
						all_same_mod = false;
						break;
					}
				}

				if (all_same_mod)
				{
					workscb.color_formats_matched = 1;
					for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++)
					{
						for (unsigned int k = 0; k < 8; k++)
						{
							workscb.color_values[j][k] = colorvals[j][k];
						}

						workscb.color_formats[j] = color_formats_mod[j];
					}
				}
			}

			// Store header fields
			workscb.partition_count = static_cast<uint8_t>(partition_count);
			workscb.partition_index = static_cast<uint16_t>(partition_index);
			workscb.plane2_component = -1;
			workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i];
			workscb.block_mode = qw_bm.mode_index;
			workscb.block_type = SYM_BTYPE_NONCONST;

			// Pre-realign test
			if (l == 0)
			{
				float errorval = compute_difference(config, bsd, workscb, blk);
				if (errorval == -ERROR_CALC_DEFAULT)
				{
					errorval = -errorval;
					workscb.block_type = SYM_BTYPE_ERROR;
				}

				trace_add_data("error_prerealign", errorval);
				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
				// iteration can help more, so we give it an extra 8% leeway. Use this knowledge to
				// drive a heuristic to skip blocks that are unlikely to catch up with the best
				// block we have already.
				unsigned int iters_remaining = config.tune_refinement_limit - l;
				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
				if (errorval > (threshold * best_errorval_in_scb))
				{
					break;
				}

				if (errorval < best_errorval_in_scb)
				{
					best_errorval_in_scb = errorval;
					workscb.errorval = errorval;
					scb = workscb;

					if (errorval < tune_errorval_threshold)
					{
						// Skip remaining candidates - this is "good enough"
						i = candidate_count;
						break;
					}
				}
			}

			bool adjustments;
			if (di.weight_count != bsd.texel_count)
			{
				adjustments = realign_weights_decimated(
					config.profile, bsd, blk, workscb);
			}
			else
			{
				adjustments = realign_weights_undecimated(
					config.profile, bsd, blk, workscb);
			}

			// Post-realign test
			float errorval = compute_difference(config, bsd, workscb, blk);
			if (errorval == -ERROR_CALC_DEFAULT)
			{
				errorval = -errorval;
				workscb.block_type = SYM_BTYPE_ERROR;
			}

			trace_add_data("error_postrealign", errorval);
			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
			// unlikely to catch up with the best block we have already. Assume 4.5% per step to
			// give the benefit of the doubt ...
			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
			if (errorval > (threshold * best_errorval_in_scb))
			{
				break;
			}

			if (errorval < best_errorval_in_scb)
			{
				best_errorval_in_scb = errorval;
				workscb.errorval = errorval;
				scb = workscb;

				if (errorval < tune_errorval_threshold)
				{
					// Skip remaining candidates - this is "good enough"
					i = candidate_count;
					break;
				}
			}

			if (!adjustments)
			{
				break;
			}
		}
	}

	return best_errorval_in_mode;
}

/**
 * @brief Compress a block using a chosen partitioning and 2 planes of weights.
 *
 * @param config The compressor configuration.
 * @param bsd The block size information.
 * @param blk The image block color data to compress.
 * @param tune_errorval_threshold The error value threshold.
 * @param plane2_component The component index for the second plane of weights.
 * @param[out] scb The symbolic compressed block output.
 * @param[out] tmpbuf Preallocated scratch buffers for the compressor.
 * @param quant_limit The maximum allowed weight quantization level.
 *
 * @return The best error value found across the trialed candidate encodings.
 */
static float compress_symbolic_block_for_partition_2planes(
	const astcenc_config& config,
	const block_size_descriptor& bsd,
	const image_block& blk,
	float tune_errorval_threshold,
	unsigned int plane2_component,
	symbolic_compressed_block& scb,
	compression_working_buffers& tmpbuf,
	int quant_limit
) {
	promise(config.tune_candidate_limit > 0);
	promise(config.tune_refinement_limit > 0);
	promise(bsd.decimation_mode_count_selected > 0);

	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);

	// Compute ideal weights and endpoint colors, with no quantization or decimation
	endpoints_and_weights& ei1 = tmpbuf.ei1;
	endpoints_and_weights& ei2 = tmpbuf.ei2;

	compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2);

	// Compute ideal weights and endpoint colors for every decimation
	float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
	uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;

	// For each decimation mode, compute an ideal set of weights with no quantization
	for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
	{
		const auto& dm = bsd.get_decimation_mode(i);
		if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
		{
			continue;
		}

		const auto& di = bsd.get_decimation_info(i);

		compute_ideal_weights_for_decimation(
			ei1,
			di,
			dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);

		compute_ideal_weights_for_decimation(
			ei2,
			di,
			dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);
	}

	// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
	// weight pair, compute the smallest weight that will result in a color value greater than 1
	vfloat4 min_ep1(10.0f);
	vfloat4 min_ep2(10.0f);

	vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]);
	vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1);
	min_ep1 = select(min_ep1, ep1, use_ep1);

	vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]);
	vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2);
	min_ep2 = select(min_ep2, ep2, use_ep2);

	vfloat4 err_max(ERROR_CALC_DEFAULT);
	vmask4 err_mask = vint4::lane_id() == vint4(plane2_component);

	// Set the plane2 component to max error in ep1
	min_ep1 = select(min_ep1, err_max, err_mask);

	float min_wt_cutoff1 = hmin_s(min_ep1);

	// Set the minwt2 to the plane2 component min in ep2
	float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));

	compute_angular_endpoints_2planes(
		bsd, dec_weights_ideal, max_weight_quant, tmpbuf);

	// For each mode (which specifies a decimation and a quantization):
	// * Compute number of bits needed for the quantized weights
	// * Generate an optimized set of quantized weights
	// * Compute quantization errors for the mode

	float* weight_low_value1 = tmpbuf.weight_low_value1;
	float* weight_high_value1 = tmpbuf.weight_high_value1;
	float* weight_low_value2 = tmpbuf.weight_low_value2;
	float* weight_high_value2 = tmpbuf.weight_high_value2;

	int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
	float* qwt_errors = tmpbuf.qwt_errors;

	unsigned int start_2plane = bsd.block_mode_count_1plane_selected;
	unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected;

	for (unsigned int i = start_2plane; i < end_2plane; i++)
	{
		const block_mode& bm = bsd.block_modes[i];
		assert(bm.is_dual_plane);

		if (bm.quant_mode > max_weight_quant)
		{
			qwt_errors[i] = 1e38f;
			continue;
		}

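		// 109 is the bit budget left for color data in a single-partition, dual-plane block
		// before the weight bits are removed (assumed: 128 bits minus the block mode, partition
		// count, CEM, and plane 2 component selector fields)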
		qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits);

		if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)
		{
			weight_high_value1[i] = 1.0f;
		}

		if (weight_high_value2[i] > 1.02f * min_wt_cutoff2)
		{
			weight_high_value2[i] = 1.0f;
		}

		unsigned int decimation_mode = bm.decimation_mode;
		const auto& di = bsd.get_decimation_info(decimation_mode);

		alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];

		// Generate the optimized set of weights for the mode
		compute_quantized_weights_for_decimation(
			di,
			weight_low_value1[i],
			weight_high_value1[i],
			dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
			dec_weights_uquantf,
			dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
			bm.get_weight_quant_mode());

		compute_quantized_weights_for_decimation(
			di,
			weight_low_value2[i],
			weight_high_value2[i],
			dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,
			dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET,
			dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
			bm.get_weight_quant_mode());

		// Compute weight quantization errors for the block mode
		qwt_errors[i] = compute_error_of_weight_set_2planes(
			ei1,
			ei2,
			di,
			dec_weights_uquantf,
			dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET);
	}

	// Decide the optimal combination of color endpoint encodings and weight encodings
	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
	int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];

	quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];

	endpoints epm;
	merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm);

	const auto& pi = bsd.get_partition_info(1, 0);
	unsigned int candidate_count = compute_ideal_endpoint_formats(
		pi, blk, epm, qwt_bitcounts, qwt_errors,
		config.tune_candidate_limit,
		bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected,
		partition_format_specifiers, block_mode_index,
		color_quant_level, color_quant_level_mod, tmpbuf);

	// Iterate over the N believed-to-be-best modes to find out which one is actually best
	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
	float best_errorval_in_scb = scb.errorval;

	for (unsigned int i = 0; i < candidate_count; i++)
	{
		TRACE_NODE(node0, "candidate");

		const int bm_packed_index = block_mode_index[i];
		assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) &&
			bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected));
		const block_mode& qw_bm = bsd.block_modes[bm_packed_index];

		int decimation_mode = qw_bm.decimation_mode;
		const auto& di = bsd.get_decimation_info(decimation_mode);
		promise(di.weight_count > 0);

		trace_add_data("weight_x", di.weight_x);
		trace_add_data("weight_y", di.weight_y);
		trace_add_data("weight_z", di.weight_z);
		trace_add_data("weight_quant", qw_bm.quant_mode);

		vfloat4 rgbs_color;
		vfloat4 rgbo_color;

		symbolic_compressed_block workscb;
		endpoints workep = epm;

		uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
		uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;

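		// Copy the candidate quantized weights for both planes into the working block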
		for (unsigned int j = 0; j < di.weight_count; j++)
		{
			workscb.weights[j] = u8_weight1_src[j];
			workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j];
		}

		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
		{
			recompute_ideal_colors_2planes(
				blk, bsd, di,
				workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
				workep, rgbs_color, rgbo_color, plane2_component);

			// Quantize the chosen color
			workscb.color_formats[0] = pack_color_endpoints(
				workep.endpt0[0],
				workep.endpt1[0],
				rgbs_color, rgbo_color,
				partition_format_specifiers[i][0],
				workscb.color_values[0],
				color_quant_level[i]);

			// Store header fields
			workscb.partition_count = 1;
			workscb.partition_index = 0;
			workscb.quant_mode = color_quant_level[i];
			workscb.color_formats_matched = 0;
			workscb.block_mode = qw_bm.mode_index;
			workscb.plane2_component = static_cast<int8_t>(plane2_component);
			workscb.block_type = SYM_BTYPE_NONCONST;

			// Pre-realign test
			if (l == 0)
			{
				float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
				if (errorval == -ERROR_CALC_DEFAULT)
				{
					errorval = -errorval;
					workscb.block_type = SYM_BTYPE_ERROR;
				}

				trace_add_data("error_prerealign", errorval);
				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
				// iteration can help more, so we give it an extra 8% leeway. Use this knowledge to
				// drive a heuristic to skip blocks that are unlikely to catch up with the best
				// block we have already.
				unsigned int iters_remaining = config.tune_refinement_limit - l;
				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
				if (errorval > (threshold * best_errorval_in_scb))
				{
					break;
				}

				if (errorval < best_errorval_in_scb)
				{
					best_errorval_in_scb = errorval;
					workscb.errorval = errorval;
					scb = workscb;

					if (errorval < tune_errorval_threshold)
					{
						// Skip remaining candidates - this is "good enough"
						i = candidate_count;
						break;
					}
				}
			}

			// Perform a final pass over the weights to try to improve them.
			bool adjustments;
			if (di.weight_count != bsd.texel_count)
			{
				adjustments = realign_weights_decimated(
					config.profile, bsd, blk, workscb);
			}
			else
			{
				adjustments = realign_weights_undecimated(
					config.profile, bsd, blk, workscb);
			}

			// Post-realign test
			float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
			if (errorval == -ERROR_CALC_DEFAULT)
			{
				errorval = -errorval;
				workscb.block_type = SYM_BTYPE_ERROR;
			}

			trace_add_data("error_postrealign", errorval);
			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
			// unlikely to catch up with the best block we have already. Assume 4.5% per step to
			// give the benefit of the doubt ...
			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
			if (errorval > (threshold * best_errorval_in_scb))
			{
				break;
			}

			if (errorval < best_errorval_in_scb)
			{
				best_errorval_in_scb = errorval;
				workscb.errorval = errorval;
				scb = workscb;

				if (errorval < tune_errorval_threshold)
				{
					// Skip remaining candidates - this is "good enough"
					i = candidate_count;
					break;
				}
			}

			if (!adjustments)
			{
				break;
			}
		}
	}

	return best_errorval_in_mode;
}

/**
 * @brief Determine the lowest cross-channel correlation factor.
 *
 * @param texels_per_block The number of texels in a block.
 * @param blk The image block color data to compress.
 *
 * @return The lowest correlation factor.
 */
static float prepare_block_statistics(
	int texels_per_block,
	const image_block& blk
) {
	// Compute the covariance matrix, as a collection of 10 scalars that form the upper triangle of
	// the matrix. The matrix is symmetric, so this is all we need for this use case.
	float rs = 0.0f;
	float gs = 0.0f;
	float bs = 0.0f;
	float as = 0.0f;
	float rr_var = 0.0f;
	float gg_var = 0.0f;
	float bb_var = 0.0f;
	float aa_var = 0.0f;
	float rg_cov = 0.0f;
	float rb_cov = 0.0f;
	float ra_cov = 0.0f;
	float gb_cov = 0.0f;
	float ga_cov = 0.0f;
	float ba_cov = 0.0f;

	float weight_sum = 0.0f;

	promise(texels_per_block > 0);
	for (int i = 0; i < texels_per_block; i++)
	{
		float weight = hadd_s(blk.channel_weight) / 4.0f;
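		// The per-texel weight is the mean of the four channel error weights; it is uniform
		// across the block, so weight_sum effectively scales with the texel count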
		assert(weight >= 0.0f);
		weight_sum += weight;

		float r = blk.data_r[i];
		float g = blk.data_g[i];
		float b = blk.data_b[i];
		float a = blk.data_a[i];

		float rw = r * weight;
		rs += rw;
		rr_var += r * rw;
		rg_cov += g * rw;
		rb_cov += b * rw;
		ra_cov += a * rw;

		float gw = g * weight;
		gs += gw;
		gg_var += g * gw;
		gb_cov += b * gw;
		ga_cov += a * gw;

		float bw = b * weight;
		bs += bw;
		bb_var += b * bw;
		ba_cov += a * bw;

		float aw = a * weight;
		as += aw;
		aa_var += a * aw;
	}

	float rpt = 1.0f / astc::max(weight_sum, 1e-7f);

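	// Convert the raw weighted sums into central (co)variances by subtracting the product of the
	// means, i.e. cov(x, y) ~ sum(xy) - sum(x) * sum(y) / weight_sum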
	rr_var -= rs * (rs * rpt);
	rg_cov -= gs * (rs * rpt);
	rb_cov -= bs * (rs * rpt);
	ra_cov -= as * (rs * rpt);

	gg_var -= gs * (gs * rpt);
	gb_cov -= bs * (gs * rpt);
	ga_cov -= as * (gs * rpt);

	bb_var -= bs * (bs * rpt);
	ba_cov -= as * (bs * rpt);

	aa_var -= as * (as * rpt);

	// These will give a NaN if a channel is constant - these are fixed up in the next step
	rg_cov *= astc::rsqrt(rr_var * gg_var);
	rb_cov *= astc::rsqrt(rr_var * bb_var);
	ra_cov *= astc::rsqrt(rr_var * aa_var);
	gb_cov *= astc::rsqrt(gg_var * bb_var);
	ga_cov *= astc::rsqrt(gg_var * aa_var);
	ba_cov *= astc::rsqrt(bb_var * aa_var);

	if (astc::isnan(rg_cov)) rg_cov = 1.0f;
	if (astc::isnan(rb_cov)) rb_cov = 1.0f;
	if (astc::isnan(ra_cov)) ra_cov = 1.0f;
	if (astc::isnan(gb_cov)) gb_cov = 1.0f;
	if (astc::isnan(ga_cov)) ga_cov = 1.0f;
	if (astc::isnan(ba_cov)) ba_cov = 1.0f;

	float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov));
	lowest_correlation = astc::min(lowest_correlation, fabsf(ra_cov));
	lowest_correlation = astc::min(lowest_correlation, fabsf(gb_cov));
	lowest_correlation = astc::min(lowest_correlation, fabsf(ga_cov));
	lowest_correlation = astc::min(lowest_correlation, fabsf(ba_cov));

	// Diagnostic trace points
	trace_add_data("min_r", blk.data_min.lane<0>());
	trace_add_data("max_r", blk.data_max.lane<0>());
	trace_add_data("min_g", blk.data_min.lane<1>());
	trace_add_data("max_g", blk.data_max.lane<1>());
	trace_add_data("min_b", blk.data_min.lane<2>());
	trace_add_data("max_b", blk.data_max.lane<2>());
	trace_add_data("min_a", blk.data_min.lane<3>());
	trace_add_data("max_a", blk.data_max.lane<3>());
	trace_add_data("cov_rg", fabsf(rg_cov));
	trace_add_data("cov_rb", fabsf(rb_cov));
	trace_add_data("cov_ra", fabsf(ra_cov));
	trace_add_data("cov_gb", fabsf(gb_cov));
	trace_add_data("cov_ga", fabsf(ga_cov));
	trace_add_data("cov_ba", fabsf(ba_cov));

	return lowest_correlation;
}

/* See header for documentation. */
void compress_block(
	const astcenc_contexti& ctx,
	const image_block& blk,
	physical_compressed_block& pcb,
	compression_working_buffers& tmpbuf)
{
	astcenc_profile decode_mode = ctx.config.profile;
	symbolic_compressed_block scb;
	const block_size_descriptor& bsd = *ctx.bsd;
	float lowest_correl;

	TRACE_NODE(node0, "block");
	trace_add_data("pos_x", blk.xpos);
	trace_add_data("pos_y", blk.ypos);
	trace_add_data("pos_z", blk.zpos);

	// Set stricter block targets for luminance data as we have more bits to play with
	bool block_is_l = blk.is_luminance();
	float block_is_l_scale = block_is_l ? 1.0f / 1.5f : 1.0f;

	// Set slightly stricter block targets for lumalpha data as we have more bits to play with
	bool block_is_la = blk.is_luminancealpha();
	float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f;

	bool block_skip_two_plane = false;
	int max_partitions = ctx.config.tune_partition_count_limit;

	unsigned int requested_partition_indices[3] {
		ctx.config.tune_2partition_index_limit,
		ctx.config.tune_3partition_index_limit,
		ctx.config.tune_4partition_index_limit
	};

	unsigned int requested_partition_trials[3] {
		ctx.config.tune_2partitioning_candidate_limit,
		ctx.config.tune_3partitioning_candidate_limit,
		ctx.config.tune_4partitioning_candidate_limit
	};

#if defined(ASTCENC_DIAGNOSTICS)
	// Do this early in diagnostic builds so we can dump uniform metrics
	// for every block. Do it later in release builds to avoid redundant work!
	float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
	float error_threshold = ctx.config.tune_db_limit
		* error_weight_sum
		* block_is_l_scale
		* block_is_la_scale;

	lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
	trace_add_data("lowest_correl", lowest_correl);
	trace_add_data("tune_error_threshold", error_threshold);
#endif

	// Detected a constant-color block
	if (all(blk.data_min == blk.data_max))
	{
		TRACE_NODE(node1, "pass");
		trace_add_data("partition_count", 0);
		trace_add_data("plane_count", 1);

		scb.partition_count = 0;

		// Encode as FP16 if using HDR
		if ((decode_mode == ASTCENC_PRF_HDR) ||
			(decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))
		{
			scb.block_type = SYM_BTYPE_CONST_F16;
			vint4 color_f16 = float_to_float16(blk.origin_texel);
			store(color_f16, scb.constant_color);
		}
		// Encode as UNORM16 if NOT using HDR
		else
		{
			scb.block_type = SYM_BTYPE_CONST_U16;
			vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
			vint4 color_u16 = float_to_int_rtn(color_f32);
			store(color_u16, scb.constant_color);
		}

		trace_add_data("exit", "quality hit");

		symbolic_to_physical(bsd, scb, pcb);
		return;
	}

#if !defined(ASTCENC_DIAGNOSTICS)
	float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
	float error_threshold = ctx.config.tune_db_limit
		* error_weight_sum
		* block_is_l_scale
		* block_is_la_scale;
#endif

	// Set SCB and mode errors to a very high error value
	scb.errorval = ERROR_CALC_DEFAULT;
	scb.block_type = SYM_BTYPE_ERROR;

	float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] {
		ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT
	};

	float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] {
		0.0f,
		ctx.config.tune_2partition_early_out_limit_factor,
		ctx.config.tune_3partition_early_out_limit_factor,
		0.0f
	};

	// Trial using 1 plane of weights and 1 partition.

	// Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified
	// mode cutoff. This causes an early-out that speeds up encoding of easy blocks. However, this
	// optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the
	// compression and slightly reduces image quality.

	float errorval_mult[2] {
		1.0f / ctx.config.tune_mse_overshoot,
		1.0f
	};

	float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;

	// Only enable the MODE0 fast path (trial 0) for 2D blocks with at least
	// TUNE_MIN_TEXELS_MODE0_FASTPATH texels
	int start_trial = 1;
	if ((bsd.texel_count >= TUNE_MIN_TEXELS_MODE0_FASTPATH) && (bsd.zdim == 1))
	{
		start_trial = 0;
	}

	int quant_limit = QUANT_32;
	for (int i = start_trial; i < 2; i++)
	{
		TRACE_NODE(node1, "pass");
		trace_add_data("partition_count", 1);
		trace_add_data("plane_count", 1);
		trace_add_data("search_mode", i);

		float errorval = compress_symbolic_block_for_partition_1plane(
			ctx.config, bsd, blk, i == 0,
			error_threshold * errorval_mult[i] * errorval_overshoot,
			1, 0, scb, tmpbuf, QUANT_32);

		// Record the quant level so we can use it to filter later searches
		const auto& bm = bsd.get_block_mode(scb.block_mode);
		quant_limit = bm.get_weight_quant_mode();

		best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);
		if (errorval < (error_threshold * errorval_mult[i]))
		{
			trace_add_data("exit", "quality hit");
			goto END_OF_TESTS;
		}
	}

#if !defined(ASTCENC_DIAGNOSTICS)
	lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
#endif

	block_skip_two_plane = lowest_correl > ctx.config.tune_2plane_early_out_limit_correlation;

	// Test the four possible 1-partition, 2-planes modes. Do this in reverse, as
	// alpha is the most likely to be non-correlated if it is present in the data.
	for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--)
	{
		TRACE_NODE(node1, "pass");
		trace_add_data("partition_count", 1);
		trace_add_data("plane_count", 2);
		trace_add_data("plane_component", i);

		if (block_skip_two_plane)
		{
			trace_add_data("skip", "tune_2plane_early_out_limit_correlation");
			continue;
		}

		if (blk.grayscale && i != 3)
		{
			trace_add_data("skip", "grayscale block");
			continue;
		}

		if (blk.is_constant_channel(i))
		{
			trace_add_data("skip", "constant component");
			continue;
		}

		float errorval = compress_symbolic_block_for_partition_2planes(
			ctx.config, bsd, blk, error_threshold * errorval_overshoot,
			i, scb, tmpbuf, quant_limit);

		// If attempting two planes is much worse than the best one plane result
		// then further two plane searches are unlikely to help so move on ...
		if (errorval > (best_errorvals_for_pcount[0] * 1.85f))
		{
			break;
		}

		if (errorval < error_threshold)
		{
			trace_add_data("exit", "quality hit");
			goto END_OF_TESTS;
		}
	}

	// Find best blocks for 2, 3 and 4 partitions
	for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
	{
		unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES];

		unsigned int requested_indices = requested_partition_indices[partition_count - 2];

		unsigned int requested_trials = requested_partition_trials[partition_count - 2];
		requested_trials = astc::min(requested_trials, requested_indices);

		unsigned int actual_trials = find_best_partition_candidates(
			bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);

		float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];

		for (unsigned int i = 0; i < actual_trials; i++)
		{
			TRACE_NODE(node1, "pass");
			trace_add_data("partition_count", partition_count);
			trace_add_data("partition_index", partition_indices[i]);
			trace_add_data("plane_count", 1);
			trace_add_data("search_mode", i);

			float errorval = compress_symbolic_block_for_partition_1plane(
				ctx.config, bsd, blk, false,
				error_threshold * errorval_overshoot,
				partition_count, partition_indices[i],
				scb, tmpbuf, quant_limit);

			best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);

			// If using N partitions doesn't improve much over using N-1 partitions then skip trying
			// N+1. Error can dramatically improve if the data is correlated or non-correlated and
			// aligns with a partitioning that suits that encoding, so for this inner loop check add
			// a large error scale because the "other" trial could be a lot better.
			float best_error = best_errorvals_for_pcount[partition_count - 1];
			float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f;
			if (best_error > (best_error_in_prev * best_error_scale))
			{
				trace_add_data("skip", "tune_partition_early_out_limit_factor");
				goto END_OF_TESTS;
			}

			if (errorval < error_threshold)
			{
				trace_add_data("exit", "quality hit");
				goto END_OF_TESTS;
			}
		}

		// If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
		float best_error = best_errorvals_for_pcount[partition_count - 1];
		float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
		if (best_error > (best_error_in_prev * best_error_scale))
		{
			trace_add_data("skip", "tune_partition_early_out_limit_factor");
			goto END_OF_TESTS;
		}
	}

	trace_add_data("exit", "quality not hit");

END_OF_TESTS:
	// If we still have an error block then convert to something we can encode
	// TODO: Do something more sensible here, such as average color block
	if (scb.block_type == SYM_BTYPE_ERROR)
	{
#if defined(ASTCENC_DIAGNOSTICS)
		static bool printed_once = false;
		if (!printed_once)
		{
			printed_once = true;
			printf("WARN: At least one block failed to find a valid encoding.\n"
				" Try increasing compression quality settings.\n\n");
		}
#endif

		scb.block_type = SYM_BTYPE_CONST_U16;
		vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
		vint4 color_u16 = float_to_int_rtn(color_f32);
		store(color_u16, scb.constant_color);
	}

	// Compress to a physical block
	symbolic_to_physical(bsd, scb, pcb);
}

#endif