// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

#if !defined(ASTCENC_DECOMPRESS_ONLY)

/**
 * @brief Functions to compress a symbolic block.
 */

#include "astcenc_internal.h"
#include "astcenc_diagnostic_trace.h"

#include <cassert>

/**
 * @brief Merge two planes of endpoints into a single vector.
 *
 * @param ep_plane1 The endpoints for plane 1.
 * @param ep_plane2 The endpoints for plane 2.
 * @param component_plane2 The color component for plane 2.
 * @param[out] result The merged output.
 */
static void merge_endpoints(
	const endpoints& ep_plane1,
	const endpoints& ep_plane2,
	unsigned int component_plane2,
	endpoints& result
) {
	unsigned int partition_count = ep_plane1.partition_count;
	assert(partition_count == 1);

	vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2);

	result.partition_count = partition_count;
	result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask);
	result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask);
}

/**
 * @brief Attempt to improve weights given a chosen configuration.
 *
 * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
 * partition and per plane) and attempt to improve image quality by moving each weight up by one or
 * down by one quantization step.
 *
 * This is a specialized function which only supports operating on undecimated weight grids,
 * therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation
 * is needed less often.
 *
 * @param decode_mode The decode mode (LDR, HDR).
 * @param bsd The block size information.
 * @param blk The image block color data to compress.
 * @param[in,out] scb The symbolic compressed block to improve.
 *
 * @return @c true if any weights were changed, @c false otherwise.
 */
static bool realign_weights_undecimated(
	astcenc_profile decode_mode,
	const block_size_descriptor& bsd,
	const image_block& blk,
	symbolic_compressed_block& scb
) {
	// Get the partition descriptor
	unsigned int partition_count = scb.partition_count;
	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);

	// Get the quantization table
	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
	unsigned int weight_quant_level = bm.quant_mode;
	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];

	unsigned int max_plane = bm.is_dual_plane;
	int plane2_component = scb.plane2_component;
	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
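	// For single-plane blocks plane2_component is -1, so this mask is all-false and the
	// plane 2 select on the endpoint deltas below becomes a no-op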

	// Decode the color endpoints
	bool rgb_hdr;
	bool alpha_hdr;
	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
	vfloat4 offset[BLOCK_MAX_PARTITIONS];

	promise(partition_count > 0);

	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
	{
		unpack_color_endpoints(decode_mode,
			scb.color_formats[pa_idx],
			scb.color_values[pa_idx],
			rgb_hdr, alpha_hdr,
			endpnt0[pa_idx],
			endpnt1[pa_idx]);
	}

	uint8_t* dec_weights_uquant = scb.weights;
	bool adjustments = false;

	// For each plane and partition ...
	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
	{
		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
		{
			// Compute the endpoint delta for all components in current plane
			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
			epd = select(epd, vint4::zero(), plane_mask);

			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
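			// The delta is scaled by 1/64 because unquantized ASTC weights span [0, 64];
			// interpolated colors are then formed below as endpoint0 + weight * offset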
		}

		// For each weight compute previous, current, and next errors
		promise(bsd.texel_count > 0);
		for (unsigned int texel = 0; texel < bsd.texel_count; texel++)
		{
			int uqw = dec_weights_uquant[texel];

			uint32_t prev_and_next = qat.prev_next_values[uqw];
			int uqw_down = prev_and_next & 0xFF;
			int uqw_up = (prev_and_next >> 8) & 0xFF;
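			// The transfer table packs, for each unquantized value, the nearest representable
			// weight one quantization step down (byte 0) and one step up (byte 1)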

			// Interpolate the colors to create the diffs
			float weight_base = static_cast<float>(uqw);
			float weight_down = static_cast<float>(uqw_down - uqw);
			float weight_up = static_cast<float>(uqw_up - uqw);

			unsigned int partition = pi.partition_of_texel[texel];
			vfloat4 color_offset = offset[partition];
			vfloat4 color_base = endpnt0f[partition];

			vfloat4 color = color_base + color_offset * weight_base;
			vfloat4 orig_color = blk.texel(texel);
			vfloat4 error_weight = blk.channel_weight;

			vfloat4 color_diff = color - orig_color;
			vfloat4 color_diff_down = color_diff + color_offset * weight_down;
			vfloat4 color_diff_up = color_diff + color_offset * weight_up;

			float error_base = dot_s(color_diff * color_diff, error_weight);
			float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
			float error_up = dot_s(color_diff_up * color_diff_up, error_weight);

			// Check if the prev or next error is better, and if so use it
			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
			{
				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
				adjustments = true;
			}
			else if ((error_down < error_base) && (uqw > 0))
			{
				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
				adjustments = true;
			}
		}

		// Prepare iteration for plane 2
		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
		plane_mask = ~plane_mask;
	}

	return adjustments;
}

/**
 * @brief Attempt to improve weights given a chosen configuration.
 *
 * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
 * partition and per plane) and attempt to improve image quality by moving each weight up by one or
 * down by one quantization step.
 *
 * @param decode_mode The decode mode (LDR, HDR).
 * @param bsd The block size information.
 * @param blk The image block color data to compress.
 * @param[in,out] scb The symbolic compressed block to improve.
 *
 * @return @c true if any weights were changed, @c false otherwise.
 */
static bool realign_weights_decimated(
	astcenc_profile decode_mode,
	const block_size_descriptor& bsd,
	const image_block& blk,
	symbolic_compressed_block& scb
) {
	// Get the partition descriptor
	unsigned int partition_count = scb.partition_count;
	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);

	// Get the quantization table
	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
	unsigned int weight_quant_level = bm.quant_mode;
	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];

	// Get the decimation table
	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
	unsigned int weight_count = di.weight_count;
	assert(weight_count != bsd.texel_count);

	unsigned int max_plane = bm.is_dual_plane;
	int plane2_component = scb.plane2_component;
	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);

	// Decode the color endpoints
	bool rgb_hdr;
	bool alpha_hdr;
	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
	vfloat4 offset[BLOCK_MAX_PARTITIONS];

	promise(partition_count > 0);
	promise(weight_count > 0);

	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
	{
		unpack_color_endpoints(decode_mode,
			scb.color_formats[pa_idx],
			scb.color_values[pa_idx],
			rgb_hdr, alpha_hdr,
			endpnt0[pa_idx],
			endpnt1[pa_idx]);
	}

	uint8_t* dec_weights_uquant = scb.weights;
	bool adjustments = false;

	// For each plane and partition ...
	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
	{
		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
		{
			// Compute the endpoint delta for all components in current plane
			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
			epd = select(epd, vint4::zero(), plane_mask);

			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
		}

		// Create an unquantized weight grid for this decimation level
		alignas(ASTCENC_VECALIGN) float uq_weightsf[BLOCK_MAX_WEIGHTS];
		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
		{
			vint unquant_value(dec_weights_uquant + we_idx);
			vfloat unquant_valuef = int_to_float(unquant_value);
			storea(unquant_valuef, uq_weightsf + we_idx);
		}

		// For each weight compute previous, current, and next errors
		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
		{
			int uqw = dec_weights_uquant[we_idx];
			uint32_t prev_and_next = qat.prev_next_values[uqw];

			float uqw_base = uq_weightsf[we_idx];
			float uqw_down = static_cast<float>(prev_and_next & 0xFF);
			float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF);

			float uqw_diff_down = uqw_down - uqw_base;
			float uqw_diff_up = uqw_up - uqw_base;

			vfloat4 error_basev = vfloat4::zero();
			vfloat4 error_downv = vfloat4::zero();
			vfloat4 error_upv = vfloat4::zero();

			// Interpolate the colors to create the diffs
			unsigned int texels_to_evaluate = di.weight_texel_count[we_idx];
			promise(texels_to_evaluate > 0);
			for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++)
			{
				unsigned int texel = di.weight_texels_tr[te_idx][we_idx];

				float tw_base = di.texel_contrib_for_weight[te_idx][we_idx];

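				// Reconstruct this texel's effective weight by blending the (up to) four
				// stored weights that contribute to it, using the transposed contribution tables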
				float weight_base = (uq_weightsf[di.texel_weights_tr[0][texel]] * di.texel_weight_contribs_float_tr[0][texel]
					+ uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel])
					+ (uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel]
					+ uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]);

				// Ideally this would be integer rounded, but the IQ gain isn't worth the overhead
				// float weight = astc::flt_rd(weight_base + 0.5f);
				// float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight;
				// float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight;
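				// Without the rounding the weight_base terms cancel; the expressions are kept in
				// this form so they stay structurally parallel to the rounded variants above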
				float weight_down = weight_base + uqw_diff_down * tw_base - weight_base;
				float weight_up = weight_base + uqw_diff_up * tw_base - weight_base;

				unsigned int partition = pi.partition_of_texel[texel];
				vfloat4 color_offset = offset[partition];
				vfloat4 color_base = endpnt0f[partition];

				vfloat4 color = color_base + color_offset * weight_base;
				vfloat4 orig_color = blk.texel(texel);

				vfloat4 color_diff = color - orig_color;
				vfloat4 color_down_diff = color_diff + color_offset * weight_down;
				vfloat4 color_up_diff = color_diff + color_offset * weight_up;

				error_basev += color_diff * color_diff;
				error_downv += color_down_diff * color_down_diff;
				error_upv += color_up_diff * color_up_diff;
			}

			vfloat4 error_weight = blk.channel_weight;
			float error_base = hadd_s(error_basev * error_weight);
			float error_down = hadd_s(error_downv * error_weight);
			float error_up = hadd_s(error_upv * error_weight);

			// Check if the prev or next error is better, and if so use it
			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
			{
				uq_weightsf[we_idx] = uqw_up;
				dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up);
				adjustments = true;
			}
			else if ((error_down < error_base) && (uqw > 0))
			{
				uq_weightsf[we_idx] = uqw_down;
				dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down);
				adjustments = true;
			}
		}

		// Prepare iteration for plane 2
		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
		plane_mask = ~plane_mask;
	}

	return adjustments;
}

/**
 * @brief Compress a block using a chosen partitioning and 1 plane of weights.
 *
 * @param config The compressor configuration.
 * @param bsd The block size information.
 * @param blk The image block color data to compress.
 * @param only_always True if we only use "always" percentile block modes.
 * @param tune_errorval_threshold The error value threshold.
 * @param partition_count The partition count.
 * @param partition_index The partition index if @c partition_count is 2-4.
 * @param[out] scb The symbolic compressed block output.
 * @param[out] tmpbuf Preallocated scratch buffers for the compressor.
 * @param quant_limit The maximum allowed weight quantization level.
 *
 * @return The best error value found across the trialed candidate encodings.
 */
static float compress_symbolic_block_for_partition_1plane(
	const astcenc_config& config,
	const block_size_descriptor& bsd,
	const image_block& blk,
	bool only_always,
	float tune_errorval_threshold,
	unsigned int partition_count,
	unsigned int partition_index,
	symbolic_compressed_block& scb,
	compression_working_buffers& tmpbuf,
	int quant_limit
) {
	promise(partition_count > 0);
	promise(config.tune_candidate_limit > 0);
	promise(config.tune_refinement_limit > 0);

	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);

	auto compute_difference = &compute_symbolic_block_difference_1plane;
	if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM))
	{
		compute_difference = &compute_symbolic_block_difference_1plane_1partition;
	}

	const auto& pi = bsd.get_partition_info(partition_count, partition_index);

	// Compute ideal weights and endpoint colors, with no quantization or decimation
	endpoints_and_weights& ei = tmpbuf.ei1;
	compute_ideal_colors_and_weights_1plane(blk, pi, ei);

	// Compute ideal weights and endpoint colors for every decimation
	float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
	uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;

	// For each decimation mode, compute an ideal set of weights with no quantization
	unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
		: bsd.decimation_mode_count_selected;
	promise(max_decimation_modes > 0);
	for (unsigned int i = 0; i < max_decimation_modes; i++)
	{
		const auto& dm = bsd.get_decimation_mode(i);
		if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
		{
			continue;
		}

		const auto& di = bsd.get_decimation_info(i);

		compute_ideal_weights_for_decimation(
			ei,
			di,
			dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
	}

	// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
	// weight pair, compute the smallest weight that will result in a color value greater than 1
	vfloat4 min_ep(10.0f);
	for (unsigned int i = 0; i < partition_count; i++)
	{
		vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]);
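		// Per channel, this is the weight at which the interpolated color reaches 1.0; the mask
		// below keeps only channels that cross above a weight of 0.5 and tracks the minimum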

		vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep);
		min_ep = select(min_ep, ep, use_ep);
	}

	float min_wt_cutoff = hmin_s(min_ep);

	// For each mode, use the angular method to compute a shift
	compute_angular_endpoints_1plane(
		only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);

	float* weight_low_value = tmpbuf.weight_low_value1;
	float* weight_high_value = tmpbuf.weight_high_value1;
	int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
	float* qwt_errors = tmpbuf.qwt_errors;

	// For each mode (which specifies a decimation and a quantization):
	// * Compute number of bits needed for the quantized weights
	// * Generate an optimized set of quantized weights
	// * Compute quantization errors for the mode

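	// Bits left over for color endpoint data for 1-4 partitions, before the per-mode weight
	// bits are subtracted below; multi-partition entries also pay for the partition index field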
	static const int8_t free_bits_for_partition_count[4] {
		115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS
	};

	unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
		: bsd.block_mode_count_1plane_selected;
	promise(max_block_modes > 0);
	for (unsigned int i = 0; i < max_block_modes; i++)
	{
		const block_mode& bm = bsd.block_modes[i];

		if (bm.quant_mode > max_weight_quant)
		{
			qwt_errors[i] = 1e38f;
			continue;
		}

		assert(!bm.is_dual_plane);
		int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits;
		if (bitcount <= 0)
		{
			qwt_errors[i] = 1e38f;
			continue;
		}

		if (weight_high_value[i] > 1.02f * min_wt_cutoff)
		{
			weight_high_value[i] = 1.0f;
		}

		int decimation_mode = bm.decimation_mode;
		const auto& di = bsd.get_decimation_info(decimation_mode);

		qwt_bitcounts[i] = static_cast<int8_t>(bitcount);

		alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];

		// Generate the optimized set of weights for the weight mode
		compute_quantized_weights_for_decimation(
			di,
			weight_low_value[i], weight_high_value[i],
			dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
			dec_weights_uquantf,
			dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
			bm.get_weight_quant_mode());

		// Compute weight quantization errors for the block mode
		qwt_errors[i] = compute_error_of_weight_set_1plane(
			ei,
			di,
			dec_weights_uquantf);
	}

	// Decide the optimal combination of color endpoint encodings and weight encodings
	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
	int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];

	quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];

	unsigned int candidate_count = compute_ideal_endpoint_formats(
		pi, blk, ei.ep, qwt_bitcounts, qwt_errors,
		config.tune_candidate_limit, 0, max_block_modes,
		partition_format_specifiers, block_mode_index,
		color_quant_level, color_quant_level_mod, tmpbuf);

	// Iterate over the N believed-to-be-best modes to find out which one is actually best
	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
	float best_errorval_in_scb = scb.errorval;

	for (unsigned int i = 0; i < candidate_count; i++)
	{
		TRACE_NODE(node0, "candidate");

		const int bm_packed_index = block_mode_index[i];
		assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected));
		const block_mode& qw_bm = bsd.block_modes[bm_packed_index];

		int decimation_mode = qw_bm.decimation_mode;
		const auto& di = bsd.get_decimation_info(decimation_mode);
		promise(di.weight_count > 0);

		trace_add_data("weight_x", di.weight_x);
		trace_add_data("weight_y", di.weight_y);
		trace_add_data("weight_z", di.weight_z);
		trace_add_data("weight_quant", qw_bm.quant_mode);

		// Recompute the ideal color endpoints before storing them
		vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS];
		vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS];

		symbolic_compressed_block workscb;
		endpoints workep = ei.ep;

		uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;

		for (unsigned int j = 0; j < di.weight_count; j++)
		{
			workscb.weights[j] = u8_weight_src[j];
		}

		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
		{
			recompute_ideal_colors_1plane(
				blk, pi, di, workscb.weights,
				workep, rgbs_colors, rgbo_colors);

			// Quantize the chosen color, tracking if worth trying the mod value
			bool all_same = color_quant_level[i] != color_quant_level_mod[i];
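			// The mod path is only worth trying if its quant level actually differs from the
			// base one; it additionally requires every partition to share a color format,
			// which is checked as the formats are packed below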
			for (unsigned int j = 0; j < partition_count; j++)
			{
				workscb.color_formats[j] = pack_color_endpoints(
					workep.endpt0[j],
					workep.endpt1[j],
					rgbs_colors[j],
					rgbo_colors[j],
					partition_format_specifiers[i][j],
					workscb.color_values[j],
					color_quant_level[i]);

				all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];
			}

			// If all the color endpoint modes are the same, we get a few more bits to store colors;
			// let's see if we can take advantage of this: requantize all the colors and see if the
			// endpoint modes remain the same.
			workscb.color_formats_matched = 0;
			if (partition_count >= 2 && all_same)
			{
				uint8_t colorvals[BLOCK_MAX_PARTITIONS][8];
				uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };
				bool all_same_mod = true;
				for (unsigned int j = 0; j < partition_count; j++)
				{
					color_formats_mod[j] = pack_color_endpoints(
						workep.endpt0[j],
						workep.endpt1[j],
						rgbs_colors[j],
						rgbo_colors[j],
						partition_format_specifiers[i][j],
						colorvals[j],
						color_quant_level_mod[i]);

					// Early out as soon as it's no longer possible to use mod
					if (color_formats_mod[j] != color_formats_mod[0])
					{
						all_same_mod = false;
						break;
					}
				}

				if (all_same_mod)
				{
					workscb.color_formats_matched = 1;
					for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++)
					{
						for (unsigned int k = 0; k < 8; k++)
						{
							workscb.color_values[j][k] = colorvals[j][k];
						}

						workscb.color_formats[j] = color_formats_mod[j];
					}
				}
			}

			// Store header fields
			workscb.partition_count = static_cast<uint8_t>(partition_count);
			workscb.partition_index = static_cast<uint16_t>(partition_index);
			workscb.plane2_component = -1;
			workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i];
			workscb.block_mode = qw_bm.mode_index;
			workscb.block_type = SYM_BTYPE_NONCONST;

			// Pre-realign test
			if (l == 0)
			{
				float errorval = compute_difference(config, bsd, workscb, blk);
				if (errorval == -ERROR_CALC_DEFAULT)
				{
					errorval = -errorval;
					workscb.block_type = SYM_BTYPE_ERROR;
				}

				trace_add_data("error_prerealign", errorval);
				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
				// iteration can help more, so we give it an extra 8% leeway. Use this knowledge to
				// drive a heuristic to skip blocks that are unlikely to catch up with the best
				// block we have already.
				unsigned int iters_remaining = config.tune_refinement_limit - l;
				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
				if (errorval > (threshold * best_errorval_in_scb))
				{
					break;
				}

				if (errorval < best_errorval_in_scb)
				{
					best_errorval_in_scb = errorval;
					workscb.errorval = errorval;
					scb = workscb;

					if (errorval < tune_errorval_threshold)
					{
						// Skip remaining candidates - this is "good enough"
						i = candidate_count;
						break;
					}
				}
			}

			bool adjustments;
			if (di.weight_count != bsd.texel_count)
			{
				adjustments = realign_weights_decimated(
					config.profile, bsd, blk, workscb);
			}
			else
			{
				adjustments = realign_weights_undecimated(
					config.profile, bsd, blk, workscb);
			}

			// Post-realign test
			float errorval = compute_difference(config, bsd, workscb, blk);
			if (errorval == -ERROR_CALC_DEFAULT)
			{
				errorval = -errorval;
				workscb.block_type = SYM_BTYPE_ERROR;
			}

			trace_add_data("error_postrealign", errorval);
			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
			// unlikely to catch up with the best block we have already. Assume 4.5% per step to
			// give the benefit of the doubt ...
			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
			if (errorval > (threshold * best_errorval_in_scb))
			{
				break;
			}

			if (errorval < best_errorval_in_scb)
			{
				best_errorval_in_scb = errorval;
				workscb.errorval = errorval;
				scb = workscb;

				if (errorval < tune_errorval_threshold)
				{
					// Skip remaining candidates - this is "good enough"
					i = candidate_count;
					break;
				}
			}

			if (!adjustments)
			{
				break;
			}
		}
	}

	return best_errorval_in_mode;
}

/**
 * @brief Compress a block using a chosen partitioning and 2 planes of weights.
 *
 * @param config The compressor configuration.
 * @param bsd The block size information.
 * @param blk The image block color data to compress.
 * @param tune_errorval_threshold The error value threshold.
 * @param plane2_component The component index for the second plane of weights.
 * @param[out] scb The symbolic compressed block output.
 * @param[out] tmpbuf Preallocated scratch buffers for the compressor.
 * @param quant_limit The maximum allowed weight quantization level.
 *
 * @return The best error value found across the trialed candidate encodings.
 */
static float compress_symbolic_block_for_partition_2planes(
	const astcenc_config& config,
	const block_size_descriptor& bsd,
	const image_block& blk,
	float tune_errorval_threshold,
	unsigned int plane2_component,
	symbolic_compressed_block& scb,
	compression_working_buffers& tmpbuf,
	int quant_limit
) {
	promise(config.tune_candidate_limit > 0);
	promise(config.tune_refinement_limit > 0);
	promise(bsd.decimation_mode_count_selected > 0);

	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);

	// Compute ideal weights and endpoint colors, with no quantization or decimation
	endpoints_and_weights& ei1 = tmpbuf.ei1;
	endpoints_and_weights& ei2 = tmpbuf.ei2;

	compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2);

	// Compute ideal weights and endpoint colors for every decimation
	float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
	uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;

	// For each decimation mode, compute an ideal set of weights with no quantization
	for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
	{
		const auto& dm = bsd.get_decimation_mode(i);
		if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
		{
			continue;
		}

		const auto& di = bsd.get_decimation_info(i);

		compute_ideal_weights_for_decimation(
			ei1,
			di,
			dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);

		compute_ideal_weights_for_decimation(
			ei2,
			di,
			dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);
	}

	// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
	// weight pair, compute the smallest weight that will result in a color value greater than 1
	vfloat4 min_ep1(10.0f);
	vfloat4 min_ep2(10.0f);

	vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]);
	vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1);
	min_ep1 = select(min_ep1, ep1, use_ep1);

	vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]);
	vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2);
	min_ep2 = select(min_ep2, ep2, use_ep2);

	vfloat4 err_max(ERROR_CALC_DEFAULT);
	vmask4 err_mask = vint4::lane_id() == vint4(plane2_component);

	// Set the plane2 component to max error in ep1
	min_ep1 = select(min_ep1, err_max, err_mask);

	float min_wt_cutoff1 = hmin_s(min_ep1);

	// Set the minwt2 to the plane2 component min in ep2
	float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));

	compute_angular_endpoints_2planes(
		bsd, dec_weights_ideal, max_weight_quant, tmpbuf);

	// For each mode (which specifies a decimation and a quantization):
	// * Compute number of bits needed for the quantized weights
	// * Generate an optimized set of quantized weights
	// * Compute quantization errors for the mode

	float* weight_low_value1 = tmpbuf.weight_low_value1;
	float* weight_high_value1 = tmpbuf.weight_high_value1;
	float* weight_low_value2 = tmpbuf.weight_low_value2;
	float* weight_high_value2 = tmpbuf.weight_high_value2;

	int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
	float* qwt_errors = tmpbuf.qwt_errors;

	unsigned int start_2plane = bsd.block_mode_count_1plane_selected;
	unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected;

	for (unsigned int i = start_2plane; i < end_2plane; i++)
	{
		const block_mode& bm = bsd.block_modes[i];
		assert(bm.is_dual_plane);

		if (bm.quant_mode > max_weight_quant)
		{
			qwt_errors[i] = 1e38f;
			continue;
		}

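		// 109 is the bit budget left for color data in a single-partition, dual-plane block
		// before the weight bits are removed (assumed: 128 bits minus the block mode, partition
		// count, CEM, and plane 2 component selector fields)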
		qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits);

		if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)
		{
			weight_high_value1[i] = 1.0f;
		}

		if (weight_high_value2[i] > 1.02f * min_wt_cutoff2)
		{
			weight_high_value2[i] = 1.0f;
		}

		unsigned int decimation_mode = bm.decimation_mode;
		const auto& di = bsd.get_decimation_info(decimation_mode);

		alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];

		// Generate the optimized set of weights for the mode
		compute_quantized_weights_for_decimation(
			di,
			weight_low_value1[i],
			weight_high_value1[i],
			dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
			dec_weights_uquantf,
			dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
			bm.get_weight_quant_mode());

		compute_quantized_weights_for_decimation(
			di,
			weight_low_value2[i],
			weight_high_value2[i],
			dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,
			dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET,
			dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
			bm.get_weight_quant_mode());

		// Compute weight quantization errors for the block mode
		qwt_errors[i] = compute_error_of_weight_set_2planes(
			ei1,
			ei2,
			di,
			dec_weights_uquantf,
			dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET);
	}

	// Decide the optimal combination of color endpoint encodings and weight encodings
	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
	int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];

	quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];

	endpoints epm;
	merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm);

	const auto& pi = bsd.get_partition_info(1, 0);
	unsigned int candidate_count = compute_ideal_endpoint_formats(
		pi, blk, epm, qwt_bitcounts, qwt_errors,
		config.tune_candidate_limit,
		bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected,
		partition_format_specifiers, block_mode_index,
		color_quant_level, color_quant_level_mod, tmpbuf);

	// Iterate over the N believed-to-be-best modes to find out which one is actually best
	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
	float best_errorval_in_scb = scb.errorval;

	for (unsigned int i = 0; i < candidate_count; i++)
	{
		TRACE_NODE(node0, "candidate");

		const int bm_packed_index = block_mode_index[i];
		assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) &&
			bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected));
		const block_mode& qw_bm = bsd.block_modes[bm_packed_index];

		int decimation_mode = qw_bm.decimation_mode;
		const auto& di = bsd.get_decimation_info(decimation_mode);
		promise(di.weight_count > 0);

		trace_add_data("weight_x", di.weight_x);
		trace_add_data("weight_y", di.weight_y);
		trace_add_data("weight_z", di.weight_z);
		trace_add_data("weight_quant", qw_bm.quant_mode);

		vfloat4 rgbs_color;
		vfloat4 rgbo_color;

		symbolic_compressed_block workscb;
		endpoints workep = epm;

		uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
		uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;

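		// Copy the candidate quantized weights for both planes into the working block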
		for (unsigned int j = 0; j < di.weight_count; j++)
		{
			workscb.weights[j] = u8_weight1_src[j];
			workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j];
		}

		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
		{
			recompute_ideal_colors_2planes(
				blk, bsd, di,
				workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
				workep, rgbs_color, rgbo_color, plane2_component);

			// Quantize the chosen color
			workscb.color_formats[0] = pack_color_endpoints(
				workep.endpt0[0],
				workep.endpt1[0],
				rgbs_color, rgbo_color,
				partition_format_specifiers[i][0],
				workscb.color_values[0],
				color_quant_level[i]);

			// Store header fields
			workscb.partition_count = 1;
			workscb.partition_index = 0;
			workscb.quant_mode = color_quant_level[i];
			workscb.color_formats_matched = 0;
			workscb.block_mode = qw_bm.mode_index;
			workscb.plane2_component = static_cast<int8_t>(plane2_component);
			workscb.block_type = SYM_BTYPE_NONCONST;

			// Pre-realign test
			if (l == 0)
			{
				float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
				if (errorval == -ERROR_CALC_DEFAULT)
				{
					errorval = -errorval;
					workscb.block_type = SYM_BTYPE_ERROR;
				}

				trace_add_data("error_prerealign", errorval);
				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
				// iteration can help more, so we give it an extra 8% leeway. Use this knowledge to
				// drive a heuristic to skip blocks that are unlikely to catch up with the best
				// block we have already.
				unsigned int iters_remaining = config.tune_refinement_limit - l;
				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
				if (errorval > (threshold * best_errorval_in_scb))
				{
					break;
				}

				if (errorval < best_errorval_in_scb)
				{
					best_errorval_in_scb = errorval;
					workscb.errorval = errorval;
					scb = workscb;

					if (errorval < tune_errorval_threshold)
					{
						// Skip remaining candidates - this is "good enough"
						i = candidate_count;
						break;
					}
				}
			}

			// Perform a final pass over the weights to try to improve them.
			bool adjustments;
			if (di.weight_count != bsd.texel_count)
			{
				adjustments = realign_weights_decimated(
					config.profile, bsd, blk, workscb);
			}
			else
			{
				adjustments = realign_weights_undecimated(
					config.profile, bsd, blk, workscb);
			}

			// Post-realign test
			float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
			if (errorval == -ERROR_CALC_DEFAULT)
			{
				errorval = -errorval;
				workscb.block_type = SYM_BTYPE_ERROR;
			}

			trace_add_data("error_postrealign", errorval);
			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
			// unlikely to catch up with the best block we have already. Assume 4.5% per step to
			// give the benefit of the doubt ...
			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
			if (errorval > (threshold * best_errorval_in_scb))
			{
				break;
			}

			if (errorval < best_errorval_in_scb)
			{
				best_errorval_in_scb = errorval;
				workscb.errorval = errorval;
				scb = workscb;

				if (errorval < tune_errorval_threshold)
				{
					// Skip remaining candidates - this is "good enough"
					i = candidate_count;
					break;
				}
			}

			if (!adjustments)
			{
				break;
			}
		}
	}

	return best_errorval_in_mode;
}

/**
 * @brief Determine the lowest cross-channel correlation factor.
 *
 * @param texels_per_block The number of texels in a block.
 * @param blk The image block color data to compress.
 *
 * @return The lowest correlation factor.
 */
static float prepare_block_statistics(
	int texels_per_block,
	const image_block& blk
) {
	// Compute the covariance matrix, as a collection of 10 scalars that form the upper triangle of
	// the matrix. The matrix is symmetric, so this is all we need for this use case.
	float rs = 0.0f;
	float gs = 0.0f;
	float bs = 0.0f;
	float as = 0.0f;
	float rr_var = 0.0f;
	float gg_var = 0.0f;
	float bb_var = 0.0f;
	float aa_var = 0.0f;
	float rg_cov = 0.0f;
	float rb_cov = 0.0f;
	float ra_cov = 0.0f;
	float gb_cov = 0.0f;
	float ga_cov = 0.0f;
	float ba_cov = 0.0f;

	float weight_sum = 0.0f;

	promise(texels_per_block > 0);
	for (int i = 0; i < texels_per_block; i++)
	{
		float weight = hadd_s(blk.channel_weight) / 4.0f;
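		// The per-texel weight is the mean of the four channel error weights; it is uniform
		// across the block, so weight_sum effectively scales with the texel count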
		assert(weight >= 0.0f);
		weight_sum += weight;

		float r = blk.data_r[i];
		float g = blk.data_g[i];
		float b = blk.data_b[i];
		float a = blk.data_a[i];

		float rw = r * weight;
		rs += rw;
		rr_var += r * rw;
		rg_cov += g * rw;
		rb_cov += b * rw;
		ra_cov += a * rw;

		float gw = g * weight;
		gs += gw;
		gg_var += g * gw;
		gb_cov += b * gw;
		ga_cov += a * gw;

		float bw = b * weight;
		bs += bw;
		bb_var += b * bw;
		ba_cov += a * bw;

		float aw = a * weight;
		as += aw;
		aa_var += a * aw;
	}

	float rpt = 1.0f / astc::max(weight_sum, 1e-7f);

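	// Convert the raw weighted sums into central (co)variances by subtracting the product of the
	// means, i.e. cov(x, y) ~ sum(xy) - sum(x) * sum(y) / weight_sum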
	rr_var -= rs * (rs * rpt);
	rg_cov -= gs * (rs * rpt);
	rb_cov -= bs * (rs * rpt);
	ra_cov -= as * (rs * rpt);

	gg_var -= gs * (gs * rpt);
	gb_cov -= bs * (gs * rpt);
	ga_cov -= as * (gs * rpt);

	bb_var -= bs * (bs * rpt);
	ba_cov -= as * (bs * rpt);

	aa_var -= as * (as * rpt);

	// These will give a NaN if a channel is constant - these are fixed up in the next step
	rg_cov *= astc::rsqrt(rr_var * gg_var);
	rb_cov *= astc::rsqrt(rr_var * bb_var);
	ra_cov *= astc::rsqrt(rr_var * aa_var);
	gb_cov *= astc::rsqrt(gg_var * bb_var);
	ga_cov *= astc::rsqrt(gg_var * aa_var);
	ba_cov *= astc::rsqrt(bb_var * aa_var);

	if (astc::isnan(rg_cov)) rg_cov = 1.0f;
	if (astc::isnan(rb_cov)) rb_cov = 1.0f;
	if (astc::isnan(ra_cov)) ra_cov = 1.0f;
	if (astc::isnan(gb_cov)) gb_cov = 1.0f;
	if (astc::isnan(ga_cov)) ga_cov = 1.0f;
	if (astc::isnan(ba_cov)) ba_cov = 1.0f;

	float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov));
	lowest_correlation = astc::min(lowest_correlation, fabsf(ra_cov));
	lowest_correlation = astc::min(lowest_correlation, fabsf(gb_cov));
	lowest_correlation = astc::min(lowest_correlation, fabsf(ga_cov));
	lowest_correlation = astc::min(lowest_correlation, fabsf(ba_cov));

	// Diagnostic trace points
	trace_add_data("min_r", blk.data_min.lane<0>());
	trace_add_data("max_r", blk.data_max.lane<0>());
	trace_add_data("min_g", blk.data_min.lane<1>());
	trace_add_data("max_g", blk.data_max.lane<1>());
	trace_add_data("min_b", blk.data_min.lane<2>());
	trace_add_data("max_b", blk.data_max.lane<2>());
	trace_add_data("min_a", blk.data_min.lane<3>());
	trace_add_data("max_a", blk.data_max.lane<3>());
	trace_add_data("cov_rg", fabsf(rg_cov));
	trace_add_data("cov_rb", fabsf(rb_cov));
	trace_add_data("cov_ra", fabsf(ra_cov));
	trace_add_data("cov_gb", fabsf(gb_cov));
	trace_add_data("cov_ga", fabsf(ga_cov));
	trace_add_data("cov_ba", fabsf(ba_cov));

	return lowest_correlation;
}

/* See header for documentation. */
void compress_block(
	const astcenc_contexti& ctx,
	const image_block& blk,
	physical_compressed_block& pcb,
	compression_working_buffers& tmpbuf)
{
	astcenc_profile decode_mode = ctx.config.profile;
	symbolic_compressed_block scb;
	const block_size_descriptor& bsd = *ctx.bsd;
	float lowest_correl;

	TRACE_NODE(node0, "block");
	trace_add_data("pos_x", blk.xpos);
	trace_add_data("pos_y", blk.ypos);
	trace_add_data("pos_z", blk.zpos);

	// Set stricter block targets for luminance data as we have more bits to play with
	bool block_is_l = blk.is_luminance();
	float block_is_l_scale = block_is_l ? 1.0f / 1.5f : 1.0f;

	// Set slightly stricter block targets for lumalpha data as we have more bits to play with
	bool block_is_la = blk.is_luminancealpha();
	float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f;

	bool block_skip_two_plane = false;
	int max_partitions = ctx.config.tune_partition_count_limit;

	unsigned int requested_partition_indices[3] {
		ctx.config.tune_2partition_index_limit,
		ctx.config.tune_3partition_index_limit,
		ctx.config.tune_4partition_index_limit
	};

	unsigned int requested_partition_trials[3] {
		ctx.config.tune_2partitioning_candidate_limit,
		ctx.config.tune_3partitioning_candidate_limit,
		ctx.config.tune_4partitioning_candidate_limit
	};

#if defined(ASTCENC_DIAGNOSTICS)
	// Do this early in diagnostic builds so we can dump uniform metrics
	// for every block. Do it later in release builds to avoid redundant work!
	float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
	float error_threshold = ctx.config.tune_db_limit
		* error_weight_sum
		* block_is_l_scale
		* block_is_la_scale;

	lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
	trace_add_data("lowest_correl", lowest_correl);
	trace_add_data("tune_error_threshold", error_threshold);
#endif

	// Detected a constant-color block
	if (all(blk.data_min == blk.data_max))
	{
		TRACE_NODE(node1, "pass");
		trace_add_data("partition_count", 0);
		trace_add_data("plane_count", 1);

		scb.partition_count = 0;

		// Encode as FP16 if using HDR
		if ((decode_mode == ASTCENC_PRF_HDR) ||
			(decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))
		{
			scb.block_type = SYM_BTYPE_CONST_F16;
			vint4 color_f16 = float_to_float16(blk.origin_texel);
			store(color_f16, scb.constant_color);
		}
		// Encode as UNORM16 if NOT using HDR
		else
		{
			scb.block_type = SYM_BTYPE_CONST_U16;
			vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
			vint4 color_u16 = float_to_int_rtn(color_f32);
			store(color_u16, scb.constant_color);
		}

		trace_add_data("exit", "quality hit");

		symbolic_to_physical(bsd, scb, pcb);
		return;
	}

#if !defined(ASTCENC_DIAGNOSTICS)
	float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
	float error_threshold = ctx.config.tune_db_limit
		* error_weight_sum
		* block_is_l_scale
		* block_is_la_scale;
#endif

	// Set SCB and mode errors to a very high error value
	scb.errorval = ERROR_CALC_DEFAULT;
	scb.block_type = SYM_BTYPE_ERROR;

	float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] {
		ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT
	};

	float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] {
		0.0f,
		ctx.config.tune_2partition_early_out_limit_factor,
		ctx.config.tune_3partition_early_out_limit_factor,
		0.0f
	};

	// Trial using 1 plane of weights and 1 partition.

	// Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified
	// mode cutoff. This causes an early-out that speeds up encoding of easy blocks. However, this
	// optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the
	// compression and slightly reduces image quality.

	float errorval_mult[2] {
		1.0f / ctx.config.tune_mse_overshoot,
		1.0f
	};

	float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;

	// Only enable the MODE0 fast path (trial 0) for 2D blocks with at least
	// TUNE_MIN_TEXELS_MODE0_FASTPATH texels
	int start_trial = 1;
	if ((bsd.texel_count >= TUNE_MIN_TEXELS_MODE0_FASTPATH) && (bsd.zdim == 1))
	{
		start_trial = 0;
	}

	int quant_limit = QUANT_32;
	for (int i = start_trial; i < 2; i++)
	{
		TRACE_NODE(node1, "pass");
		trace_add_data("partition_count", 1);
		trace_add_data("plane_count", 1);
		trace_add_data("search_mode", i);

		float errorval = compress_symbolic_block_for_partition_1plane(
			ctx.config, bsd, blk, i == 0,
			error_threshold * errorval_mult[i] * errorval_overshoot,
			1, 0, scb, tmpbuf, QUANT_32);

		// Record the quant level so we can use it to filter later searches
		const auto& bm = bsd.get_block_mode(scb.block_mode);
		quant_limit = bm.get_weight_quant_mode();

		best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);
		if (errorval < (error_threshold * errorval_mult[i]))
		{
			trace_add_data("exit", "quality hit");
			goto END_OF_TESTS;
		}
	}

#if !defined(ASTCENC_DIAGNOSTICS)
	lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
#endif

	block_skip_two_plane = lowest_correl > ctx.config.tune_2plane_early_out_limit_correlation;

	// Test the four possible 1-partition, 2-planes modes. Do this in reverse, as
	// alpha is the most likely to be non-correlated if it is present in the data.
	for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--)
	{
		TRACE_NODE(node1, "pass");
		trace_add_data("partition_count", 1);
		trace_add_data("plane_count", 2);
		trace_add_data("plane_component", i);

		if (block_skip_two_plane)
		{
			trace_add_data("skip", "tune_2plane_early_out_limit_correlation");
			continue;
		}

		if (blk.grayscale && i != 3)
		{
			trace_add_data("skip", "grayscale block");
			continue;
		}

		if (blk.is_constant_channel(i))
		{
			trace_add_data("skip", "constant component");
			continue;
		}

		float errorval = compress_symbolic_block_for_partition_2planes(
			ctx.config, bsd, blk, error_threshold * errorval_overshoot,
			i, scb, tmpbuf, quant_limit);

		// If attempting two planes is much worse than the best one plane result
		// then further two plane searches are unlikely to help so move on ...
		if (errorval > (best_errorvals_for_pcount[0] * 1.85f))
		{
			break;
		}

		if (errorval < error_threshold)
		{
			trace_add_data("exit", "quality hit");
			goto END_OF_TESTS;
		}
	}

	// Find best blocks for 2, 3 and 4 partitions
	for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
	{
		unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES];

		unsigned int requested_indices = requested_partition_indices[partition_count - 2];

		unsigned int requested_trials = requested_partition_trials[partition_count - 2];
		requested_trials = astc::min(requested_trials, requested_indices);

		unsigned int actual_trials = find_best_partition_candidates(
			bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);

		float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];

		for (unsigned int i = 0; i < actual_trials; i++)
		{
			TRACE_NODE(node1, "pass");
			trace_add_data("partition_count", partition_count);
			trace_add_data("partition_index", partition_indices[i]);
			trace_add_data("plane_count", 1);
			trace_add_data("search_mode", i);

			float errorval = compress_symbolic_block_for_partition_1plane(
				ctx.config, bsd, blk, false,
				error_threshold * errorval_overshoot,
				partition_count, partition_indices[i],
				scb, tmpbuf, quant_limit);

			best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);

			// If using N partitions doesn't improve much over using N-1 partitions then skip trying
			// N+1. Error can dramatically improve if the data is correlated or non-correlated and
			// aligns with a partitioning that suits that encoding, so for this inner loop check add
			// a large error scale because the "other" trial could be a lot better.
			float best_error = best_errorvals_for_pcount[partition_count - 1];
			float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f;
			if (best_error > (best_error_in_prev * best_error_scale))
			{
				trace_add_data("skip", "tune_partition_early_out_limit_factor");
				goto END_OF_TESTS;
			}

			if (errorval < error_threshold)
			{
				trace_add_data("exit", "quality hit");
				goto END_OF_TESTS;
			}
		}

		// If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
		float best_error = best_errorvals_for_pcount[partition_count - 1];
		float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
		if (best_error > (best_error_in_prev * best_error_scale))
		{
			trace_add_data("skip", "tune_partition_early_out_limit_factor");
			goto END_OF_TESTS;
		}
	}

	trace_add_data("exit", "quality not hit");

END_OF_TESTS:
	// If we still have an error block then convert to something we can encode
	// TODO: Do something more sensible here, such as average color block
	if (scb.block_type == SYM_BTYPE_ERROR)
	{
#if defined(ASTCENC_DIAGNOSTICS)
		static bool printed_once = false;
		if (!printed_once)
		{
			printed_once = true;
			printf("WARN: At least one block failed to find a valid encoding.\n"
				" Try increasing compression quality settings.\n\n");
		}
#endif

		scb.block_type = SYM_BTYPE_CONST_U16;
		vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
		vint4 color_u16 = float_to_int_rtn(color_f32);
		store(color_u16, scb.constant_color);
	}

	// Compress to a physical block
	symbolic_to_physical(bsd, scb, pcb);
}

#endif