astcenc_pick_best_endpoint_format.cpp source code [Godot/thirdparty/astcenc/astcenc_pick_best_endpoint_format.cpp]

1	// SPDX-License-Identifier: Apache-2.0
2	// ----------------------------------------------------------------------------
3	// Copyright 2011-2022 Arm Limited
4	//
5	// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6	// use this file except in compliance with the License. You may obtain a copy
7	// of the License at:
8	//
9	// http://www.apache.org/licenses/LICENSE-2.0
10	//
11	// Unless required by applicable law or agreed to in writing, software
12	// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13	// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14	// License for the specific language governing permissions and limitations
15	// under the License.
16	// ----------------------------------------------------------------------------
17
18	#if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20	/**
21	* @brief Functions for finding best endpoint format.
22	*
23	* We assume there are two independent sources of error in any given partition:
24	*
25	* - Encoding choice errors
26	* - Quantization errors
27	*
28	* Encoding choice errors are caused by encoder decisions. For example:
29	*
30	* - Using luminance instead of separate RGB components.
31	* - Using a constant 1.0 alpha instead of storing an alpha component.
32	* - Using RGB+scale instead of storing two full RGB endpoints.
33	*
34	* Quantization errors occur due to the limited precision we use for storage. These errors generally
35	* scale with quantization level, but are not actually independent of color encoding. In particular:
36	*
37	* - If we can use offset encoding then quantization error is halved.
38	* - If we can use blue-contraction then quantization error for RG is halved.
39	* - If we use HDR endpoints the quantization error is higher.
40	*
41	* Apart from these effects, we assume the error is proportional to the quantization step size.
42	*/
43
44
45	#include "astcenc_internal.h"
46	#include "astcenc_vecmathlib.h"
47
48	#include <assert.h>
49
50	/**
51	* @brief Compute the errors of the endpoint line options for one partition.
52	*
53	* Uncorrelated data assumes storing completely independent RGBA channels for each endpoint. Same
54	* chroma data assumes storing RGBA endpoints which pass though the origin (LDR only). RGBL data
55	* assumes storing RGB + lumashift (HDR only). Luminance error assumes storing RGB channels as a
56	* single value.
57	*
58	*
59	* @param pi The partition info data.
60	* @param partition_index The partition index to compule the error for.
61	* @param blk The image block.
62	* @param uncor_pline The endpoint line assuming uncorrelated endpoints.
63	* @param[out] uncor_err The computed error for the uncorrelated endpoint line.
64	* @param samec_pline The endpoint line assuming the same chroma for both endpoints.
65	* @param[out] samec_err The computed error for the uncorrelated endpoint line.
66	* @param rgbl_pline The endpoint line assuming RGB + lumashift data.
67	* @param[out] rgbl_err The computed error for the RGB + lumashift endpoint line.
68	* @param l_pline The endpoint line assuming luminance data.
69	* @param[out] l_err The computed error for the luminance endpoint line.
70	* @param[out] a_drop_err The computed error for dropping the alpha component.
71	*/
72	static void compute_error_squared_rgb_single_partition(
73	const partition_info& pi,
74	int partition_index,
75	const image_block& blk,
76	const processed_line3& uncor_pline,
77	float& uncor_err,
78	const processed_line3& samec_pline,
79	float& samec_err,
80	const processed_line3& rgbl_pline,
81	float& rgbl_err,
82	const processed_line3& l_pline,
83	float& l_err,
84	float& a_drop_err
85	) {
86	vfloat4 ews = blk.channel_weight;
87
88	unsigned int texel_count = pi.partition_texel_count[partition_index];
89	const uint8_t* texel_indexes = pi.texels_of_partition[partition_index];
90	promise(texel_count > `0`);
91
92	vfloatacc a_drop_errv = vfloatacc::zero();
93	vfloat default_a(blk.get_default_alpha());
94
95	vfloatacc uncor_errv = vfloatacc::zero();
96	vfloat uncor_bs0(uncor_pline.bs.lane<`0`>());
97	vfloat uncor_bs1(uncor_pline.bs.lane<`1`>());
98	vfloat uncor_bs2(uncor_pline.bs.lane<`2`>());
99
100	vfloat uncor_amod0(uncor_pline.amod.lane<`0`>());
101	vfloat uncor_amod1(uncor_pline.amod.lane<`1`>());
102	vfloat uncor_amod2(uncor_pline.amod.lane<`2`>());
103
104	vfloatacc samec_errv = vfloatacc::zero();
105	vfloat samec_bs0(samec_pline.bs.lane<`0`>());
106	vfloat samec_bs1(samec_pline.bs.lane<`1`>());
107	vfloat samec_bs2(samec_pline.bs.lane<`2`>());
108
109	vfloatacc rgbl_errv = vfloatacc::zero();
110	vfloat rgbl_bs0(rgbl_pline.bs.lane<`0`>());
111	vfloat rgbl_bs1(rgbl_pline.bs.lane<`1`>());
112	vfloat rgbl_bs2(rgbl_pline.bs.lane<`2`>());
113
114	vfloat rgbl_amod0(rgbl_pline.amod.lane<`0`>());
115	vfloat rgbl_amod1(rgbl_pline.amod.lane<`1`>());
116	vfloat rgbl_amod2(rgbl_pline.amod.lane<`2`>());
117
118	vfloatacc l_errv = vfloatacc::zero();
119	vfloat l_bs0(l_pline.bs.lane<`0`>());
120	vfloat l_bs1(l_pline.bs.lane<`1`>());
121	vfloat l_bs2(l_pline.bs.lane<`2`>());
122
123	vint lane_ids = vint::lane_id();
124	for (unsigned int i = `0`; i < texel_count; i += ASTCENC_SIMD_WIDTH)
125	{
126	vint tix(texel_indexes + i);
127
128	vmask mask = lane_ids < vint (texel_count);
129	lane_ids += vint (ASTCENC_SIMD_WIDTH);
130
131	// Compute the error that arises from just ditching alpha
132	vfloat data_a = gatherf(blk.data_a, tix);
133	vfloat alpha_diff = data_a - default_a;
134	alpha_diff = alpha_diff * alpha_diff;
135
136	haccumulate(a_drop_errv, alpha_diff, mask);
137
138	vfloat data_r = gatherf(blk.data_r, tix);
139	vfloat data_g = gatherf(blk.data_g, tix);
140	vfloat data_b = gatherf(blk.data_b, tix);
141
142	// Compute uncorrelated error
143	vfloat param = data_r * uncor_bs0
144	+ data_g * uncor_bs1
145	+ data_b * uncor_bs2;
146
147	vfloat dist0 = (uncor_amod0 + param * uncor_bs0) - data_r;
148	vfloat dist1 = (uncor_amod1 + param * uncor_bs1) - data_g;
149	vfloat dist2 = (uncor_amod2 + param * uncor_bs2) - data_b;
150
151	vfloat error = dist0 * dist0 * ews.lane<`0`>()
152	+ dist1 * dist1 * ews.lane<`1`>()
153	+ dist2 * dist2 * ews.lane<`2`>();
154
155	haccumulate(uncor_errv, error, mask);
156
157	// Compute same chroma error - no "amod", its always zero
158	param = data_r * samec_bs0
159	+ data_g * samec_bs1
160	+ data_b * samec_bs2;
161
162	dist0 = (param * samec_bs0) - data_r;
163	dist1 = (param * samec_bs1) - data_g;
164	dist2 = (param * samec_bs2) - data_b;
165
166	error = dist0 * dist0 * ews.lane<`0`>()
167	+ dist1 * dist1 * ews.lane<`1`>()
168	+ dist2 * dist2 * ews.lane<`2`>();
169
170	haccumulate(samec_errv, error, mask);
171
172	// Compute rgbl error
173	param = data_r * rgbl_bs0
174	+ data_g * rgbl_bs1
175	+ data_b * rgbl_bs2;
176
177	dist0 = (rgbl_amod0 + param * rgbl_bs0) - data_r;
178	dist1 = (rgbl_amod1 + param * rgbl_bs1) - data_g;
179	dist2 = (rgbl_amod2 + param * rgbl_bs2) - data_b;
180
181	error = dist0 * dist0 * ews.lane<`0`>()
182	+ dist1 * dist1 * ews.lane<`1`>()
183	+ dist2 * dist2 * ews.lane<`2`>();
184
185	haccumulate(rgbl_errv, error, mask);
186
187	// Compute luma error - no "amod", its always zero
188	param = data_r * l_bs0
189	+ data_g * l_bs1
190	+ data_b * l_bs2;
191
192	dist0 = (param * l_bs0) - data_r;
193	dist1 = (param * l_bs1) - data_g;
194	dist2 = (param * l_bs2) - data_b;
195
196	error = dist0 * dist0 * ews.lane<`0`>()
197	+ dist1 * dist1 * ews.lane<`1`>()
198	+ dist2 * dist2 * ews.lane<`2`>();
199
200	haccumulate(l_errv, error, mask);
201	}
202
203	a_drop_err = hadd_s(a_drop_errv) * ews.lane<`3`>();
204	uncor_err = hadd_s(uncor_errv);
205	samec_err = hadd_s(samec_errv);
206	rgbl_err = hadd_s(rgbl_errv);
207	l_err = hadd_s(l_errv);
208	}
209
210	/**
211	* @brief For a given set of input colors and partitioning determine endpoint encode errors.
212	*
213	* This function determines the color error that results from RGB-scale encoding (LDR only),
214	* RGB-lumashift encoding (HDR only), luminance-encoding, and alpha drop. Also determines whether
215	* the endpoints are eligible for offset encoding or blue-contraction
216	*
217	* @param blk The image block.
218	* @param pi The partition info data.
219	* @param ep The idealized endpoints.
220	* @param[out] eci The resulting encoding choice error metrics.
221	*/
222	static void compute_encoding_choice_errors(
223	const image_block& blk,
224	const partition_info& pi,
225	const endpoints& ep,
226	encoding_choice_errors eci[BLOCK_MAX_PARTITIONS])
227	{
228	int partition_count = pi.partition_count;
229	promise(partition_count > `0`);
230
231	partition_metrics pms[BLOCK_MAX_PARTITIONS];
232
233	compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
234
235	for (int i = `0`; i < partition_count; i++)
236	{
237	partition_metrics& pm = pms[i];
238
239	line3 uncor_rgb_lines;
240	line3 samec_rgb_lines; // for LDR-RGB-scale
241	line3 rgb_luma_lines; // for HDR-RGB-scale
242
243	processed_line3 uncor_rgb_plines;
244	processed_line3 samec_rgb_plines;
245	processed_line3 rgb_luma_plines;
246	processed_line3 luminance_plines;
247
248	float uncorr_rgb_error;
249	float samechroma_rgb_error;
250	float rgb_luma_error;
251	float luminance_rgb_error;
252	float alpha_drop_error;
253
254	uncor_rgb_lines.a = pm.avg;
255	uncor_rgb_lines.b = normalize_safe(pm.dir, unit3());
256
257	samec_rgb_lines.a = vfloat4::zero();
258	samec_rgb_lines.b = normalize_safe(pm.avg, unit3());
259
260	rgb_luma_lines.a = pm.avg;
261	rgb_luma_lines.b = unit3();
262
263	uncor_rgb_plines.amod = uncor_rgb_lines.a - uncor_rgb_lines.b * dot3(uncor_rgb_lines.a, uncor_rgb_lines.b);
264	uncor_rgb_plines.bs = uncor_rgb_lines.b;
265
266	// Same chroma always goes though zero, so this is simpler than the others
267	samec_rgb_plines.amod = vfloat4::zero();
268	samec_rgb_plines.bs = samec_rgb_lines.b;
269
270	rgb_luma_plines.amod = rgb_luma_lines.a - rgb_luma_lines.b * dot3(rgb_luma_lines.a, rgb_luma_lines.b);
271	rgb_luma_plines.bs = rgb_luma_lines.b;
272
273	// Luminance always goes though zero, so this is simpler than the others
274	luminance_plines.amod = vfloat4::zero();
275	luminance_plines.bs = unit3();
276
277	compute_error_squared_rgb_single_partition(
278	pi, i, blk,
279	uncor_rgb_plines, uncorr_rgb_error,
280	samec_rgb_plines, samechroma_rgb_error,
281	rgb_luma_plines, rgb_luma_error,
282	luminance_plines, luminance_rgb_error,
283	alpha_drop_error);
284
285	// Determine if we can offset encode RGB lanes
286	vfloat4 endpt0 = ep.endpt0[i];
287	vfloat4 endpt1 = ep.endpt1[i];
288	vfloat4 endpt_diff = abs(endpt1 - endpt0);
289	vmask4 endpt_can_offset = endpt_diff < vfloat4 (`0.12f` * `65535.0f`);
290	bool can_offset_encode = (mask(endpt_can_offset) & `0x7`) == `0x7`;
291
292	// Store out the settings
293	eci[i].rgb_scale_error = (samechroma_rgb_error - uncorr_rgb_error) * `0.7f`; // empirical
294	eci[i].rgb_luma_error = (rgb_luma_error - uncorr_rgb_error) * `1.5f`; // wild guess
295	eci[i].luminance_error = (luminance_rgb_error - uncorr_rgb_error) * `3.0f`; // empirical
296	eci[i].alpha_drop_error = alpha_drop_error * `3.0f`;
297	eci[i].can_offset_encode = can_offset_encode;
298	eci[i].can_blue_contract = !blk.is_luminance();
299	}
300	}
301
302	/**
303	* @brief For a given partition compute the error for every endpoint integer count and quant level.
304	*
305	* @param encode_hdr_rgb @c true if using HDR for RGB, @c false for LDR.
306	* @param encode_hdr_alpha @c true if using HDR for alpha, @c false for LDR.
307	* @param partition_index The partition index.
308	* @param pi The partition info.
309	* @param eci The encoding choice error metrics.
310	* @param ep The idealized endpoints.
311	* @param error_weight The resulting encoding choice error metrics.
312	* @param[out] best_error The best error for each integer count and quant level.
313	* @param[out] format_of_choice The preferred endpoint format for each integer count and quant level.
314	*/
315	static void compute_color_error_for_every_integer_count_and_quant_level(
316	bool encode_hdr_rgb,
317	bool encode_hdr_alpha,
318	int partition_index,
319	const partition_info& pi,
320	const encoding_choice_errors& eci,
321	const endpoints& ep,
322	vfloat4 error_weight,
323	float best_error[`21`][`4`],
324	uint8_t format_of_choice[`21`][`4`]
325	) {
326	int partition_size = pi.partition_texel_count[partition_index];
327
328	static const float baseline_quant_error[`21` - QUANT_6] {
329	(`65536.0f` * `65536.0f` / `18.0f`) / (`5` * `5`),
330	(`65536.0f` * `65536.0f` / `18.0f`) / (`7` * `7`),
331	(`65536.0f` * `65536.0f` / `18.0f`) / (`9` * `9`),
332	(`65536.0f` * `65536.0f` / `18.0f`) / (`11` * `11`),
333	(`65536.0f` * `65536.0f` / `18.0f`) / (`15` * `15`),
334	(`65536.0f` * `65536.0f` / `18.0f`) / (`19` * `19`),
335	(`65536.0f` * `65536.0f` / `18.0f`) / (`23` * `23`),
336	(`65536.0f` * `65536.0f` / `18.0f`) / (`31` * `31`),
337	(`65536.0f` * `65536.0f` / `18.0f`) / (`39` * `39`),
338	(`65536.0f` * `65536.0f` / `18.0f`) / (`47` * `47`),
339	(`65536.0f` * `65536.0f` / `18.0f`) / (`63` * `63`),
340	(`65536.0f` * `65536.0f` / `18.0f`) / (`79` * `79`),
341	(`65536.0f` * `65536.0f` / `18.0f`) / (`95` * `95`),
342	(`65536.0f` * `65536.0f` / `18.0f`) / (`127` * `127`),
343	(`65536.0f` * `65536.0f` / `18.0f`) / (`159` * `159`),
344	(`65536.0f` * `65536.0f` / `18.0f`) / (`191` * `191`),
345	(`65536.0f` * `65536.0f` / `18.0f`) / (`255` * `255`)
346	};
347
348	vfloat4 ep0 = ep.endpt0[partition_index];
349	vfloat4 ep1 = ep.endpt1[partition_index];
350
351	float ep1_min = hmin_rgb_s(ep1);
352	ep1_min = astc::max(ep1_min, `0.0f`);
353
354	float error_weight_rgbsum = hadd_rgb_s(error_weight);
355
356	float range_upper_limit_rgb = encode_hdr_rgb ? `61440.0f` : `65535.0f`;
357	float range_upper_limit_alpha = encode_hdr_alpha ? `61440.0f` : `65535.0f`;
358
359	// It is possible to get endpoint colors significantly outside [0,upper-limit] even if the
360	// input data are safely contained in [0,upper-limit]; we need to add an error term for this
361	vfloat4 offset(range_upper_limit_rgb, range_upper_limit_rgb, range_upper_limit_rgb, range_upper_limit_alpha);
362	vfloat4 ep0_range_error_high = max(ep0 - offset, `0.0f`);
363	vfloat4 ep1_range_error_high = max(ep1 - offset, `0.0f`);
364
365	vfloat4 ep0_range_error_low = min(ep0, `0.0f`);
366	vfloat4 ep1_range_error_low = min(ep1, `0.0f`);
367
368	vfloat4 sum_range_error =
369	(ep0_range_error_low * ep0_range_error_low) +
370	(ep1_range_error_low * ep1_range_error_low) +
371	(ep0_range_error_high * ep0_range_error_high) +
372	(ep1_range_error_high * ep1_range_error_high);
373
374	float rgb_range_error = dot3_s(sum_range_error, error_weight)
375	* `0.5f` * static_cast<float>(partition_size);
376	float alpha_range_error = sum_range_error.lane<`3`>() * error_weight.lane<`3`>()
377	* `0.5f` * static_cast<float>(partition_size);
378
379	if (encode_hdr_rgb)
380	{
381
382	// Collect some statistics
383	float af, cf;
384	if (ep1.lane<`0`>() > ep1.lane<`1`>() && ep1.lane<`0`>() > ep1.lane<`2`>())
385	{
386	af = ep1.lane<`0`>();
387	cf = ep1.lane<`0`>() - ep0.lane<`0`>();
388	}
389	else if (ep1.lane<`1`>() > ep1.lane<`2`>())
390	{
391	af = ep1.lane<`1`>();
392	cf = ep1.lane<`1`>() - ep0.lane<`1`>();
393	}
394	else
395	{
396	af = ep1.lane<`2`>();
397	cf = ep1.lane<`2`>() - ep0.lane<`2`>();
398	}
399
400	// Estimate of color-component spread in high endpoint color
401	float bf = af - ep1_min;
402	vfloat4 prd = (ep1 - vfloat4 (cf)).swz<`0`, `1`, `2`>();
403	vfloat4 pdif = prd - ep0.swz<`0`, `1`, `2`>();
404	// Estimate of color-component spread in low endpoint color
405	float df = hmax_s(abs(pdif));
406
407	int b = static_cast<int>(bf);
408	int c = static_cast<int>(cf);
409	int d = static_cast<int>(df);
410
411	// Determine which one of the 6 submodes is likely to be used in case of an RGBO-mode
412	int rgbo_mode = `5`; // 7 bits per component
413	// mode 4: 8 7 6
414	if (b < `32768` && c < `16384`)
415	{
416	rgbo_mode = `4`;
417	}
418
419	// mode 3: 9 6 7
420	if (b < `8192` && c < `16384`)
421	{
422	rgbo_mode = `3`;
423	}
424
425	// mode 2: 10 5 8
426	if (b < `2048` && c < `16384`)
427	{
428	rgbo_mode = `2`;
429	}
430
431	// mode 1: 11 6 5
432	if (b < `2048` && c < `1024`)
433	{
434	rgbo_mode = `1`;
435	}
436
437	// mode 0: 11 5 7
438	if (b < `1024` && c < `4096`)
439	{
440	rgbo_mode = `0`;
441	}
442
443	// Determine which one of the 9 submodes is likely to be used in case of an RGB-mode.
444	int rgb_mode = `8`; // 8 bits per component, except 7 bits for blue
445
446	// mode 0: 9 7 6 7
447	if (b < `16384` && c < `8192` && d < `8192`)
448	{
449	rgb_mode = `0`;
450	}
451
452	// mode 1: 9 8 6 6
453	if (b < `32768` && c < `8192` && d < `4096`)
454	{
455	rgb_mode = `1`;
456	}
457
458	// mode 2: 10 6 7 7
459	if (b < `4096` && c < `8192` && d < `4096`)
460	{
461	rgb_mode = `2`;
462	}
463
464	// mode 3: 10 7 7 6
465	if (b < `8192` && c < `8192` && d < `2048`)
466	{
467	rgb_mode = `3`;
468	}
469
470	// mode 4: 11 8 6 5
471	if (b < `8192` && c < `2048` && d < `512`)
472	{
473	rgb_mode = `4`;
474	}
475
476	// mode 5: 11 6 8 6
477	if (b < `2048` && c < `8192` && d < `1024`)
478	{
479	rgb_mode = `5`;
480	}
481
482	// mode 6: 12 7 7 5
483	if (b < `2048` && c < `2048` && d < `256`)
484	{
485	rgb_mode = `6`;
486	}
487
488	// mode 7: 12 6 7 6
489	if (b < `1024` && c < `2048` && d < `512`)
490	{
491	rgb_mode = `7`;
492	}
493
494	static const float rgbo_error_scales[`6`] { `4.0f`, `4.0f`, `16.0f`, `64.0f`, `256.0f`, `1024.0f` };
495	static const float rgb_error_scales[`9`] { `64.0f`, `64.0f`, `16.0f`, `16.0f`, `4.0f`, `4.0f`, `1.0f`, `1.0f`, `384.0f` };
496
497	float mode7mult = rgbo_error_scales[rgbo_mode] * `0.0015f`; // Empirically determined ....
498	float mode11mult = rgb_error_scales[rgb_mode] * `0.010f`; // Empirically determined ....
499
500
501	float lum_high = hadd_rgb_s(ep1) * (`1.0f` / `3.0f`);
502	float lum_low = hadd_rgb_s(ep0) * (`1.0f` / `3.0f`);
503	float lumdif = lum_high - lum_low;
504	float mode23mult = lumdif < `960` ? `4.0f` : lumdif < `3968` ? `16.0f` : `128.0f`;
505
506	mode23mult = `0.0005f`; // Empirically determined ....*
507
508	// Pick among the available HDR endpoint modes
509	for (int i = QUANT_2; i < QUANT_16; i++)
510	{
511	best_error[i][`3`] = ERROR_CALC_DEFAULT;
512	best_error[i][`2`] = ERROR_CALC_DEFAULT;
513	best_error[i][`1`] = ERROR_CALC_DEFAULT;
514	best_error[i][`0`] = ERROR_CALC_DEFAULT;
515
516	format_of_choice[i][`3`] = static_cast<uint8_t>(encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA);
517	format_of_choice[i][`2`] = FMT_HDR_RGB;
518	format_of_choice[i][`1`] = FMT_HDR_RGB_SCALE;
519	format_of_choice[i][`0`] = FMT_HDR_LUMINANCE_LARGE_RANGE;
520	}
521
522	for (int i = QUANT_16; i <= QUANT_256; i++)
523	{
524	// The base_quant_error should depend on the scale-factor that would be used during
525	// actual encode of the color value
526
527	float base_quant_error = baseline_quant_error[i - QUANT_6] * static_cast<float>(partition_size);
528	float rgb_quantization_error = error_weight_rgbsum * base_quant_error * `2.0f`;
529	float alpha_quantization_error = error_weight.lane<`3`>() * base_quant_error * `2.0f`;
530	float rgba_quantization_error = rgb_quantization_error + alpha_quantization_error;
531
532	// For 8 integers, we have two encodings: one with HDR A and another one with LDR A
533
534	float full_hdr_rgba_error = rgba_quantization_error + rgb_range_error + alpha_range_error;
535	best_error[i][`3`] = full_hdr_rgba_error;
536	format_of_choice[i][`3`] = static_cast<uint8_t>(encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA);
537
538	// For 6 integers, we have one HDR-RGB encoding
539	float full_hdr_rgb_error = (rgb_quantization_error * mode11mult) + rgb_range_error + eci.alpha_drop_error;
540	best_error[i][`2`] = full_hdr_rgb_error;
541	format_of_choice[i][`2`] = FMT_HDR_RGB;
542
543	// For 4 integers, we have one HDR-RGB-Scale encoding
544	float hdr_rgb_scale_error = (rgb_quantization_error * mode7mult) + rgb_range_error + eci.alpha_drop_error + eci.rgb_luma_error;
545
546	best_error[i][`1`] = hdr_rgb_scale_error;
547	format_of_choice[i][`1`] = FMT_HDR_RGB_SCALE;
548
549	// For 2 integers, we assume luminance-with-large-range
550	float hdr_luminance_error = (rgb_quantization_error * mode23mult) + rgb_range_error + eci.alpha_drop_error + eci.luminance_error;
551	best_error[i][`0`] = hdr_luminance_error;
552	format_of_choice[i][`0`] = FMT_HDR_LUMINANCE_LARGE_RANGE;
553	}
554	}
555	else
556	{
557	for (int i = QUANT_2; i < QUANT_6; i++)
558	{
559	best_error[i][`3`] = ERROR_CALC_DEFAULT;
560	best_error[i][`2`] = ERROR_CALC_DEFAULT;
561	best_error[i][`1`] = ERROR_CALC_DEFAULT;
562	best_error[i][`0`] = ERROR_CALC_DEFAULT;
563
564	format_of_choice[i][`3`] = FMT_RGBA;
565	format_of_choice[i][`2`] = FMT_RGB;
566	format_of_choice[i][`1`] = FMT_RGB_SCALE;
567	format_of_choice[i][`0`] = FMT_LUMINANCE;
568	}
569
570	float base_quant_error_rgb = error_weight_rgbsum * static_cast<float>(partition_size);
571	float base_quant_error_a = error_weight.lane<`3`>() * static_cast<float>(partition_size);
572	float base_quant_error_rgba = base_quant_error_rgb + base_quant_error_a;
573
574	float error_scale_bc_rgba = eci.can_blue_contract ? `0.625f` : `1.0f`;
575	float error_scale_oe_rgba = eci.can_offset_encode ? `0.5f` : `1.0f`;
576
577	float error_scale_bc_rgb = eci.can_blue_contract ? `0.5f` : `1.0f`;
578	float error_scale_oe_rgb = eci.can_offset_encode ? `0.25f` : `1.0f`;
579
580	// Pick among the available LDR endpoint modes
581	for (int i = QUANT_6; i <= QUANT_256; i++)
582	{
583	// Offset encoding not possible at higher quant levels
584	if (i >= QUANT_192)
585	{
586	error_scale_oe_rgba = `1.0f`;
587	error_scale_oe_rgb = `1.0f`;
588	}
589
590	float base_quant_error = baseline_quant_error[i - QUANT_6];
591	float quant_error_rgb = base_quant_error_rgb * base_quant_error;
592	float quant_error_rgba = base_quant_error_rgba * base_quant_error;
593
594	// 8 integers can encode as RGBA+RGBA
595	float full_ldr_rgba_error = quant_error_rgba
596	* error_scale_bc_rgba
597	* error_scale_oe_rgba
598	+ rgb_range_error
599	+ alpha_range_error;
600
601	best_error[i][`3`] = full_ldr_rgba_error;
602	format_of_choice[i][`3`] = FMT_RGBA;
603
604	// 6 integers can encode as RGB+RGB or RGBS+AA
605	float full_ldr_rgb_error = quant_error_rgb
606	* error_scale_bc_rgb
607	* error_scale_oe_rgb
608	+ rgb_range_error
609	+ eci.alpha_drop_error;
610
611	float rgbs_alpha_error = quant_error_rgba
612	+ eci.rgb_scale_error
613	+ rgb_range_error
614	+ alpha_range_error;
615
616	if (rgbs_alpha_error < full_ldr_rgb_error)
617	{
618	best_error[i][`2`] = rgbs_alpha_error;
619	format_of_choice[i][`2`] = FMT_RGB_SCALE_ALPHA;
620	}
621	else
622	{
623	best_error[i][`2`] = full_ldr_rgb_error;
624	format_of_choice[i][`2`] = FMT_RGB;
625	}
626
627	// 4 integers can encode as RGBS or LA+LA
628	float ldr_rgbs_error = quant_error_rgb
629	+ rgb_range_error
630	+ eci.alpha_drop_error
631	+ eci.rgb_scale_error;
632
633	float lum_alpha_error = quant_error_rgba
634	+ rgb_range_error
635	+ alpha_range_error
636	+ eci.luminance_error;
637
638	if (ldr_rgbs_error < lum_alpha_error)
639	{
640	best_error[i][`1`] = ldr_rgbs_error;
641	format_of_choice[i][`1`] = FMT_RGB_SCALE;
642	}
643	else
644	{
645	best_error[i][`1`] = lum_alpha_error;
646	format_of_choice[i][`1`] = FMT_LUMINANCE_ALPHA;
647	}
648
649	// 2 integers can encode as L+L
650	float luminance_error = quant_error_rgb
651	+ rgb_range_error
652	+ eci.alpha_drop_error
653	+ eci.luminance_error;
654
655	best_error[i][`0`] = luminance_error;
656	format_of_choice[i][`0`] = FMT_LUMINANCE;
657	}
658	}
659	}
660
661	/**
662	* @brief For one partition compute the best format and quantization for a given bit count.
663	*
664	* @param best_combined_error The best error for each quant level and integer count.
665	* @param best_combined_format The best format for each quant level and integer count.
666	* @param bits_available The number of bits available for encoding.
667	* @param[out] best_quant_level The output best color quant level.
668	* @param[out] best_format The output best color format.
669	*
670	* @return The output error for the best pairing.
671	*/
672	static float one_partition_find_best_combination_for_bitcount(
673	const float best_combined_error[`21`][`4`],
674	const uint8_t best_combined_format[`21`][`4`],
675	int bits_available,
676	uint8_t& best_quant_level,
677	uint8_t& best_format
678	) {
679	int best_integer_count = `0`;
680	float best_integer_count_error = ERROR_CALC_DEFAULT;
681
682	for (int integer_count = `1`; integer_count <= `4`; integer_count++)
683	{
684	// Compute the quantization level for a given number of integers and a given number of bits
685	int quant_level = quant_mode_table[integer_count][bits_available];
686
687	// Don't have enough bits to represent a given endpoint format at all!
688	if (quant_level < QUANT_6)
689	{
690	continue;
691	}
692
693	float integer_count_error = best_combined_error[quant_level][integer_count - `1`];
694	if (integer_count_error < best_integer_count_error)
695	{
696	best_integer_count_error = integer_count_error;
697	best_integer_count = integer_count - `1`;
698	}
699	}
700
701	int ql = quant_mode_table[best_integer_count + `1`][bits_available];
702
703	best_quant_level = static_cast<uint8_t>(ql);
704	best_format = FMT_LUMINANCE;
705
706	if (ql >= QUANT_6)
707	{
708	best_format = best_combined_format[ql][best_integer_count];
709	}
710
711	return best_integer_count_error;
712	}
713
714	/**
715	* @brief For 2 partitions compute the best format combinations for every pair of quant mode and integer count.
716	*
717	* @param best_error The best error for a single endpoint quant level and integer count.
718	* @param best_format The best format for a single endpoint quant level and integer count.
719	* @param[out] best_combined_error The best combined error pairings for the 2 partitions.
720	* @param[out] best_combined_format The best combined format pairings for the 2 partitions.
721	*/
722	static void two_partitions_find_best_combination_for_every_quantization_and_integer_count(
723	const float best_error[`2`][`21`][`4`], // indexed by (partition, quant-level, integer-pair-count-minus-1)
724	const uint8_t best_format[`2`][`21`][`4`],
725	float best_combined_error[`21`][`7`], // indexed by (quant-level, integer-pair-count-minus-2)
726	uint8_t best_combined_format[`21`][`7`][`2`]
727	) {
728	for (int i = QUANT_2; i <= QUANT_256; i++)
729	{
730	for (int j = `0`; j < `7`; j++)
731	{
732	best_combined_error[i][j] = ERROR_CALC_DEFAULT;
733	}
734	}
735
736	for (int quant = QUANT_6; quant <= QUANT_256; quant++)
737	{
738	for (int i = `0`; i < `4`; i++) // integer-count for first endpoint-pair
739	{
740	for (int j = `0`; j < `4`; j++) // integer-count for second endpoint-pair
741	{
742	int low2 = astc::min(i, j);
743	int high2 = astc::max(i, j);
744	if ((high2 - low2) > `1`)
745	{
746	continue;
747	}
748
749	int intcnt = i + j;
750	float errorterm = astc::min(best_error[`0`][quant][i] + best_error[`1`][quant][j], `1e10f`);
751	if (errorterm <= best_combined_error[quant][intcnt])
752	{
753	best_combined_error[quant][intcnt] = errorterm;
754	best_combined_format[quant][intcnt][`0`] = best_format[`0`][quant][i];
755	best_combined_format[quant][intcnt][`1`] = best_format[`1`][quant][j];
756	}
757	}
758	}
759	}
760	}
761
762	/**
763	* @brief For 2 partitions compute the best format and quantization for a given bit count.
764	*
765	* @param best_combined_error The best error for each quant level and integer count.
766	* @param best_combined_format The best format for each quant level and integer count.
767	* @param bits_available The number of bits available for encoding.
768	* @param[out] best_quant_level The output best color quant level.
769	* @param[out] best_quant_level_mod The output best color quant level assuming two more bits are available.
770	* @param[out] best_formats The output best color formats.
771	*
772	* @return The output error for the best pairing.
773	*/
774	static float two_partitions_find_best_combination_for_bitcount(
775	float best_combined_error[`21`][`7`],
776	uint8_t best_combined_format[`21`][`7`][`2`],
777	int bits_available,
778	uint8_t& best_quant_level,
779	uint8_t& best_quant_level_mod,
780	uint8_t* best_formats
781	) {
782	int best_integer_count = `0`;
783	float best_integer_count_error = ERROR_CALC_DEFAULT;
784
785	for (int integer_count = `2`; integer_count <= `8`; integer_count++)
786	{
787	// Compute the quantization level for a given number of integers and a given number of bits
788	int quant_level = quant_mode_table[integer_count][bits_available];
789
790	// Don't have enough bits to represent a given endpoint format at all!
791	if (quant_level < QUANT_6)
792	{
793	break;
794	}
795
796	float integer_count_error = best_combined_error[quant_level][integer_count - `2`];
797	if (integer_count_error < best_integer_count_error)
798	{
799	best_integer_count_error = integer_count_error;
800	best_integer_count = integer_count;
801	}
802	}
803
804	int ql = quant_mode_table[best_integer_count][bits_available];
805	int ql_mod = quant_mode_table[best_integer_count][bits_available + `2`];
806
807	best_quant_level = static_cast<uint8_t>(ql);
808	best_quant_level_mod = static_cast<uint8_t>(ql_mod);
809
810	if (ql >= QUANT_6)
811	{
812	for (int i = `0`; i < `2`; i++)
813	{
814	best_formats[i] = best_combined_format[ql][best_integer_count - `2`][i];
815	}
816	}
817	else
818	{
819	for (int i = `0`; i < `2`; i++)
820	{
821	best_formats[i] = FMT_LUMINANCE;
822	}
823	}
824
825	return best_integer_count_error;
826	}
827
828	/**
829	* @brief For 3 partitions compute the best format combinations for every pair of quant mode and integer count.
830	*
831	* @param best_error The best error for a single endpoint quant level and integer count.
832	* @param best_format The best format for a single endpoint quant level and integer count.
833	* @param[out] best_combined_error The best combined error pairings for the 3 partitions.
834	* @param[out] best_combined_format The best combined format pairings for the 3 partitions.
835	*/
836	static void three_partitions_find_best_combination_for_every_quantization_and_integer_count(
837	const float best_error[`3`][`21`][`4`], // indexed by (partition, quant-level, integer-count)
838	const uint8_t best_format[`3`][`21`][`4`],
839	float best_combined_error[`21`][`10`],
840	uint8_t best_combined_format[`21`][`10`][`3`]
841	) {
842	for (int i = QUANT_2; i <= QUANT_256; i++)
843	{
844	for (int j = `0`; j < `10`; j++)
845	{
846	best_combined_error[i][j] = ERROR_CALC_DEFAULT;
847	}
848	}
849
850	for (int quant = QUANT_6; quant <= QUANT_256; quant++)
851	{
852	for (int i = `0`; i < `4`; i++) // integer-count for first endpoint-pair
853	{
854	for (int j = `0`; j < `4`; j++) // integer-count for second endpoint-pair
855	{
856	int low2 = astc::min(i, j);
857	int high2 = astc::max(i, j);
858	if ((high2 - low2) > `1`)
859	{
860	continue;
861	}
862
863	for (int k = `0`; k < `4`; k++) // integer-count for third endpoint-pair
864	{
865	int low3 = astc::min(k, low2);
866	int high3 = astc::max(k, high2);
867	if ((high3 - low3) > `1`)
868	{
869	continue;
870	}
871
872	int intcnt = i + j + k;
873	float errorterm = astc::min(best_error[`0`][quant][i] + best_error[`1`][quant][j] + best_error[`2`][quant][k], `1e10f`);
874	if (errorterm <= best_combined_error[quant][intcnt])
875	{
876	best_combined_error[quant][intcnt] = errorterm;
877	best_combined_format[quant][intcnt][`0`] = best_format[`0`][quant][i];
878	best_combined_format[quant][intcnt][`1`] = best_format[`1`][quant][j];
879	best_combined_format[quant][intcnt][`2`] = best_format[`2`][quant][k];
880	}
881	}
882	}
883	}
884	}
885	}
886
887	/**
888	* @brief For 3 partitions compute the best format and quantization for a given bit count.
889	*
890	* @param best_combined_error The best error for each quant level and integer count.
891	* @param best_combined_format The best format for each quant level and integer count.
892	* @param bits_available The number of bits available for encoding.
893	* @param[out] best_quant_level The output best color quant level.
894	* @param[out] best_quant_level_mod The output best color quant level assuming two more bits are available.
895	* @param[out] best_formats The output best color formats.
896	*
897	* @return The output error for the best pairing.
898	*/
899	static float three_partitions_find_best_combination_for_bitcount(
900	const float best_combined_error[`21`][`10`],
901	const uint8_t best_combined_format[`21`][`10`][`3`],
902	int bits_available,
903	uint8_t& best_quant_level,
904	uint8_t& best_quant_level_mod,
905	uint8_t* best_formats
906	) {
907	int best_integer_count = `0`;
908	float best_integer_count_error = ERROR_CALC_DEFAULT;
909
910	for (int integer_count = `3`; integer_count <= `9`; integer_count++)
911	{
912	// Compute the quantization level for a given number of integers and a given number of bits
913	int quant_level = quant_mode_table[integer_count][bits_available];
914
915	// Don't have enough bits to represent a given endpoint format at all!
916	if (quant_level < QUANT_6)
917	{
918	break;
919	}
920
921	float integer_count_error = best_combined_error[quant_level][integer_count - `3`];
922	if (integer_count_error < best_integer_count_error)
923	{
924	best_integer_count_error = integer_count_error;
925	best_integer_count = integer_count;
926	}
927	}
928
929	int ql = quant_mode_table[best_integer_count][bits_available];
930	int ql_mod = quant_mode_table[best_integer_count][bits_available + `5`];
931
932	best_quant_level = static_cast<uint8_t>(ql);
933	best_quant_level_mod = static_cast<uint8_t>(ql_mod);
934
935	if (ql >= QUANT_6)
936	{
937	for (int i = `0`; i < `3`; i++)
938	{
939	best_formats[i] = best_combined_format[ql][best_integer_count - `3`][i];
940	}
941	}
942	else
943	{
944	for (int i = `0`; i < `3`; i++)
945	{
946	best_formats[i] = FMT_LUMINANCE;
947	}
948	}
949
950	return best_integer_count_error;
951	}
952
953	/**
954	* @brief For 4 partitions compute the best format combinations for every pair of quant mode and integer count.
955	*
956	* @param best_error The best error for a single endpoint quant level and integer count.
957	* @param best_format The best format for a single endpoint quant level and integer count.
958	* @param[out] best_combined_error The best combined error pairings for the 4 partitions.
959	* @param[out] best_combined_format The best combined format pairings for the 4 partitions.
960	*/
961	static void four_partitions_find_best_combination_for_every_quantization_and_integer_count(
962	const float best_error[`4`][`21`][`4`], // indexed by (partition, quant-level, integer-count)
963	const uint8_t best_format[`4`][`21`][`4`],
964	float best_combined_error[`21`][`13`],
965	uint8_t best_combined_format[`21`][`13`][`4`]
966	) {
967	for (int i = QUANT_2; i <= QUANT_256; i++)
968	{
969	for (int j = `0`; j < `13`; j++)
970	{
971	best_combined_error[i][j] = ERROR_CALC_DEFAULT;
972	}
973	}
974
975	for (int quant = QUANT_6; quant <= QUANT_256; quant++)
976	{
977	for (int i = `0`; i < `4`; i++) // integer-count for first endpoint-pair
978	{
979	for (int j = `0`; j < `4`; j++) // integer-count for second endpoint-pair
980	{
981	int low2 = astc::min(i, j);
982	int high2 = astc::max(i, j);
983	if ((high2 - low2) > `1`)
984	{
985	continue;
986	}
987
988	for (int k = `0`; k < `4`; k++) // integer-count for third endpoint-pair
989	{
990	int low3 = astc::min(k, low2);
991	int high3 = astc::max(k, high2);
992	if ((high3 - low3) > `1`)
993	{
994	continue;
995	}
996
997	for (int l = `0`; l < `4`; l++) // integer-count for fourth endpoint-pair
998	{
999	int low4 = astc::min(l, low3);
1000	int high4 = astc::max(l, high3);
1001	if ((high4 - low4) > `1`)
1002	{
1003	continue;
1004	}
1005
1006	int intcnt = i + j + k + l;
1007	float errorterm = astc::min(best_error[`0`][quant][i] + best_error[`1`][quant][j] + best_error[`2`][quant][k] + best_error[`3`][quant][l], `1e10f`);
1008	if (errorterm <= best_combined_error[quant][intcnt])
1009	{
1010	best_combined_error[quant][intcnt] = errorterm;
1011	best_combined_format[quant][intcnt][`0`] = best_format[`0`][quant][i];
1012	best_combined_format[quant][intcnt][`1`] = best_format[`1`][quant][j];
1013	best_combined_format[quant][intcnt][`2`] = best_format[`2`][quant][k];
1014	best_combined_format[quant][intcnt][`3`] = best_format[`3`][quant][l];
1015	}
1016	}
1017	}
1018	}
1019	}
1020	}
1021	}
1022
1023	/**
1024	* @brief For 4 partitions compute the best format and quantization for a given bit count.
1025	*
1026	* @param best_combined_error The best error for each quant level and integer count.
1027	* @param best_combined_format The best format for each quant level and integer count.
1028	* @param bits_available The number of bits available for encoding.
1029	* @param[out] best_quant_level The output best color quant level.
1030	* @param[out] best_quant_level_mod The output best color quant level assuming two more bits are available.
1031	* @param[out] best_formats The output best color formats.
1032	*
1033	* @return best_error The output error for the best pairing.
1034	*/
1035	static float four_partitions_find_best_combination_for_bitcount(
1036	const float best_combined_error[`21`][`13`],
1037	const uint8_t best_combined_format[`21`][`13`][`4`],
1038	int bits_available,
1039	uint8_t& best_quant_level,
1040	uint8_t& best_quant_level_mod,
1041	uint8_t* best_formats
1042	) {
1043	int best_integer_count = `0`;
1044	float best_integer_count_error = ERROR_CALC_DEFAULT;
1045
1046	for (int integer_count = `4`; integer_count <= `9`; integer_count++)
1047	{
1048	// Compute the quantization level for a given number of integers and a given number of bits
1049	int quant_level = quant_mode_table[integer_count][bits_available];
1050
1051	// Don't have enough bits to represent a given endpoint format at all!
1052	if (quant_level < QUANT_6)
1053	{
1054	break;
1055	}
1056
1057	float integer_count_error = best_combined_error[quant_level][integer_count - `4`];
1058	if (integer_count_error < best_integer_count_error)
1059	{
1060	best_integer_count_error = integer_count_error;
1061	best_integer_count = integer_count;
1062	}
1063	}
1064
1065	int ql = quant_mode_table[best_integer_count][bits_available];
1066	int ql_mod = quant_mode_table[best_integer_count][bits_available + `8`];
1067
1068	best_quant_level = static_cast<uint8_t>(ql);
1069	best_quant_level_mod = static_cast<uint8_t>(ql_mod);
1070
1071	if (ql >= QUANT_6)
1072	{
1073	for (int i = `0`; i < `4`; i++)
1074	{
1075	best_formats[i] = best_combined_format[ql][best_integer_count - `4`][i];
1076	}
1077	}
1078	else
1079	{
1080	for (int i = `0`; i < `4`; i++)
1081	{
1082	best_formats[i] = FMT_LUMINANCE;
1083	}
1084	}
1085
1086	return best_integer_count_error;
1087	}
1088
1089	/ See header for documentation. /
1090	unsigned int compute_ideal_endpoint_formats(
1091	const partition_info& pi,
1092	const image_block& blk,
1093	const endpoints& ep,
1094	// bitcounts and errors computed for the various quantization methods
1095	const int8_t* qwt_bitcounts,
1096	const float* qwt_errors,
1097	unsigned int tune_candidate_limit,
1098	unsigned int start_block_mode,
1099	unsigned int end_block_mode,
1100	// output data
1101	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS],
1102	int block_mode[TUNE_MAX_TRIAL_CANDIDATES],
1103	quant_method quant_level[TUNE_MAX_TRIAL_CANDIDATES],
1104	quant_method quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES],
1105	compression_working_buffers& tmpbuf
1106	) {
1107	int partition_count = pi.partition_count;
1108
1109	promise(partition_count > `0`);
1110
1111	bool encode_hdr_rgb = static_cast<bool>(blk.rgb_lns[`0`]);
1112	bool encode_hdr_alpha = static_cast<bool>(blk.alpha_lns[`0`]);
1113
1114	// Compute the errors that result from various encoding choices (such as using luminance instead
1115	// of RGB, discarding Alpha, using RGB-scale in place of two separate RGB endpoints and so on)
1116	encoding_choice_errors eci[BLOCK_MAX_PARTITIONS];
1117	compute_encoding_choice_errors(blk, pi, ep, eci);
1118
1119	float best_error[BLOCK_MAX_PARTITIONS][`21`][`4`];
1120	uint8_t format_of_choice[BLOCK_MAX_PARTITIONS][`21`][`4`];
1121	for (int i = `0`; i < partition_count; i++)
1122	{
1123	compute_color_error_for_every_integer_count_and_quant_level(
1124	encode_hdr_rgb, encode_hdr_alpha, i,
1125	pi, eci[i], ep, blk.channel_weight, best_error[i],
1126	format_of_choice[i]);
1127	}
1128
1129	float* errors_of_best_combination = tmpbuf.errors_of_best_combination;
1130	uint8_t* best_quant_levels = tmpbuf.best_quant_levels;
1131	uint8_t* best_quant_levels_mod = tmpbuf.best_quant_levels_mod;
1132	uint8_t (&best_ep_formats)[WEIGHTS_MAX_BLOCK_MODES][BLOCK_MAX_PARTITIONS] = tmpbuf.best_ep_formats;
1133
1134	// Ensure that the first iteration understep contains data that will never be picked
1135	vfloat clear_error(ERROR_CALC_DEFAULT);
1136	vint clear_quant(`0`);
1137
1138	unsigned int packed_start_block_mode = round_down_to_simd_multiple_vla(start_block_mode);
1139	storea(clear_error, errors_of_best_combination + packed_start_block_mode);
1140	store_nbytes(clear_quant, best_quant_levels + packed_start_block_mode);
1141	store_nbytes(clear_quant, best_quant_levels_mod + packed_start_block_mode);
1142
1143	// Ensure that last iteration overstep contains data that will never be picked
1144	unsigned int packed_end_block_mode = round_down_to_simd_multiple_vla(end_block_mode - `1`);
1145	storea(clear_error, errors_of_best_combination + packed_end_block_mode);
1146	store_nbytes(clear_quant, best_quant_levels + packed_end_block_mode);
1147	store_nbytes(clear_quant, best_quant_levels_mod + packed_end_block_mode);
1148
1149	// Track a scalar best to avoid expensive search at least once ...
1150	float error_of_best_combination = ERROR_CALC_DEFAULT;
1151	int index_of_best_combination = -`1`;
1152
1153	// The block contains 1 partition
1154	if (partition_count == `1`)
1155	{
1156	for (unsigned int i = start_block_mode; i < end_block_mode; i++)
1157	{
1158	if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
1159	{
1160	errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
1161	continue;
1162	}
1163
1164	float error_of_best = one_partition_find_best_combination_for_bitcount(
1165	best_error[`0`], format_of_choice[`0`], qwt_bitcounts[i],
1166	best_quant_levels[i], best_ep_formats[i][`0`]);
1167
1168	float total_error = error_of_best + qwt_errors[i];
1169	errors_of_best_combination[i] = total_error;
1170	best_quant_levels_mod[i] = best_quant_levels[i];
1171
1172	if (total_error < error_of_best_combination)
1173	{
1174	error_of_best_combination = total_error;
1175	index_of_best_combination = i;
1176	}
1177	}
1178	}
1179	// The block contains 2 partitions
1180	else if (partition_count == `2`)
1181	{
1182	float combined_best_error[`21`][`7`];
1183	uint8_t formats_of_choice[`21`][`7`][`2`];
1184
1185	two_partitions_find_best_combination_for_every_quantization_and_integer_count(
1186	best_error, format_of_choice, combined_best_error, formats_of_choice);
1187
1188	assert(start_block_mode == `0`);
1189	for (unsigned int i = `0`; i < end_block_mode; i++)
1190	{
1191	if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
1192	{
1193	errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
1194	continue;
1195	}
1196
1197	float error_of_best = two_partitions_find_best_combination_for_bitcount(
1198	combined_best_error, formats_of_choice, qwt_bitcounts[i],
1199	best_quant_levels[i], best_quant_levels_mod[i],
1200	best_ep_formats[i]);
1201
1202	float total_error = error_of_best + qwt_errors[i];
1203	errors_of_best_combination[i] = total_error;
1204
1205	if (total_error < error_of_best_combination)
1206	{
1207	error_of_best_combination = total_error;
1208	index_of_best_combination = i;
1209	}
1210	}
1211	}
1212	// The block contains 3 partitions
1213	else if (partition_count == `3`)
1214	{
1215	float combined_best_error[`21`][`10`];
1216	uint8_t formats_of_choice[`21`][`10`][`3`];
1217
1218	three_partitions_find_best_combination_for_every_quantization_and_integer_count(
1219	best_error, format_of_choice, combined_best_error, formats_of_choice);
1220
1221	assert(start_block_mode == `0`);
1222	for (unsigned int i = `0`; i < end_block_mode; i++)
1223	{
1224	if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
1225	{
1226	errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
1227	continue;
1228	}
1229
1230	float error_of_best = three_partitions_find_best_combination_for_bitcount(
1231	combined_best_error, formats_of_choice, qwt_bitcounts[i],
1232	best_quant_levels[i], best_quant_levels_mod[i],
1233	best_ep_formats[i]);
1234
1235	float total_error = error_of_best + qwt_errors[i];
1236	errors_of_best_combination[i] = total_error;
1237
1238	if (total_error < error_of_best_combination)
1239	{
1240	error_of_best_combination = total_error;
1241	index_of_best_combination = i;
1242	}
1243	}
1244	}
1245	// The block contains 4 partitions
1246	else // if (partition_count == 4)
1247	{
1248	assert(partition_count == `4`);
1249	float combined_best_error[`21`][`13`];
1250	uint8_t formats_of_choice[`21`][`13`][`4`];
1251
1252	four_partitions_find_best_combination_for_every_quantization_and_integer_count(
1253	best_error, format_of_choice, combined_best_error, formats_of_choice);
1254
1255	assert(start_block_mode == `0`);
1256	for (unsigned int i = `0`; i < end_block_mode; i++)
1257	{
1258	if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
1259	{
1260	errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
1261	continue;
1262	}
1263
1264	float error_of_best = four_partitions_find_best_combination_for_bitcount(
1265	combined_best_error, formats_of_choice, qwt_bitcounts[i],
1266	best_quant_levels[i], best_quant_levels_mod[i],
1267	best_ep_formats[i]);
1268
1269	float total_error = error_of_best + qwt_errors[i];
1270	errors_of_best_combination[i] = total_error;
1271
1272	if (total_error < error_of_best_combination)
1273	{
1274	error_of_best_combination = total_error;
1275	index_of_best_combination = i;
1276	}
1277	}
1278	}
1279
1280	int best_error_weights[TUNE_MAX_TRIAL_CANDIDATES];
1281
1282	// Fast path the first result and avoid the list search for trial 0
1283	best_error_weights[`0`] = index_of_best_combination;
1284	if (index_of_best_combination >= `0`)
1285	{
1286	errors_of_best_combination[index_of_best_combination] = ERROR_CALC_DEFAULT;
1287	}
1288
1289	// Search the remaining results and pick the best candidate modes for trial 1+
1290	for (unsigned int i = `1`; i < tune_candidate_limit; i++)
1291	{
1292	vint vbest_error_index(-`1`);
1293	vfloat vbest_ep_error(ERROR_CALC_DEFAULT);
1294
1295	start_block_mode = round_down_to_simd_multiple_vla(start_block_mode);
1296	vint lane_ids = vint::lane_id() + vint (start_block_mode);
1297	for (unsigned int j = start_block_mode; j < end_block_mode; j += ASTCENC_SIMD_WIDTH)
1298	{
1299	vfloat err = vfloat (errors_of_best_combination + j);
1300	vmask mask = err < vbest_ep_error;
1301	vbest_ep_error = select(vbest_ep_error, err, mask);
1302	vbest_error_index = select(vbest_error_index, lane_ids, mask);
1303	lane_ids += vint (ASTCENC_SIMD_WIDTH);
1304	}
1305
1306	// Pick best mode from the SIMD result, using lowest matching index to ensure invariance
1307	vmask lanes_min_error = vbest_ep_error == hmin(vbest_ep_error);
1308	vbest_error_index = select(vint (`0x7FFFFFFF`), vbest_error_index, lanes_min_error);
1309	vbest_error_index = hmin(vbest_error_index);
1310	int best_error_index = vbest_error_index.lane<`0`>();
1311
1312	best_error_weights[i] = best_error_index;
1313
1314	// Max the error for this candidate so we don't pick it again
1315	if (best_error_index >= `0`)
1316	{
1317	errors_of_best_combination[best_error_index] = ERROR_CALC_DEFAULT;
1318	}
1319	// Early-out if no more candidates are valid
1320	else
1321	{
1322	break;
1323	}
1324	}
1325
1326	for (unsigned int i = `0`; i < tune_candidate_limit; i++)
1327	{
1328	if (best_error_weights[i] < `0`)
1329	{
1330	return i;
1331	}
1332
1333	block_mode[i] = best_error_weights[i];
1334
1335	quant_level[i] = static_cast<quant_method>(best_quant_levels[best_error_weights[i]]);
1336	quant_level_mod[i] = static_cast<quant_method>(best_quant_levels_mod[best_error_weights[i]]);
1337
1338	assert(quant_level[i] >= QUANT_6 && quant_level[i] <= QUANT_256);
1339	assert(quant_level_mod[i] >= QUANT_6 && quant_level_mod[i] <= QUANT_256);
1340
1341	for (int j = `0`; j < partition_count; j++)
1342	{
1343	partition_format_specifiers[i][j] = best_ep_formats[best_error_weights[i]][j];
1344	}
1345	}
1346
1347	return tune_candidate_limit;
1348	}
1349
1350	#endif
1351

Browse the source code of Godot/thirdparty/astcenc/astcenc_pick_best_endpoint_format.cpp