astcenc_ideal_endpoints_and_weights.cpp source code [Godot/thirdparty/astcenc/astcenc_ideal_endpoints_and_weights.cpp]

1	// SPDX-License-Identifier: Apache-2.0
2	// ----------------------------------------------------------------------------
3	// Copyright 2011-2023 Arm Limited
4	//
5	// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6	// use this file except in compliance with the License. You may obtain a copy
7	// of the License at:
8	//
9	// http://www.apache.org/licenses/LICENSE-2.0
10	//
11	// Unless required by applicable law or agreed to in writing, software
12	// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13	// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14	// License for the specific language governing permissions and limitations
15	// under the License.
16	// ----------------------------------------------------------------------------
17
18	#if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20	/**
21	* @brief Functions for computing color endpoints and texel weights.
22	*/
23
24	#include <cassert>
25
26	#include "astcenc_internal.h"
27	#include "astcenc_vecmathlib.h"
28
29	/**
30	* @brief Compute the infilled weight for N texel indices in a decimated grid.
31	*
32	* @param di The weight grid decimation to use.
33	* @param weights The decimated weight values to use.
34	* @param index The first texel index to interpolate.
35	*
36	* @return The interpolated weight for the given set of SIMD_WIDTH texels.
37	*/
38	static vfloat bilinear_infill_vla(
39	const decimation_info& di,
40	const float* weights,
41	unsigned int index
42	) {
43	// Load the bilinear filter texel weight indexes in the decimated grid
44	vint weight_idx0 = vint (di.texel_weights_tr[`0`] + index);
45	vint weight_idx1 = vint (di.texel_weights_tr[`1`] + index);
46	vint weight_idx2 = vint (di.texel_weights_tr[`2`] + index);
47	vint weight_idx3 = vint (di.texel_weights_tr[`3`] + index);
48
49	// Load the bilinear filter weights from the decimated grid
50	vfloat weight_val0 = gatherf(weights, weight_idx0);
51	vfloat weight_val1 = gatherf(weights, weight_idx1);
52	vfloat weight_val2 = gatherf(weights, weight_idx2);
53	vfloat weight_val3 = gatherf(weights, weight_idx3);
54
55	// Load the weight contribution factors for each decimated weight
56	vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[`0`] + index);
57	vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[`1`] + index);
58	vfloat tex_weight_float2 = loada(di.texel_weight_contribs_float_tr[`2`] + index);
59	vfloat tex_weight_float3 = loada(di.texel_weight_contribs_float_tr[`3`] + index);
60
61	// Compute the bilinear interpolation to generate the per-texel weight
62	return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1) +
63	(weight_val2 * tex_weight_float2 + weight_val3 * tex_weight_float3);
64	}
65
66	/**
67	* @brief Compute the infilled weight for N texel indices in a decimated grid.
68	*
69	* This is specialized version which computes only two weights per texel for
70	* encodings that are only decimated in a single axis.
71	*
72	* @param di The weight grid decimation to use.
73	* @param weights The decimated weight values to use.
74	* @param index The first texel index to interpolate.
75	*
76	* @return The interpolated weight for the given set of SIMD_WIDTH texels.
77	*/
78	static vfloat bilinear_infill_vla_2(
79	const decimation_info& di,
80	const float* weights,
81	unsigned int index
82	) {
83	// Load the bilinear filter texel weight indexes in the decimated grid
84	vint weight_idx0 = vint (di.texel_weights_tr[`0`] + index);
85	vint weight_idx1 = vint (di.texel_weights_tr[`1`] + index);
86
87	// Load the bilinear filter weights from the decimated grid
88	vfloat weight_val0 = gatherf(weights, weight_idx0);
89	vfloat weight_val1 = gatherf(weights, weight_idx1);
90
91	// Load the weight contribution factors for each decimated weight
92	vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[`0`] + index);
93	vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[`1`] + index);
94
95	// Compute the bilinear interpolation to generate the per-texel weight
96	return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1);
97	}
98
99	/**
100	* @brief Compute the ideal endpoints and weights for 1 color component.
101	*
102	* @param blk The image block color data to compress.
103	* @param pi The partition info for the current trial.
104	* @param[out] ei The computed ideal endpoints and weights.
105	* @param component The color component to compute.
106	*/
107	static void compute_ideal_colors_and_weights_1_comp(
108	const image_block& blk,
109	const partition_info& pi,
110	endpoints_and_weights& ei,
111	unsigned int component
112	) {
113	unsigned int partition_count = pi.partition_count;
114	ei.ep.partition_count = partition_count;
115	promise(partition_count > `0`);
116
117	unsigned int texel_count = blk.texel_count;
118	promise(texel_count > `0`);
119
120	float error_weight;
121	const float* data_vr = nullptr;
122
123	assert(component < BLOCK_MAX_COMPONENTS);
124	switch (component)
125	{
126	case `0`:
127	error_weight = blk.channel_weight.lane<`0`>();
128	data_vr = blk.data_r;
129	break;
130	case `1`:
131	error_weight = blk.channel_weight.lane<`1`>();
132	data_vr = blk.data_g;
133	break;
134	case `2`:
135	error_weight = blk.channel_weight.lane<`2`>();
136	data_vr = blk.data_b;
137	break;
138	default:
139	assert(component == `3`);
140	error_weight = blk.channel_weight.lane<`3`>();
141	data_vr = blk.data_a;
142	break;
143	}
144
145	vmask4 sep_mask = vint4::lane_id() == vint4 (component);
146	bool is_constant_wes { true };
147	float partition0_len_sq { `0.0f` };
148
149	for (unsigned int i = `0`; i < partition_count; i++)
150	{
151	float lowvalue { `1e10f` };
152	float highvalue { -`1e10f` };
153
154	unsigned int partition_texel_count = pi.partition_texel_count[i];
155	for (unsigned int j = `0`; j < partition_texel_count; j++)
156	{
157	unsigned int tix = pi.texels_of_partition[i][j];
158	float value = data_vr[tix];
159	lowvalue = astc::min(value, lowvalue);
160	highvalue = astc::max(value, highvalue);
161	}
162
163	if (highvalue <= lowvalue)
164	{
165	lowvalue = `0.0f`;
166	highvalue = `1e-7f`;
167	}
168
169	float length = highvalue - lowvalue;
170	float length_squared = length * length;
171	float scale = `1.0f` / length;
172
173	if (i == `0`)
174	{
175	partition0_len_sq = length_squared;
176	}
177	else
178	{
179	is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
180	}
181
182	for (unsigned int j = `0`; j < partition_texel_count; j++)
183	{
184	unsigned int tix = pi.texels_of_partition[i][j];
185	float value = (data_vr[tix] - lowvalue) * scale;
186	value = astc::clamp1f(value);
187
188	ei.weights[tix] = value;
189	ei.weight_error_scale[tix] = length_squared * error_weight;
190	assert(!astc::isnan(ei.weight_error_scale[tix]));
191	}
192
193	ei.ep.endpt0[i] = select(blk.data_min, vfloat4 (lowvalue), sep_mask);
194	ei.ep.endpt1[i] = select(blk.data_max, vfloat4 (highvalue), sep_mask);
195	}
196
197	// Zero initialize any SIMD over-fetch
198	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
199	for (unsigned int i = texel_count; i < texel_count_simd; i++)
200	{
201	ei.weights[i] = `0.0f`;
202	ei.weight_error_scale[i] = `0.0f`;
203	}
204
205	ei.is_constant_weight_error_scale = is_constant_wes;
206	}
207
208	/**
209	* @brief Compute the ideal endpoints and weights for 2 color components.
210	*
211	* @param blk The image block color data to compress.
212	* @param pi The partition info for the current trial.
213	* @param[out] ei The computed ideal endpoints and weights.
214	* @param component1 The first color component to compute.
215	* @param component2 The second color component to compute.
216	*/
217	static void compute_ideal_colors_and_weights_2_comp(
218	const image_block& blk,
219	const partition_info& pi,
220	endpoints_and_weights& ei,
221	int component1,
222	int component2
223	) {
224	unsigned int partition_count = pi.partition_count;
225	ei.ep.partition_count = partition_count;
226	promise(partition_count > `0`);
227
228	unsigned int texel_count = blk.texel_count;
229	promise(texel_count > `0`);
230
231	partition_metrics pms[BLOCK_MAX_PARTITIONS];
232
233	float error_weight;
234	const float* data_vr = nullptr;
235	const float* data_vg = nullptr;
236
237	if (component1 == `0` && component2 == `1`)
238	{
239	error_weight = hadd_s(blk.channel_weight.swz<`0`, `1`>()) / `2.0f`;
240
241	data_vr = blk.data_r;
242	data_vg = blk.data_g;
243	}
244	else if (component1 == `0` && component2 == `2`)
245	{
246	error_weight = hadd_s(blk.channel_weight.swz<`0`, `2`>()) / `2.0f`;
247
248	data_vr = blk.data_r;
249	data_vg = blk.data_b;
250	}
251	else // (component1 == 1 && component2 == 2)
252	{
253	assert(component1 == `1` && component2 == `2`);
254
255	error_weight = hadd_s(blk.channel_weight.swz<`1`, `2`>()) / `2.0f`;
256
257	data_vr = blk.data_g;
258	data_vg = blk.data_b;
259	}
260
261	compute_avgs_and_dirs_2_comp(pi, blk, component1, component2, pms);
262
263	bool is_constant_wes { true };
264	float partition0_len_sq { `0.0f` };
265
266	vmask4 comp1_mask = vint4::lane_id() == vint4 (component1);
267	vmask4 comp2_mask = vint4::lane_id() == vint4 (component2);
268
269	for (unsigned int i = `0`; i < partition_count; i++)
270	{
271	vfloat4 dir = pms[i].dir;
272	if (hadd_s(dir) < `0.0f`)
273	{
274	dir = vfloat4::zero() - dir;
275	}
276
277	line2 line { pms[i].avg, normalize_safe(dir, unit2()) };
278	float lowparam { `1e10f` };
279	float highparam { -`1e10f` };
280
281	unsigned int partition_texel_count = pi.partition_texel_count[i];
282	for (unsigned int j = `0`; j < partition_texel_count; j++)
283	{
284	unsigned int tix = pi.texels_of_partition[i][j];
285	vfloat4 point = vfloat2(data_vr[tix], data_vg[tix]);
286	float param = dot_s(point - line.a, line.b);
287	ei.weights[tix] = param;
288
289	lowparam = astc::min(param, lowparam);
290	highparam = astc::max(param, highparam);
291	}
292
293	// It is possible for a uniform-color partition to produce length=0;
294	// this causes NaN issues so set to small value to avoid this problem
295	if (highparam <= lowparam)
296	{
297	lowparam = `0.0f`;
298	highparam = `1e-7f`;
299	}
300
301	float length = highparam - lowparam;
302	float length_squared = length * length;
303	float scale = `1.0f` / length;
304
305	if (i == `0`)
306	{
307	partition0_len_sq = length_squared;
308	}
309	else
310	{
311	is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
312	}
313
314	for (unsigned int j = `0`; j < partition_texel_count; j++)
315	{
316	unsigned int tix = pi.texels_of_partition[i][j];
317	float idx = (ei.weights[tix] - lowparam) * scale;
318	idx = astc::clamp1f(idx);
319
320	ei.weights[tix] = idx;
321	ei.weight_error_scale[tix] = length_squared * error_weight;
322	assert(!astc::isnan(ei.weight_error_scale[tix]));
323	}
324
325	vfloat4 lowvalue = line.a + line.b * lowparam;
326	vfloat4 highvalue = line.a + line.b * highparam;
327
328	vfloat4 ep0 = select(blk.data_min, vfloat4 (lowvalue.lane<`0`>()), comp1_mask);
329	vfloat4 ep1 = select(blk.data_max, vfloat4 (highvalue.lane<`0`>()), comp1_mask);
330
331	ei.ep.endpt0[i] = select(ep0, vfloat4 (lowvalue.lane<`1`>()), comp2_mask);
332	ei.ep.endpt1[i] = select(ep1, vfloat4 (highvalue.lane<`1`>()), comp2_mask);
333	}
334
335	// Zero initialize any SIMD over-fetch
336	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
337	for (unsigned int i = texel_count; i < texel_count_simd; i++)
338	{
339	ei.weights[i] = `0.0f`;
340	ei.weight_error_scale[i] = `0.0f`;
341	}
342
343	ei.is_constant_weight_error_scale = is_constant_wes;
344	}
345
346	/**
347	* @brief Compute the ideal endpoints and weights for 3 color components.
348	*
349	* @param blk The image block color data to compress.
350	* @param pi The partition info for the current trial.
351	* @param[out] ei The computed ideal endpoints and weights.
352	* @param omitted_component The color component excluded from the calculation.
353	*/
354	static void compute_ideal_colors_and_weights_3_comp(
355	const image_block& blk,
356	const partition_info& pi,
357	endpoints_and_weights& ei,
358	unsigned int omitted_component
359	) {
360	unsigned int partition_count = pi.partition_count;
361	ei.ep.partition_count = partition_count;
362	promise(partition_count > `0`);
363
364	unsigned int texel_count = blk.texel_count;
365	promise(texel_count > `0`);
366
367	partition_metrics pms[BLOCK_MAX_PARTITIONS];
368
369	float error_weight;
370	const float* data_vr = nullptr;
371	const float* data_vg = nullptr;
372	const float* data_vb = nullptr;
373	if (omitted_component == `0`)
374	{
375	error_weight = hadd_s(blk.channel_weight.swz<`0`, `1`, `2`>());
376	data_vr = blk.data_g;
377	data_vg = blk.data_b;
378	data_vb = blk.data_a;
379	}
380	else if (omitted_component == `1`)
381	{
382	error_weight = hadd_s(blk.channel_weight.swz<`0`, `2`, `3`>());
383	data_vr = blk.data_r;
384	data_vg = blk.data_b;
385	data_vb = blk.data_a;
386	}
387	else if (omitted_component == `2`)
388	{
389	error_weight = hadd_s(blk.channel_weight.swz<`0`, `1`, `3`>());
390	data_vr = blk.data_r;
391	data_vg = blk.data_g;
392	data_vb = blk.data_a;
393	}
394	else
395	{
396	assert(omitted_component == `3`);
397
398	error_weight = hadd_s(blk.channel_weight.swz<`0`, `1`, `2`>());
399	data_vr = blk.data_r;
400	data_vg = blk.data_g;
401	data_vb = blk.data_b;
402	}
403
404	error_weight = error_weight * (`1.0f` / `3.0f`);
405
406	if (omitted_component == `3`)
407	{
408	compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
409	}
410	else
411	{
412	compute_avgs_and_dirs_3_comp(pi, blk, omitted_component, pms);
413	}
414
415	bool is_constant_wes { true };
416	float partition0_len_sq { `0.0f` };
417
418	for (unsigned int i = `0`; i < partition_count; i++)
419	{
420	vfloat4 dir = pms[i].dir;
421	if (hadd_rgb_s(dir) < `0.0f`)
422	{
423	dir = vfloat4::zero() - dir;
424	}
425
426	line3 line { pms[i].avg, normalize_safe(dir, unit3()) };
427	float lowparam { `1e10f` };
428	float highparam { -`1e10f` };
429
430	unsigned int partition_texel_count = pi.partition_texel_count[i];
431	for (unsigned int j = `0`; j < partition_texel_count; j++)
432	{
433	unsigned int tix = pi.texels_of_partition[i][j];
434	vfloat4 point = vfloat3(data_vr[tix], data_vg[tix], data_vb[tix]);
435	float param = dot3_s(point - line.a, line.b);
436	ei.weights[tix] = param;
437
438	lowparam = astc::min(param, lowparam);
439	highparam = astc::max(param, highparam);
440	}
441
442	// It is possible for a uniform-color partition to produce length=0;
443	// this causes NaN issues so set to small value to avoid this problem
444	if (highparam <= lowparam)
445	{
446	lowparam = `0.0f`;
447	highparam = `1e-7f`;
448	}
449
450	float length = highparam - lowparam;
451	float length_squared = length * length;
452	float scale = `1.0f` / length;
453
454	if (i == `0`)
455	{
456	partition0_len_sq = length_squared;
457	}
458	else
459	{
460	is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
461	}
462
463	for (unsigned int j = `0`; j < partition_texel_count; j++)
464	{
465	unsigned int tix = pi.texels_of_partition[i][j];
466	float idx = (ei.weights[tix] - lowparam) * scale;
467	idx = astc::clamp1f(idx);
468
469	ei.weights[tix] = idx;
470	ei.weight_error_scale[tix] = length_squared * error_weight;
471	assert(!astc::isnan(ei.weight_error_scale[tix]));
472	}
473
474	vfloat4 ep0 = line.a + line.b * lowparam;
475	vfloat4 ep1 = line.a + line.b * highparam;
476
477	vfloat4 bmin = blk.data_min;
478	vfloat4 bmax = blk.data_max;
479
480	assert(omitted_component < BLOCK_MAX_COMPONENTS);
481	switch (omitted_component)
482	{
483	case `0`:
484	ei.ep.endpt0[i] = vfloat4 (bmin.lane<`0`>(), ep0.lane<`0`>(), ep0.lane<`1`>(), ep0.lane<`2`>());
485	ei.ep.endpt1[i] = vfloat4 (bmax.lane<`0`>(), ep1.lane<`0`>(), ep1.lane<`1`>(), ep1.lane<`2`>());
486	break;
487	case `1`:
488	ei.ep.endpt0[i] = vfloat4 (ep0.lane<`0`>(), bmin.lane<`1`>(), ep0.lane<`1`>(), ep0.lane<`2`>());
489	ei.ep.endpt1[i] = vfloat4 (ep1.lane<`0`>(), bmax.lane<`1`>(), ep1.lane<`1`>(), ep1.lane<`2`>());
490	break;
491	case `2`:
492	ei.ep.endpt0[i] = vfloat4 (ep0.lane<`0`>(), ep0.lane<`1`>(), bmin.lane<`2`>(), ep0.lane<`2`>());
493	ei.ep.endpt1[i] = vfloat4 (ep1.lane<`0`>(), ep1.lane<`1`>(), bmax.lane<`2`>(), ep1.lane<`2`>());
494	break;
495	default:
496	ei.ep.endpt0[i] = vfloat4 (ep0.lane<`0`>(), ep0.lane<`1`>(), ep0.lane<`2`>(), bmin.lane<`3`>());
497	ei.ep.endpt1[i] = vfloat4 (ep1.lane<`0`>(), ep1.lane<`1`>(), ep1.lane<`2`>(), bmax.lane<`3`>());
498	break;
499	}
500	}
501
502	// Zero initialize any SIMD over-fetch
503	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
504	for (unsigned int i = texel_count; i < texel_count_simd; i++)
505	{
506	ei.weights[i] = `0.0f`;
507	ei.weight_error_scale[i] = `0.0f`;
508	}
509
510	ei.is_constant_weight_error_scale = is_constant_wes;
511	}
512
513	/**
514	* @brief Compute the ideal endpoints and weights for 4 color components.
515	*
516	* @param blk The image block color data to compress.
517	* @param pi The partition info for the current trial.
518	* @param[out] ei The computed ideal endpoints and weights.
519	*/
520	static void compute_ideal_colors_and_weights_4_comp(
521	const image_block& blk,
522	const partition_info& pi,
523	endpoints_and_weights& ei
524	) {
525	const float error_weight = hadd_s(blk.channel_weight) / `4.0f`;
526
527	unsigned int partition_count = pi.partition_count;
528
529	unsigned int texel_count = blk.texel_count;
530	promise(texel_count > `0`);
531	promise(partition_count > `0`);
532
533	partition_metrics pms[BLOCK_MAX_PARTITIONS];
534
535	compute_avgs_and_dirs_4_comp(pi, blk, pms);
536
537	bool is_constant_wes { true };
538	float partition0_len_sq { `0.0f` };
539
540	for (unsigned int i = `0`; i < partition_count; i++)
541	{
542	vfloat4 dir = pms[i].dir;
543	if (hadd_rgb_s(dir) < `0.0f`)
544	{
545	dir = vfloat4::zero() - dir;
546	}
547
548	line4 line { pms[i].avg, normalize_safe(dir, unit4()) };
549	float lowparam { `1e10f` };
550	float highparam { -`1e10f` };
551
552	unsigned int partition_texel_count = pi.partition_texel_count[i];
553	for (unsigned int j = `0`; j < partition_texel_count; j++)
554	{
555	unsigned int tix = pi.texels_of_partition[i][j];
556	vfloat4 point = blk.texel(tix);
557	float param = dot_s(point - line.a, line.b);
558	ei.weights[tix] = param;
559
560	lowparam = astc::min(param, lowparam);
561	highparam = astc::max(param, highparam);
562	}
563
564	// It is possible for a uniform-color partition to produce length=0;
565	// this causes NaN issues so set to small value to avoid this problem
566	if (highparam <= lowparam)
567	{
568	lowparam = `0.0f`;
569	highparam = `1e-7f`;
570	}
571
572	float length = highparam - lowparam;
573	float length_squared = length * length;
574	float scale = `1.0f` / length;
575
576	if (i == `0`)
577	{
578	partition0_len_sq = length_squared;
579	}
580	else
581	{
582	is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
583	}
584
585	ei.ep.endpt0[i] = line.a + line.b * lowparam;
586	ei.ep.endpt1[i] = line.a + line.b * highparam;
587
588	for (unsigned int j = `0`; j < partition_texel_count; j++)
589	{
590	unsigned int tix = pi.texels_of_partition[i][j];
591	float idx = (ei.weights[tix] - lowparam) * scale;
592	idx = astc::clamp1f(idx);
593
594	ei.weights[tix] = idx;
595	ei.weight_error_scale[tix] = length_squared * error_weight;
596	assert(!astc::isnan(ei.weight_error_scale[tix]));
597	}
598	}
599
600	// Zero initialize any SIMD over-fetch
601	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
602	for (unsigned int i = texel_count; i < texel_count_simd; i++)
603	{
604	ei.weights[i] = `0.0f`;
605	ei.weight_error_scale[i] = `0.0f`;
606	}
607
608	ei.is_constant_weight_error_scale = is_constant_wes;
609	}
610
611	/ See header for documentation. /
612	void compute_ideal_colors_and_weights_1plane(
613	const image_block& blk,
614	const partition_info& pi,
615	endpoints_and_weights& ei
616	) {
617	bool uses_alpha = !blk.is_constant_channel(`3`);
618
619	if (uses_alpha)
620	{
621	compute_ideal_colors_and_weights_4_comp(blk, pi, ei);
622	}
623	else
624	{
625	compute_ideal_colors_and_weights_3_comp(blk, pi, ei, `3`);
626	}
627	}
628
629	/ See header for documentation. /
630	void compute_ideal_colors_and_weights_2planes(
631	const block_size_descriptor& bsd,
632	const image_block& blk,
633	unsigned int plane2_component,
634	endpoints_and_weights& ei1,
635	endpoints_and_weights& ei2
636	) {
637	const auto& pi = bsd.get_partition_info(`1`, `0`);
638	bool uses_alpha = !blk.is_constant_channel(`3`);
639
640	assert(plane2_component < BLOCK_MAX_COMPONENTS);
641	switch (plane2_component)
642	{
643	case `0`: // Separate weights for red
644	if (uses_alpha)
645	{
646	compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, `0`);
647	}
648	else
649	{
650	compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, `1`, `2`);
651	}
652	compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, `0`);
653	break;
654
655	case `1`: // Separate weights for green
656	if (uses_alpha)
657	{
658	compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, `1`);
659	}
660	else
661	{
662	compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, `0`, `2`);
663	}
664	compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, `1`);
665	break;
666
667	case `2`: // Separate weights for blue
668	if (uses_alpha)
669	{
670	compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, `2`);
671	}
672	else
673	{
674	compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, `0`, `1`);
675	}
676	compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, `2`);
677	break;
678
679	default: // Separate weights for alpha
680	assert(uses_alpha);
681	compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, `3`);
682	compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, `3`);
683	break;
684	}
685	}
686
687	/ See header for documentation. /
688	float compute_error_of_weight_set_1plane(
689	const endpoints_and_weights& eai,
690	const decimation_info& di,
691	const float* dec_weight_quant_uvalue
692	) {
693	vfloatacc error_summav = vfloatacc::zero();
694	unsigned int texel_count = di.texel_count;
695	promise(texel_count > `0`);
696
697	// Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
698	if (di.max_texel_weight_count > `2`)
699	{
700	for (unsigned int i = `0`; i < texel_count; i += ASTCENC_SIMD_WIDTH)
701	{
702	// Compute the bilinear interpolation of the decimated weight grid
703	vfloat current_values = bilinear_infill_vla(di, dec_weight_quant_uvalue, i);
704
705	// Compute the error between the computed value and the ideal weight
706	vfloat actual_values = loada(eai.weights + i);
707	vfloat diff = current_values - actual_values;
708	vfloat significance = loada(eai.weight_error_scale + i);
709	vfloat error = diff * diff * significance;
710
711	haccumulate(error_summav, error);
712	}
713	}
714	else if (di.max_texel_weight_count > `1`)
715	{
716	for (unsigned int i = `0`; i < texel_count; i += ASTCENC_SIMD_WIDTH)
717	{
718	// Compute the bilinear interpolation of the decimated weight grid
719	vfloat current_values = bilinear_infill_vla_2(di, dec_weight_quant_uvalue, i);
720
721	// Compute the error between the computed value and the ideal weight
722	vfloat actual_values = loada(eai.weights + i);
723	vfloat diff = current_values - actual_values;
724	vfloat significance = loada(eai.weight_error_scale + i);
725	vfloat error = diff * diff * significance;
726
727	haccumulate(error_summav, error);
728	}
729	}
730	else
731	{
732	for (unsigned int i = `0`; i < texel_count; i += ASTCENC_SIMD_WIDTH)
733	{
734	// Load the weight set directly, without interpolation
735	vfloat current_values = loada(dec_weight_quant_uvalue + i);
736
737	// Compute the error between the computed value and the ideal weight
738	vfloat actual_values = loada(eai.weights + i);
739	vfloat diff = current_values - actual_values;
740	vfloat significance = loada(eai.weight_error_scale + i);
741	vfloat error = diff * diff * significance;
742
743	haccumulate(error_summav, error);
744	}
745	}
746
747	// Resolve the final scalar accumulator sum
748	return hadd_s(error_summav);
749	}
750
751	/ See header for documentation. /
752	float compute_error_of_weight_set_2planes(
753	const endpoints_and_weights& eai1,
754	const endpoints_and_weights& eai2,
755	const decimation_info& di,
756	const float* dec_weight_quant_uvalue_plane1,
757	const float* dec_weight_quant_uvalue_plane2
758	) {
759	vfloatacc error_summav = vfloatacc::zero();
760	unsigned int texel_count = di.texel_count;
761	promise(texel_count > `0`);
762
763	// Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
764	if (di.max_texel_weight_count > `2`)
765	{
766	for (unsigned int i = `0`; i < texel_count; i += ASTCENC_SIMD_WIDTH)
767	{
768	// Plane 1
769	// Compute the bilinear interpolation of the decimated weight grid
770	vfloat current_values1 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane1, i);
771
772	// Compute the error between the computed value and the ideal weight
773	vfloat actual_values1 = loada(eai1.weights + i);
774	vfloat diff = current_values1 - actual_values1;
775	vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
776
777	// Plane 2
778	// Compute the bilinear interpolation of the decimated weight grid
779	vfloat current_values2 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane2, i);
780
781	// Compute the error between the computed value and the ideal weight
782	vfloat actual_values2 = loada(eai2.weights + i);
783	diff = current_values2 - actual_values2;
784	vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
785
786	haccumulate(error_summav, error1 + error2);
787	}
788	}
789	else if (di.max_texel_weight_count > `1`)
790	{
791	for (unsigned int i = `0`; i < texel_count; i += ASTCENC_SIMD_WIDTH)
792	{
793	// Plane 1
794	// Compute the bilinear interpolation of the decimated weight grid
795	vfloat current_values1 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane1, i);
796
797	// Compute the error between the computed value and the ideal weight
798	vfloat actual_values1 = loada(eai1.weights + i);
799	vfloat diff = current_values1 - actual_values1;
800	vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
801
802	// Plane 2
803	// Compute the bilinear interpolation of the decimated weight grid
804	vfloat current_values2 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane2, i);
805
806	// Compute the error between the computed value and the ideal weight
807	vfloat actual_values2 = loada(eai2.weights + i);
808	diff = current_values2 - actual_values2;
809	vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
810
811	haccumulate(error_summav, error1 + error2);
812	}
813	}
814	else
815	{
816	for (unsigned int i = `0`; i < texel_count; i += ASTCENC_SIMD_WIDTH)
817	{
818	// Plane 1
819	// Load the weight set directly, without interpolation
820	vfloat current_values1 = loada(dec_weight_quant_uvalue_plane1 + i);
821
822	// Compute the error between the computed value and the ideal weight
823	vfloat actual_values1 = loada(eai1.weights + i);
824	vfloat diff = current_values1 - actual_values1;
825	vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
826
827	// Plane 2
828	// Load the weight set directly, without interpolation
829	vfloat current_values2 = loada(dec_weight_quant_uvalue_plane2 + i);
830
831	// Compute the error between the computed value and the ideal weight
832	vfloat actual_values2 = loada(eai2.weights + i);
833	diff = current_values2 - actual_values2;
834	vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
835
836	haccumulate(error_summav, error1 + error2);
837	}
838	}
839
840	// Resolve the final scalar accumulator sum
841	return hadd_s(error_summav);
842	}
843
844	/ See header for documentation. /
845	void compute_ideal_weights_for_decimation(
846	const endpoints_and_weights& ei,
847	const decimation_info& di,
848	float* dec_weight_ideal_value
849	) {
850	unsigned int texel_count = di.texel_count;
851	unsigned int weight_count = di.weight_count;
852	bool is_direct = texel_count == weight_count;
853	promise(texel_count > `0`);
854	promise(weight_count > `0`);
855
856	// Ensure that the end of the output arrays that are used for SIMD paths later are filled so we
857	// can safely run SIMD elsewhere without a loop tail. Note that this is always safe as weight
858	// arrays always contain space for 64 elements
859	unsigned int prev_weight_count_simd = round_down_to_simd_multiple_vla(weight_count - `1`);
860	storea(vfloat::zero(), dec_weight_ideal_value + prev_weight_count_simd);
861
862	// If we have a 1:1 mapping just shortcut the computation. Transfer enough to also copy the
863	// zero-initialized SIMD over-fetch region
864	if (is_direct)
865	{
866	for (unsigned int i = `0`; i < texel_count; i += ASTCENC_SIMD_WIDTH)
867	{
868	vfloat weight(ei.weights + i);
869	storea(weight, dec_weight_ideal_value + i);
870	}
871
872	return;
873	}
874
875	// Otherwise compute an estimate and perform single refinement iteration
876	alignas(ASTCENC_VECALIGN) float infilled_weights[BLOCK_MAX_TEXELS];
877
878	// Compute an initial average for each decimated weight
879	bool constant_wes = ei.is_constant_weight_error_scale;
880	vfloat weight_error_scale(ei.weight_error_scale[`0`]);
881
882	// This overshoots - this is OK as we initialize the array tails in the
883	// decimation table structures to safe values ...
884	for (unsigned int i = `0`; i < weight_count; i += ASTCENC_SIMD_WIDTH)
885	{
886	// Start with a small value to avoid div-by-zero later
887	vfloat weight_weight(`1e-10f`);
888	vfloat initial_weight = vfloat::zero();
889
890	// Accumulate error weighting of all the texels using this weight
891	vint weight_texel_count(di.weight_texel_count + i);
892	unsigned int max_texel_count = hmax(weight_texel_count).lane<`0`>();
893	promise(max_texel_count > `0`);
894
895	for (unsigned int j = `0`; j < max_texel_count; j++)
896	{
897	vint texel(di.weight_texels_tr[j] + i);
898	vfloat weight = loada(di.weights_texel_contribs_tr[j] + i);
899
900	if (!constant_wes)
901	{
902	weight_error_scale = gatherf(ei.weight_error_scale, texel);
903	}
904
905	vfloat contrib_weight = weight * weight_error_scale;
906
907	weight_weight += contrib_weight;
908	initial_weight += gatherf(ei.weights, texel) * contrib_weight;
909	}
910
911	storea(initial_weight / weight_weight, dec_weight_ideal_value + i);
912	}
913
914	// Populate the interpolated weight grid based on the initial average
915	// Process SIMD-width texel coordinates at at time while we can. Safe to
916	// over-process full SIMD vectors - the tail is zeroed.
917	if (di.max_texel_weight_count <= `2`)
918	{
919	for (unsigned int i = `0`; i < texel_count; i += ASTCENC_SIMD_WIDTH)
920	{
921	vfloat weight = bilinear_infill_vla_2(di, dec_weight_ideal_value, i);
922	storea(weight, infilled_weights + i);
923	}
924	}
925	else
926	{
927	for (unsigned int i = `0`; i < texel_count; i += ASTCENC_SIMD_WIDTH)
928	{
929	vfloat weight = bilinear_infill_vla(di, dec_weight_ideal_value, i);
930	storea(weight, infilled_weights + i);
931	}
932	}
933
934	// Perform a single iteration of refinement
935	// Empirically determined step size; larger values don't help but smaller drops image quality
936	constexpr float stepsize = `0.25f`;
937	constexpr float chd_scale = -WEIGHTS_TEXEL_SUM;
938
939	for (unsigned int i = `0`; i < weight_count; i += ASTCENC_SIMD_WIDTH)
940	{
941	vfloat weight_val = loada(dec_weight_ideal_value + i);
942
943	// Accumulate error weighting of all the texels using this weight
944	// Start with a small value to avoid div-by-zero later
945	vfloat error_change0(`1e-10f`);
946	vfloat error_change1(`0.0f`);
947
948	// Accumulate error weighting of all the texels using this weight
949	vint weight_texel_count(di.weight_texel_count + i);
950	unsigned int max_texel_count = hmax(weight_texel_count).lane<`0`>();
951	promise(max_texel_count > `0`);
952
953	for (unsigned int j = `0`; j < max_texel_count; j++)
954	{
955	vint texel(di.weight_texels_tr[j] + i);
956	vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i);
957
958	if (!constant_wes)
959	{
960	weight_error_scale = gatherf(ei.weight_error_scale, texel);
961	}
962
963	vfloat scale = weight_error_scale * contrib_weight;
964	vfloat old_weight = gatherf(infilled_weights, texel);
965	vfloat ideal_weight = gatherf(ei.weights, texel);
966
967	error_change0 += contrib_weight * scale;
968	error_change1 += (old_weight - ideal_weight) * scale;
969	}
970
971	vfloat step = (error_change1 * chd_scale) / error_change0;
972	step = clamp(-stepsize, stepsize, step);
973
974	// Update the weight; note this can store negative values
975	storea(weight_val + step, dec_weight_ideal_value + i);
976	}
977	}
978
979	/ See header for documentation. /
980	void compute_quantized_weights_for_decimation(
981	const decimation_info& di,
982	float low_bound,
983	float high_bound,
984	const float* dec_weight_ideal_value,
985	float* weight_set_out,
986	uint8_t* quantized_weight_set,
987	quant_method quant_level
988	) {
989	int weight_count = di.weight_count;
990	promise(weight_count > `0`);
991	const quant_and_transfer_table& qat = quant_and_xfer_tables[quant_level];
992
993	// The available quant levels, stored with a minus 1 bias
994	static const float quant_levels_m1[`12`] {
995	`1.0f`, `2.0f`, `3.0f`, `4.0f`, `5.0f`, `7.0f`, `9.0f`, `11.0f`, `15.0f`, `19.0f`, `23.0f`, `31.0f`
996	};
997
998	vint steps_m1(get_quant_level(quant_level) - `1`);
999	float quant_level_m1 = quant_levels_m1[quant_level];
1000
1001	// Quantize the weight set using both the specified low/high bounds and standard 0..1 bounds
1002
1003	// TODO: Oddity to investigate; triggered by test in issue #265.
1004	if (high_bound <= low_bound)
1005	{
1006	low_bound = `0.0f`;
1007	high_bound = `1.0f`;
1008	}
1009
1010	float rscale = high_bound - low_bound;
1011	float scale = `1.0f` / rscale;
1012
1013	float scaled_low_bound = low_bound * scale;
1014	rscale *= `1.0f` / `64.0f`;
1015
1016	vfloat scalev(scale);
1017	vfloat scaled_low_boundv(scaled_low_bound);
1018	vfloat quant_level_m1v(quant_level_m1);
1019	vfloat rscalev(rscale);
1020	vfloat low_boundv(low_bound);
1021
1022	// This runs to the rounded-up SIMD size, which is safe as the loop tail is filled with known
1023	// safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements
1024	if (get_quant_level(quant_level) <= `16`)
1025	{
1026	vint4 tab0(reinterpret_cast<const int*>(qat.quant_to_unquant));
1027	vint tab0p;
1028	vtable_prepare(tab0, tab0p);
1029
1030	for (int i = `0`; i < weight_count; i += ASTCENC_SIMD_WIDTH)
1031	{
1032	vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv;
1033	ix = clampzo(ix);
1034
1035	// Look up the two closest indexes and return the one that was closest
1036	vfloat ix1 = ix * quant_level_m1v;
1037
1038	vint weightl = float_to_int(ix1);
1039	vint weighth = min(weightl + vint (`1`), steps_m1);
1040
1041	vint ixli = vtable_8bt_32bi(tab0p, weightl);
1042	vint ixhi = vtable_8bt_32bi(tab0p, weighth);
1043
1044	vfloat ixl = int_to_float(ixli);
1045	vfloat ixh = int_to_float(ixhi);
1046
1047	vmask mask = (ixl + ixh) < (vfloat (`128.0f`) * ix);
1048	vint weight = select(ixli, ixhi, mask);
1049	ixl = select(ixl, ixh, mask);
1050
1051	// Invert the weight-scaling that was done initially
1052	storea(ixl * rscalev + low_boundv, weight_set_out + i);
1053	vint scn = pack_low_bytes(weight);
1054	store_nbytes(scn, quantized_weight_set + i);
1055	}
1056	}
1057	else
1058	{
1059	vint4 tab0(reinterpret_cast<const int*>(qat.quant_to_unquant));
1060	vint4 tab1(reinterpret_cast<const int*>(qat.quant_to_unquant + `16`));
1061	vint tab0p, tab1p;
1062	vtable_prepare(tab0, tab1, tab0p, tab1p);
1063
1064	for (int i = `0`; i < weight_count; i += ASTCENC_SIMD_WIDTH)
1065	{
1066	vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv;
1067	ix = clampzo(ix);
1068
1069	// Look up the two closest indexes and return the one that was closest
1070	vfloat ix1 = ix * quant_level_m1v;
1071
1072	vint weightl = float_to_int(ix1);
1073	vint weighth = min(weightl + vint (`1`), steps_m1);
1074
1075	vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl);
1076	vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth);
1077
1078	vfloat ixl = int_to_float(ixli);
1079	vfloat ixh = int_to_float(ixhi);
1080
1081	vmask mask = (ixl + ixh) < (vfloat (`128.0f`) * ix);
1082	vint weight = select(ixli, ixhi, mask);
1083	ixl = select(ixl, ixh, mask);
1084
1085	// Invert the weight-scaling that was done initially
1086	storea(ixl * rscalev + low_boundv, weight_set_out + i);
1087	vint scn = pack_low_bytes(weight);
1088	store_nbytes(scn, quantized_weight_set + i);
1089	}
1090	}
1091	}
1092
1093	/**
1094	* @brief Compute the RGB + offset for a HDR endpoint mode #7.
1095	*
1096	* Since the matrix needed has a regular structure we can simplify the inverse calculation. This
1097	* gives us ~24 multiplications vs. 96 for a generic inverse.
1098	*
1099	* mat[0] = vfloat4(rgba_ws.x, 0.0f, 0.0f, wght_ws.x);
1100	* mat[1] = vfloat4( 0.0f, rgba_ws.y, 0.0f, wght_ws.y);
1101	* mat[2] = vfloat4( 0.0f, 0.0f, rgba_ws.z, wght_ws.z);
1102	* mat[3] = vfloat4(wght_ws.x, wght_ws.y, wght_ws.z, psum);
1103	* mat = invert(mat);
1104	*
1105	* @param rgba_weight_sum Sum of partition component error weights.
1106	* @param weight_weight_sum Sum of partition component error weights * texel weight.
1107	* @param rgbq_sum Sum of partition component error weights * texel weight * color data.
1108	* @param psum Sum of RGB color weights * texel weight^2.
1109	*/
1110	static inline vfloat4 compute_rgbo_vector(
1111	vfloat4 rgba_weight_sum,
1112	vfloat4 weight_weight_sum,
1113	vfloat4 rgbq_sum,
1114	float psum
1115	) {
1116	float X = rgba_weight_sum.lane<`0`>();
1117	float Y = rgba_weight_sum.lane<`1`>();
1118	float Z = rgba_weight_sum.lane<`2`>();
1119	float P = weight_weight_sum.lane<`0`>();
1120	float Q = weight_weight_sum.lane<`1`>();
1121	float R = weight_weight_sum.lane<`2`>();
1122	float S = psum;
1123
1124	float PP = P * P;
1125	float QQ = Q * Q;
1126	float RR = R * R;
1127
1128	float SZmRR = S * Z - RR;
1129	float DT = SZmRR * Y - Z * QQ;
1130	float YP = Y * P;
1131	float QX = Q * X;
1132	float YX = Y * X;
1133	float mZYP = -Z * YP;
1134	float mZQX = -Z * QX;
1135	float mRYX = -R * YX;
1136	float ZQP = Z * Q * P;
1137	float RYP = R * YP;
1138	float RQX = R * QX;
1139
1140	// Compute the reciprocal of matrix determinant
1141	float rdet = `1.0f` / (DT * X + mZYP * P);
1142
1143	// Actually compute the adjugate, and then apply 1/det separately
1144	vfloat4 mat0(DT, ZQP, RYP, mZYP);
1145	vfloat4 mat1(ZQP, SZmRR * X - Z * PP, RQX, mZQX);
1146	vfloat4 mat2(RYP, RQX, (S * Y - QQ) * X - Y * PP, mRYX);
1147	vfloat4 mat3(mZYP, mZQX, mRYX, Z * YX);
1148	vfloat4 vect = rgbq_sum * rdet;
1149
1150	return vfloat4 (dot_s(mat0, vect),
1151	dot_s(mat1, vect),
1152	dot_s(mat2, vect),
1153	dot_s(mat3, vect));
1154	}
1155
1156	/ See header for documentation. /
1157	void recompute_ideal_colors_1plane(
1158	const image_block& blk,
1159	const partition_info& pi,
1160	const decimation_info& di,
1161	const uint8_t* dec_weights_uquant,
1162	endpoints& ep,
1163	vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS],
1164	vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS]
1165	) {
1166	unsigned int weight_count = di.weight_count;
1167	unsigned int total_texel_count = blk.texel_count;
1168	unsigned int partition_count = pi.partition_count;
1169
1170	promise(weight_count > `0`);
1171	promise(total_texel_count > `0`);
1172	promise(partition_count > `0`);
1173
1174	alignas(ASTCENC_VECALIGN) float dec_weight[BLOCK_MAX_WEIGHTS];
1175	for (unsigned int i = `0`; i < weight_count; i += ASTCENC_SIMD_WIDTH)
1176	{
1177	vint unquant_value(dec_weights_uquant + i);
1178	vfloat unquant_valuef = int_to_float(unquant_value) * vfloat (`1.0f` / `64.0f`);
1179	storea(unquant_valuef, dec_weight + i);
1180	}
1181
1182	alignas(ASTCENC_VECALIGN) float undec_weight[BLOCK_MAX_TEXELS];
1183	float* undec_weight_ref;
1184	if (di.max_texel_weight_count == `1`)
1185	{
1186	undec_weight_ref = dec_weight;
1187	}
1188	else if (di.max_texel_weight_count <= `2`)
1189	{
1190	for (unsigned int i = `0`; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1191	{
1192	vfloat weight = bilinear_infill_vla_2(di, dec_weight, i);
1193	storea(weight, undec_weight + i);
1194	}
1195
1196	undec_weight_ref = undec_weight;
1197	}
1198	else
1199	{
1200	for (unsigned int i = `0`; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1201	{
1202	vfloat weight = bilinear_infill_vla(di, dec_weight, i);
1203	storea(weight, undec_weight + i);
1204	}
1205
1206	undec_weight_ref = undec_weight;
1207	}
1208
1209	vfloat4 rgba_sum(blk.data_mean * static_cast<float>(blk.texel_count));
1210
1211	for (unsigned int i = `0`; i < partition_count; i++)
1212	{
1213	unsigned int texel_count = pi.partition_texel_count[i];
1214	const uint8_t *texel_indexes = pi.texels_of_partition[i];
1215
1216	// Only compute a partition mean if more than one partition
1217	if (partition_count > `1`)
1218	{
1219	rgba_sum = vfloat4::zero();
1220	promise(texel_count > `0`);
1221	for (unsigned int j = `0`; j < texel_count; j++)
1222	{
1223	unsigned int tix = texel_indexes[j];
1224	rgba_sum += blk.texel(tix);
1225	}
1226	}
1227
1228	rgba_sum = rgba_sum * blk.channel_weight;
1229	vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), `1e-17f`);
1230	vfloat4 scale_dir = normalize((rgba_sum / rgba_weight_sum).swz<`0`, `1`, `2`>());
1231
1232	float scale_max = `0.0f`;
1233	float scale_min = `1e10f`;
1234
1235	float wmin1 = `1.0f`;
1236	float wmax1 = `0.0f`;
1237
1238	float left_sum_s = `0.0f`;
1239	float middle_sum_s = `0.0f`;
1240	float right_sum_s = `0.0f`;
1241
1242	vfloat4 color_vec_x = vfloat4::zero();
1243	vfloat4 color_vec_y = vfloat4::zero();
1244
1245	vfloat4 scale_vec = vfloat4::zero();
1246
1247	float weight_weight_sum_s = `1e-17f`;
1248
1249	vfloat4 color_weight = blk.channel_weight;
1250	float ls_weight = hadd_rgb_s(color_weight);
1251
1252	for (unsigned int j = `0`; j < texel_count; j++)
1253	{
1254	unsigned int tix = texel_indexes[j];
1255	vfloat4 rgba = blk.texel(tix);
1256
1257	float idx0 = undec_weight_ref[tix];
1258
1259	float om_idx0 = `1.0f` - idx0;
1260	wmin1 = astc::min(idx0, wmin1);
1261	wmax1 = astc::max(idx0, wmax1);
1262
1263	float scale = dot3_s(scale_dir, rgba);
1264	scale_min = astc::min(scale, scale_min);
1265	scale_max = astc::max(scale, scale_max);
1266
1267	left_sum_s += om_idx0 * om_idx0;
1268	middle_sum_s += om_idx0 * idx0;
1269	right_sum_s += idx0 * idx0;
1270	weight_weight_sum_s += idx0;
1271
1272	vfloat4 color_idx(idx0);
1273	vfloat4 cwprod = rgba;
1274	vfloat4 cwiprod = cwprod * color_idx;
1275
1276	color_vec_y += cwiprod;
1277	color_vec_x += cwprod - cwiprod;
1278
1279	scale_vec += vfloat2(om_idx0, idx0) * (scale * ls_weight);
1280	}
1281
1282	vfloat4 left_sum = vfloat4 (left_sum_s) * color_weight;
1283	vfloat4 middle_sum = vfloat4 (middle_sum_s) * color_weight;
1284	vfloat4 right_sum = vfloat4 (right_sum_s) * color_weight;
1285	vfloat4 lmrs_sum = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight;
1286
1287	color_vec_x = color_vec_x * color_weight;
1288	color_vec_y = color_vec_y * color_weight;
1289
1290	// Initialize the luminance and scale vectors with a reasonable default
1291	float scalediv = scale_min / astc::max(scale_max, `1e-10f`);
1292	scalediv = astc::clamp1f(scalediv);
1293
1294	vfloat4 sds = scale_dir * scale_max;
1295
1296	rgbs_vectors[i] = vfloat4 (sds.lane<`0`>(), sds.lane<`1`>(), sds.lane<`2`>(), scalediv);
1297
1298	if (wmin1 >= wmax1 * `0.999f`)
1299	{
1300	// If all weights in the partition were equal, then just take average of all colors in
1301	// the partition and use that as both endpoint colors
1302	vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1303
1304	vmask4 notnan_mask = avg == avg;
1305	ep.endpt0[i] = select(ep.endpt0[i], avg, notnan_mask);
1306	ep.endpt1[i] = select(ep.endpt1[i], avg, notnan_mask);
1307
1308	rgbs_vectors[i] = vfloat4 (sds.lane<`0`>(), sds.lane<`1`>(), sds.lane<`2`>(), `1.0f`);
1309	}
1310	else
1311	{
1312	// Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1313	// set of texel weights and pixel colors
1314	vfloat4 color_det1 = (left_sum * right_sum) - (middle_sum * middle_sum);
1315	vfloat4 color_rdet1 = `1.0f` / color_det1;
1316
1317	float ls_det1 = (lmrs_sum.lane<`0`>() * lmrs_sum.lane<`2`>()) - (lmrs_sum.lane<`1`>() * lmrs_sum.lane<`1`>());
1318	float ls_rdet1 = `1.0f` / ls_det1;
1319
1320	vfloat4 color_mss1 = (left_sum * left_sum)
1321	+ (`2.0f` * middle_sum * middle_sum)
1322	+ (right_sum * right_sum);
1323
1324	float ls_mss1 = (lmrs_sum.lane<`0`>() * lmrs_sum.lane<`0`>())
1325	+ (`2.0f` * lmrs_sum.lane<`1`>() * lmrs_sum.lane<`1`>())
1326	+ (lmrs_sum.lane<`2`>() * lmrs_sum.lane<`2`>());
1327
1328	vfloat4 ep0 = (right_sum * color_vec_x - middle_sum * color_vec_y) * color_rdet1;
1329	vfloat4 ep1 = (left_sum * color_vec_y - middle_sum * color_vec_x) * color_rdet1;
1330
1331	vmask4 det_mask = abs(color_det1) > (color_mss1 * `1e-4f`);
1332	vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1333	vmask4 full_mask = det_mask & notnan_mask;
1334
1335	ep.endpt0[i] = select(ep.endpt0[i], ep0, full_mask);
1336	ep.endpt1[i] = select(ep.endpt1[i], ep1, full_mask);
1337
1338	float scale_ep0 = (lmrs_sum.lane<`2`>() * scale_vec.lane<`0`>() - lmrs_sum.lane<`1`>() * scale_vec.lane<`1`>()) * ls_rdet1;
1339	float scale_ep1 = (lmrs_sum.lane<`0`>() * scale_vec.lane<`1`>() - lmrs_sum.lane<`1`>() * scale_vec.lane<`0`>()) * ls_rdet1;
1340
1341	if (fabsf(ls_det1) > (ls_mss1 * `1e-4f`) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
1342	{
1343	float scalediv2 = scale_ep0 / scale_ep1;
1344	vfloat4 sdsm = scale_dir * scale_ep1;
1345	rgbs_vectors[i] = vfloat4 (sdsm.lane<`0`>(), sdsm.lane<`1`>(), sdsm.lane<`2`>(), scalediv2);
1346	}
1347	}
1348
1349	// Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
1350	if (blk.rgb_lns[`0`] \|\| blk.alpha_lns[`0`])
1351	{
1352	vfloat4 weight_weight_sum = vfloat4 (weight_weight_sum_s) * color_weight;
1353	float psum = right_sum_s * hadd_rgb_s(color_weight);
1354
1355	vfloat4 rgbq_sum = color_vec_x + color_vec_y;
1356	rgbq_sum.set_lane<`3`>(hadd_rgb_s(color_vec_y));
1357
1358	vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
1359	rgbo_vectors[i] = rgbovec;
1360
1361	// We can get a failure due to the use of a singular (non-invertible) matrix
1362	// If it failed, compute rgbo_vectors[] with a different method ...
1363	if (astc::isnan(dot_s(rgbovec, rgbovec)))
1364	{
1365	vfloat4 v0 = ep.endpt0[i];
1366	vfloat4 v1 = ep.endpt1[i];
1367
1368	float avgdif = hadd_rgb_s(v1 - v0) * (`1.0f` / `3.0f`);
1369	avgdif = astc::max(avgdif, `0.0f`);
1370
1371	vfloat4 avg = (v0 + v1) * `0.5f`;
1372	vfloat4 ep0 = avg - vfloat4 (avgdif) * `0.5f`;
1373	rgbo_vectors[i] = vfloat4 (ep0.lane<`0`>(), ep0.lane<`1`>(), ep0.lane<`2`>(), avgdif);
1374	}
1375	}
1376	}
1377	}
1378
1379	/ See header for documentation. /
1380	void recompute_ideal_colors_2planes(
1381	const image_block& blk,
1382	const block_size_descriptor& bsd,
1383	const decimation_info& di,
1384	const uint8_t* dec_weights_uquant_plane1,
1385	const uint8_t* dec_weights_uquant_plane2,
1386	endpoints& ep,
1387	vfloat4& rgbs_vector,
1388	vfloat4& rgbo_vector,
1389	int plane2_component
1390	) {
1391	unsigned int weight_count = di.weight_count;
1392	unsigned int total_texel_count = blk.texel_count;
1393
1394	promise(total_texel_count > `0`);
1395	promise(weight_count > `0`);
1396
1397	alignas(ASTCENC_VECALIGN) float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
1398	alignas(ASTCENC_VECALIGN) float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
1399
1400	assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE);
1401
1402	for (unsigned int i = `0`; i < weight_count; i += ASTCENC_SIMD_WIDTH)
1403	{
1404	vint unquant_value1(dec_weights_uquant_plane1 + i);
1405	vfloat unquant_value1f = int_to_float(unquant_value1) * vfloat (`1.0f` / `64.0f`);
1406	storea(unquant_value1f, dec_weight_plane1 + i);
1407
1408	vint unquant_value2(dec_weights_uquant_plane2 + i);
1409	vfloat unquant_value2f = int_to_float(unquant_value2) * vfloat (`1.0f` / `64.0f`);
1410	storea(unquant_value2f, dec_weight_plane2 + i);
1411	}
1412
1413	alignas(ASTCENC_VECALIGN) float undec_weight_plane1[BLOCK_MAX_TEXELS];
1414	alignas(ASTCENC_VECALIGN) float undec_weight_plane2[BLOCK_MAX_TEXELS];
1415
1416	float* undec_weight_plane1_ref;
1417	float* undec_weight_plane2_ref;
1418
1419	if (di.max_texel_weight_count == `1`)
1420	{
1421	undec_weight_plane1_ref = dec_weight_plane1;
1422	undec_weight_plane2_ref = dec_weight_plane2;
1423	}
1424	else if (di.max_texel_weight_count <= `2`)
1425	{
1426	for (unsigned int i = `0`; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1427	{
1428	vfloat weight = bilinear_infill_vla_2(di, dec_weight_plane1, i);
1429	storea(weight, undec_weight_plane1 + i);
1430
1431	weight = bilinear_infill_vla_2(di, dec_weight_plane2, i);
1432	storea(weight, undec_weight_plane2 + i);
1433	}
1434
1435	undec_weight_plane1_ref = undec_weight_plane1;
1436	undec_weight_plane2_ref = undec_weight_plane2;
1437	}
1438	else
1439	{
1440	for (unsigned int i = `0`; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1441	{
1442	vfloat weight = bilinear_infill_vla(di, dec_weight_plane1, i);
1443	storea(weight, undec_weight_plane1 + i);
1444
1445	weight = bilinear_infill_vla(di, dec_weight_plane2, i);
1446	storea(weight, undec_weight_plane2 + i);
1447	}
1448
1449	undec_weight_plane1_ref = undec_weight_plane1;
1450	undec_weight_plane2_ref = undec_weight_plane2;
1451	}
1452
1453	unsigned int texel_count = bsd.texel_count;
1454	vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), `1e-17f`);
1455	vfloat4 scale_dir = normalize(blk.data_mean.swz<`0`, `1`, `2`>());
1456
1457	float scale_max = `0.0f`;
1458	float scale_min = `1e10f`;
1459
1460	float wmin1 = `1.0f`;
1461	float wmax1 = `0.0f`;
1462
1463	float wmin2 = `1.0f`;
1464	float wmax2 = `0.0f`;
1465
1466	float left1_sum_s = `0.0f`;
1467	float middle1_sum_s = `0.0f`;
1468	float right1_sum_s = `0.0f`;
1469
1470	float left2_sum_s = `0.0f`;
1471	float middle2_sum_s = `0.0f`;
1472	float right2_sum_s = `0.0f`;
1473
1474	vfloat4 color_vec_x = vfloat4::zero();
1475	vfloat4 color_vec_y = vfloat4::zero();
1476
1477	vfloat4 scale_vec = vfloat4::zero();
1478
1479	vfloat4 weight_weight_sum = vfloat4 (`1e-17f`);
1480
1481	vmask4 p2_mask = vint4::lane_id() == vint4 (plane2_component);
1482	vfloat4 color_weight = blk.channel_weight;
1483	float ls_weight = hadd_rgb_s(color_weight);
1484
1485	for (unsigned int j = `0`; j < texel_count; j++)
1486	{
1487	vfloat4 rgba = blk.texel(j);
1488
1489	float idx0 = undec_weight_plane1_ref[j];
1490
1491	float om_idx0 = `1.0f` - idx0;
1492	wmin1 = astc::min(idx0, wmin1);
1493	wmax1 = astc::max(idx0, wmax1);
1494
1495	float scale = dot3_s(scale_dir, rgba);
1496	scale_min = astc::min(scale, scale_min);
1497	scale_max = astc::max(scale, scale_max);
1498
1499	left1_sum_s += om_idx0 * om_idx0;
1500	middle1_sum_s += om_idx0 * idx0;
1501	right1_sum_s += idx0 * idx0;
1502
1503	float idx1 = undec_weight_plane2_ref[j];
1504
1505	float om_idx1 = `1.0f` - idx1;
1506	wmin2 = astc::min(idx1, wmin2);
1507	wmax2 = astc::max(idx1, wmax2);
1508
1509	left2_sum_s += om_idx1 * om_idx1;
1510	middle2_sum_s += om_idx1 * idx1;
1511	right2_sum_s += idx1 * idx1;
1512
1513	vfloat4 color_idx = select(vfloat4 (idx0), vfloat4 (idx1), p2_mask);
1514
1515	vfloat4 cwprod = rgba;
1516	vfloat4 cwiprod = cwprod * color_idx;
1517
1518	color_vec_y += cwiprod;
1519	color_vec_x += cwprod - cwiprod;
1520
1521	scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale);
1522	weight_weight_sum += color_idx;
1523	}
1524
1525	vfloat4 left1_sum = vfloat4 (left1_sum_s) * color_weight;
1526	vfloat4 middle1_sum = vfloat4 (middle1_sum_s) * color_weight;
1527	vfloat4 right1_sum = vfloat4 (right1_sum_s) * color_weight;
1528	vfloat4 lmrs_sum = vfloat3(left1_sum_s, middle1_sum_s, right1_sum_s) * ls_weight;
1529
1530	vfloat4 left2_sum = vfloat4 (left2_sum_s) * color_weight;
1531	vfloat4 middle2_sum = vfloat4 (middle2_sum_s) * color_weight;
1532	vfloat4 right2_sum = vfloat4 (right2_sum_s) * color_weight;
1533
1534	color_vec_x = color_vec_x * color_weight;
1535	color_vec_y = color_vec_y * color_weight;
1536
1537	// Initialize the luminance and scale vectors with a reasonable default
1538	float scalediv = scale_min / astc::max(scale_max, `1e-10f`);
1539	scalediv = astc::clamp1f(scalediv);
1540
1541	vfloat4 sds = scale_dir * scale_max;
1542
1543	rgbs_vector = vfloat4 (sds.lane<`0`>(), sds.lane<`1`>(), sds.lane<`2`>(), scalediv);
1544
1545	if (wmin1 >= wmax1 * `0.999f`)
1546	{
1547	// If all weights in the partition were equal, then just take average of all colors in
1548	// the partition and use that as both endpoint colors
1549	vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1550
1551	vmask4 p1_mask = vint4::lane_id() != vint4 (plane2_component);
1552	vmask4 notnan_mask = avg == avg;
1553	vmask4 full_mask = p1_mask & notnan_mask;
1554
1555	ep.endpt0[`0`] = select(ep.endpt0[`0`], avg, full_mask);
1556	ep.endpt1[`0`] = select(ep.endpt1[`0`], avg, full_mask);
1557
1558	rgbs_vector = vfloat4 (sds.lane<`0`>(), sds.lane<`1`>(), sds.lane<`2`>(), `1.0f`);
1559	}
1560	else
1561	{
1562	// Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1563	// set of texel weights and pixel colors
1564	vfloat4 color_det1 = (left1_sum * right1_sum) - (middle1_sum * middle1_sum);
1565	vfloat4 color_rdet1 = `1.0f` / color_det1;
1566
1567	float ls_det1 = (lmrs_sum.lane<`0`>() * lmrs_sum.lane<`2`>()) - (lmrs_sum.lane<`1`>() * lmrs_sum.lane<`1`>());
1568	float ls_rdet1 = `1.0f` / ls_det1;
1569
1570	vfloat4 color_mss1 = (left1_sum * left1_sum)
1571	+ (`2.0f` * middle1_sum * middle1_sum)
1572	+ (right1_sum * right1_sum);
1573
1574	float ls_mss1 = (lmrs_sum.lane<`0`>() * lmrs_sum.lane<`0`>())
1575	+ (`2.0f` * lmrs_sum.lane<`1`>() * lmrs_sum.lane<`1`>())
1576	+ (lmrs_sum.lane<`2`>() * lmrs_sum.lane<`2`>());
1577
1578	vfloat4 ep0 = (right1_sum * color_vec_x - middle1_sum * color_vec_y) * color_rdet1;
1579	vfloat4 ep1 = (left1_sum * color_vec_y - middle1_sum * color_vec_x) * color_rdet1;
1580
1581	float scale_ep0 = (lmrs_sum.lane<`2`>() * scale_vec.lane<`0`>() - lmrs_sum.lane<`1`>() * scale_vec.lane<`1`>()) * ls_rdet1;
1582	float scale_ep1 = (lmrs_sum.lane<`0`>() * scale_vec.lane<`1`>() - lmrs_sum.lane<`1`>() * scale_vec.lane<`0`>()) * ls_rdet1;
1583
1584	vmask4 p1_mask = vint4::lane_id() != vint4 (plane2_component);
1585	vmask4 det_mask = abs(color_det1) > (color_mss1 * `1e-4f`);
1586	vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1587	vmask4 full_mask = p1_mask & det_mask & notnan_mask;
1588
1589	ep.endpt0[`0`] = select(ep.endpt0[`0`], ep0, full_mask);
1590	ep.endpt1[`0`] = select(ep.endpt1[`0`], ep1, full_mask);
1591
1592	if (fabsf(ls_det1) > (ls_mss1 * `1e-4f`) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
1593	{
1594	float scalediv2 = scale_ep0 / scale_ep1;
1595	vfloat4 sdsm = scale_dir * scale_ep1;
1596	rgbs_vector = vfloat4 (sdsm.lane<`0`>(), sdsm.lane<`1`>(), sdsm.lane<`2`>(), scalediv2);
1597	}
1598	}
1599
1600	if (wmin2 >= wmax2 * `0.999f`)
1601	{
1602	// If all weights in the partition were equal, then just take average of all colors in
1603	// the partition and use that as both endpoint colors
1604	vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1605
1606	vmask4 notnan_mask = avg == avg;
1607	vmask4 full_mask = p2_mask & notnan_mask;
1608
1609	ep.endpt0[`0`] = select(ep.endpt0[`0`], avg, full_mask);
1610	ep.endpt1[`0`] = select(ep.endpt1[`0`], avg, full_mask);
1611	}
1612	else
1613	{
1614	// Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1615	// set of texel weights and pixel colors
1616	vfloat4 color_det2 = (left2_sum * right2_sum) - (middle2_sum * middle2_sum);
1617	vfloat4 color_rdet2 = `1.0f` / color_det2;
1618
1619	vfloat4 color_mss2 = (left2_sum * left2_sum)
1620	+ (`2.0f` * middle2_sum * middle2_sum)
1621	+ (right2_sum * right2_sum);
1622
1623	vfloat4 ep0 = (right2_sum * color_vec_x - middle2_sum * color_vec_y) * color_rdet2;
1624	vfloat4 ep1 = (left2_sum * color_vec_y - middle2_sum * color_vec_x) * color_rdet2;
1625
1626	vmask4 det_mask = abs(color_det2) > (color_mss2 * `1e-4f`);
1627	vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1628	vmask4 full_mask = p2_mask & det_mask & notnan_mask;
1629
1630	ep.endpt0[`0`] = select(ep.endpt0[`0`], ep0, full_mask);
1631	ep.endpt1[`0`] = select(ep.endpt1[`0`], ep1, full_mask);
1632	}
1633
1634	// Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
1635	if (blk.rgb_lns[`0`] \|\| blk.alpha_lns[`0`])
1636	{
1637	weight_weight_sum = weight_weight_sum * color_weight;
1638	float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight);
1639
1640	vfloat4 rgbq_sum = color_vec_x + color_vec_y;
1641	rgbq_sum.set_lane<`3`>(hadd_rgb_s(color_vec_y));
1642
1643	rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
1644
1645	// We can get a failure due to the use of a singular (non-invertible) matrix
1646	// If it failed, compute rgbo_vectors[] with a different method ...
1647	if (astc::isnan(dot_s(rgbo_vector, rgbo_vector)))
1648	{
1649	vfloat4 v0 = ep.endpt0[`0`];
1650	vfloat4 v1 = ep.endpt1[`0`];
1651
1652	float avgdif = hadd_rgb_s(v1 - v0) * (`1.0f` / `3.0f`);
1653	avgdif = astc::max(avgdif, `0.0f`);
1654
1655	vfloat4 avg = (v0 + v1) * `0.5f`;
1656	vfloat4 ep0 = avg - vfloat4 (avgdif) * `0.5f`;
1657
1658	rgbo_vector = vfloat4 (ep0.lane<`0`>(), ep0.lane<`1`>(), ep0.lane<`2`>(), avgdif);
1659	}
1660	}
1661	}
1662
1663	#endif
1664

Browse the source code of Godot/thirdparty/astcenc/astcenc_ideal_endpoints_and_weights.cpp