astcenc_averages_and_directions.cpp source code [Godot/thirdparty/astcenc/astcenc_averages_and_directions.cpp]

1	// SPDX-License-Identifier: Apache-2.0
2	// ----------------------------------------------------------------------------
3	// Copyright 2011-2023 Arm Limited
4	//
5	// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6	// use this file except in compliance with the License. You may obtain a copy
7	// of the License at:
8	//
9	// http://www.apache.org/licenses/LICENSE-2.0
10	//
11	// Unless required by applicable law or agreed to in writing, software
12	// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13	// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14	// License for the specific language governing permissions and limitations
15	// under the License.
16	// ----------------------------------------------------------------------------
17
18	/**
19	* @brief Functions for finding dominant direction of a set of colors.
20	*/
21	#if !defined(ASTCENC_DECOMPRESS_ONLY)
22
23	#include "astcenc_internal.h"
24
25	#include <cassert>
26
27	/**
28	* @brief Compute the average RGB color of each partition.
29	*
30	* The algorithm here uses a vectorized sequential scan and per-partition
31	* color accumulators, using select() to mask texel lanes in other partitions.
32	*
33	* We only accumulate sums for N-1 partitions during the scan; the value for
34	* the last partition can be computed given that we know the block-wide average
35	* already.
36	*
37	* Because of this we could reduce the loop iteration count so it "just" spans
38	* the max texel index needed for the N-1 partitions, which could need fewer
39	* iterations than the full block texel count. However, this makes the loop
40	* count erratic and causes more branch mispredictions so is a net loss.
41	*
42	* @param pi The partitioning to use.
43	* @param blk The block data to process.
44	* @param[out] averages The output averages. Unused partition indices will
45	* not be initialized, and lane<3> will be zero.
46	*/
47	static void compute_partition_averages_rgb(
48	const partition_info& pi,
49	const image_block& blk,
50	vfloat4 averages[BLOCK_MAX_PARTITIONS]
51	) {
52	unsigned int partition_count = pi.partition_count;
53	unsigned int texel_count = blk.texel_count;
54	promise(texel_count > `0`);
55
56	// For 1 partition just use the precomputed mean
57	if (partition_count == `1`)
58	{
59	averages[`0`] = blk.data_mean.swz<`0`, `1`, `2`>();
60	}
61	// For 2 partitions scan results for partition 0, compute partition 1
62	else if (partition_count == `2`)
63	{
64	vfloatacc pp_avg_rgb[`3`] {};
65
66	vint lane_id = vint::lane_id();
67	for (unsigned int i = `0`; i < texel_count; i += ASTCENC_SIMD_WIDTH)
68	{
69	vint texel_partition(pi.partition_of_texel + i);
70
71	vmask lane_mask = lane_id < vint (texel_count);
72	lane_id += vint (ASTCENC_SIMD_WIDTH);
73
74	vmask p0_mask = lane_mask & (texel_partition == vint (`0`));
75
76	vfloat data_r = loada(blk.data_r + i);
77	haccumulate(pp_avg_rgb[`0`], data_r, p0_mask);
78
79	vfloat data_g = loada(blk.data_g + i);
80	haccumulate(pp_avg_rgb[`1`], data_g, p0_mask);
81
82	vfloat data_b = loada(blk.data_b + i);
83	haccumulate(pp_avg_rgb[`2`], data_b, p0_mask);
84	}
85
86	vfloat4 block_total = blk.data_mean.swz<`0`, `1`, `2`>() * static_cast<float>(blk.texel_count);
87
88	vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[`0`]),
89	hadd_s(pp_avg_rgb[`1`]),
90	hadd_s(pp_avg_rgb[`2`]));
91
92	vfloat4 p1_total = block_total - p0_total;
93
94	averages[`0`] = p0_total / static_cast<float>(pi.partition_texel_count[`0`]);
95	averages[`1`] = p1_total / static_cast<float>(pi.partition_texel_count[`1`]);
96	}
97	// For 3 partitions scan results for partition 0/1, compute partition 2
98	else if (partition_count == `3`)
99	{
100	vfloatacc pp_avg_rgb[`2`][`3`] {};
101
102	vint lane_id = vint::lane_id();
103	for (unsigned int i = `0`; i < texel_count; i += ASTCENC_SIMD_WIDTH)
104	{
105	vint texel_partition(pi.partition_of_texel + i);
106
107	vmask lane_mask = lane_id < vint (texel_count);
108	lane_id += vint (ASTCENC_SIMD_WIDTH);
109
110	vmask p0_mask = lane_mask & (texel_partition == vint (`0`));
111	vmask p1_mask = lane_mask & (texel_partition == vint (`1`));
112
113	vfloat data_r = loada(blk.data_r + i);
114	haccumulate(pp_avg_rgb[`0`][`0`], data_r, p0_mask);
115	haccumulate(pp_avg_rgb[`1`][`0`], data_r, p1_mask);
116
117	vfloat data_g = loada(blk.data_g + i);
118	haccumulate(pp_avg_rgb[`0`][`1`], data_g, p0_mask);
119	haccumulate(pp_avg_rgb[`1`][`1`], data_g, p1_mask);
120
121	vfloat data_b = loada(blk.data_b + i);
122	haccumulate(pp_avg_rgb[`0`][`2`], data_b, p0_mask);
123	haccumulate(pp_avg_rgb[`1`][`2`], data_b, p1_mask);
124	}
125
126	vfloat4 block_total = blk.data_mean.swz<`0`, `1`, `2`>() * static_cast<float>(blk.texel_count);
127
128	vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[`0`][`0`]),
129	hadd_s(pp_avg_rgb[`0`][`1`]),
130	hadd_s(pp_avg_rgb[`0`][`2`]));
131
132	vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[`1`][`0`]),
133	hadd_s(pp_avg_rgb[`1`][`1`]),
134	hadd_s(pp_avg_rgb[`1`][`2`]));
135
136	vfloat4 p2_total = block_total - p0_total - p1_total;
137
138	averages[`0`] = p0_total / static_cast<float>(pi.partition_texel_count[`0`]);
139	averages[`1`] = p1_total / static_cast<float>(pi.partition_texel_count[`1`]);
140	averages[`2`] = p2_total / static_cast<float>(pi.partition_texel_count[`2`]);
141	}
142	else
143	{
144	// For 4 partitions scan results for partition 0/1/2, compute partition 3
145	vfloatacc pp_avg_rgb[`3`][`3`] {};
146
147	vint lane_id = vint::lane_id();
148	for (unsigned int i = `0`; i < texel_count; i += ASTCENC_SIMD_WIDTH)
149	{
150	vint texel_partition(pi.partition_of_texel + i);
151
152	vmask lane_mask = lane_id < vint (texel_count);
153	lane_id += vint (ASTCENC_SIMD_WIDTH);
154
155	vmask p0_mask = lane_mask & (texel_partition == vint (`0`));
156	vmask p1_mask = lane_mask & (texel_partition == vint (`1`));
157	vmask p2_mask = lane_mask & (texel_partition == vint (`2`));
158
159	vfloat data_r = loada(blk.data_r + i);
160	haccumulate(pp_avg_rgb[`0`][`0`], data_r, p0_mask);
161	haccumulate(pp_avg_rgb[`1`][`0`], data_r, p1_mask);
162	haccumulate(pp_avg_rgb[`2`][`0`], data_r, p2_mask);
163
164	vfloat data_g = loada(blk.data_g + i);
165	haccumulate(pp_avg_rgb[`0`][`1`], data_g, p0_mask);
166	haccumulate(pp_avg_rgb[`1`][`1`], data_g, p1_mask);
167	haccumulate(pp_avg_rgb[`2`][`1`], data_g, p2_mask);
168
169	vfloat data_b = loada(blk.data_b + i);
170	haccumulate(pp_avg_rgb[`0`][`2`], data_b, p0_mask);
171	haccumulate(pp_avg_rgb[`1`][`2`], data_b, p1_mask);
172	haccumulate(pp_avg_rgb[`2`][`2`], data_b, p2_mask);
173	}
174
175	vfloat4 block_total = blk.data_mean.swz<`0`, `1`, `2`>() * static_cast<float>(blk.texel_count);
176
177	vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[`0`][`0`]),
178	hadd_s(pp_avg_rgb[`0`][`1`]),
179	hadd_s(pp_avg_rgb[`0`][`2`]));
180
181	vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[`1`][`0`]),
182	hadd_s(pp_avg_rgb[`1`][`1`]),
183	hadd_s(pp_avg_rgb[`1`][`2`]));
184
185	vfloat4 p2_total = vfloat3(hadd_s(pp_avg_rgb[`2`][`0`]),
186	hadd_s(pp_avg_rgb[`2`][`1`]),
187	hadd_s(pp_avg_rgb[`2`][`2`]));
188
189	vfloat4 p3_total = block_total - p0_total - p1_total - p2_total;
190
191	averages[`0`] = p0_total / static_cast<float>(pi.partition_texel_count[`0`]);
192	averages[`1`] = p1_total / static_cast<float>(pi.partition_texel_count[`1`]);
193	averages[`2`] = p2_total / static_cast<float>(pi.partition_texel_count[`2`]);
194	averages[`3`] = p3_total / static_cast<float>(pi.partition_texel_count[`3`]);
195	}
196	}
197
198	/**
199	* @brief Compute the average RGBA color of each partition.
200	*
201	* The algorithm here uses a vectorized sequential scan and per-partition
202	* color accumulators, using select() to mask texel lanes in other partitions.
203	*
204	* We only accumulate sums for N-1 partitions during the scan; the value for
205	* the last partition can be computed given that we know the block-wide average
206	* already.
207	*
208	* Because of this we could reduce the loop iteration count so it "just" spans
209	* the max texel index needed for the N-1 partitions, which could need fewer
210	* iterations than the full block texel count. However, this makes the loop
211	* count erratic and causes more branch mispredictions so is a net loss.
212	*
213	* @param pi The partitioning to use.
214	* @param blk The block data to process.
215	* @param[out] averages The output averages. Unused partition indices will
216	* not be initialized.
217	*/
218	static void compute_partition_averages_rgba(
219	const partition_info& pi,
220	const image_block& blk,
221	vfloat4 averages[BLOCK_MAX_PARTITIONS]
222	) {
223	unsigned int partition_count = pi.partition_count;
224	unsigned int texel_count = blk.texel_count;
225	promise(texel_count > `0`);
226
227	// For 1 partition just use the precomputed mean
228	if (partition_count == `1`)
229	{
230	averages[`0`] = blk.data_mean;
231	}
232	// For 2 partitions scan results for partition 0, compute partition 1
233	else if (partition_count == `2`)
234	{
235	vfloat4 pp_avg_rgba[`4`] {};
236
237	vint lane_id = vint::lane_id();
238	for (unsigned int i = `0`; i < texel_count; i += ASTCENC_SIMD_WIDTH)
239	{
240	vint texel_partition(pi.partition_of_texel + i);
241
242	vmask lane_mask = lane_id < vint (texel_count);
243	lane_id += vint (ASTCENC_SIMD_WIDTH);
244
245	vmask p0_mask = lane_mask & (texel_partition == vint (`0`));
246
247	vfloat data_r = loada(blk.data_r + i);
248	haccumulate(pp_avg_rgba[`0`], data_r, p0_mask);
249
250	vfloat data_g = loada(blk.data_g + i);
251	haccumulate(pp_avg_rgba[`1`], data_g, p0_mask);
252
253	vfloat data_b = loada(blk.data_b + i);
254	haccumulate(pp_avg_rgba[`2`], data_b, p0_mask);
255
256	vfloat data_a = loada(blk.data_a + i);
257	haccumulate(pp_avg_rgba[`3`], data_a, p0_mask);
258	}
259
260	vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
261
262	vfloat4 p0_total = vfloat4 (hadd_s(pp_avg_rgba[`0`]),
263	hadd_s(pp_avg_rgba[`1`]),
264	hadd_s(pp_avg_rgba[`2`]),
265	hadd_s(pp_avg_rgba[`3`]));
266
267	vfloat4 p1_total = block_total - p0_total;
268
269	averages[`0`] = p0_total / static_cast<float>(pi.partition_texel_count[`0`]);
270	averages[`1`] = p1_total / static_cast<float>(pi.partition_texel_count[`1`]);
271	}
272	// For 3 partitions scan results for partition 0/1, compute partition 2
273	else if (partition_count == `3`)
274	{
275	vfloat4 pp_avg_rgba[`2`][`4`] {};
276
277	vint lane_id = vint::lane_id();
278	for (unsigned int i = `0`; i < texel_count; i += ASTCENC_SIMD_WIDTH)
279	{
280	vint texel_partition(pi.partition_of_texel + i);
281
282	vmask lane_mask = lane_id < vint (texel_count);
283	lane_id += vint (ASTCENC_SIMD_WIDTH);
284
285	vmask p0_mask = lane_mask & (texel_partition == vint (`0`));
286	vmask p1_mask = lane_mask & (texel_partition == vint (`1`));
287
288	vfloat data_r = loada(blk.data_r + i);
289	haccumulate(pp_avg_rgba[`0`][`0`], data_r, p0_mask);
290	haccumulate(pp_avg_rgba[`1`][`0`], data_r, p1_mask);
291
292	vfloat data_g = loada(blk.data_g + i);
293	haccumulate(pp_avg_rgba[`0`][`1`], data_g, p0_mask);
294	haccumulate(pp_avg_rgba[`1`][`1`], data_g, p1_mask);
295
296	vfloat data_b = loada(blk.data_b + i);
297	haccumulate(pp_avg_rgba[`0`][`2`], data_b, p0_mask);
298	haccumulate(pp_avg_rgba[`1`][`2`], data_b, p1_mask);
299
300	vfloat data_a = loada(blk.data_a + i);
301	haccumulate(pp_avg_rgba[`0`][`3`], data_a, p0_mask);
302	haccumulate(pp_avg_rgba[`1`][`3`], data_a, p1_mask);
303	}
304
305	vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
306
307	vfloat4 p0_total = vfloat4 (hadd_s(pp_avg_rgba[`0`][`0`]),
308	hadd_s(pp_avg_rgba[`0`][`1`]),
309	hadd_s(pp_avg_rgba[`0`][`2`]),
310	hadd_s(pp_avg_rgba[`0`][`3`]));
311
312	vfloat4 p1_total = vfloat4 (hadd_s(pp_avg_rgba[`1`][`0`]),
313	hadd_s(pp_avg_rgba[`1`][`1`]),
314	hadd_s(pp_avg_rgba[`1`][`2`]),
315	hadd_s(pp_avg_rgba[`1`][`3`]));
316
317	vfloat4 p2_total = block_total - p0_total - p1_total;
318
319	averages[`0`] = p0_total / static_cast<float>(pi.partition_texel_count[`0`]);
320	averages[`1`] = p1_total / static_cast<float>(pi.partition_texel_count[`1`]);
321	averages[`2`] = p2_total / static_cast<float>(pi.partition_texel_count[`2`]);
322	}
323	else
324	{
325	// For 4 partitions scan results for partition 0/1/2, compute partition 3
326	vfloat4 pp_avg_rgba[`3`][`4`] {};
327
328	vint lane_id = vint::lane_id();
329	for (unsigned int i = `0`; i < texel_count; i += ASTCENC_SIMD_WIDTH)
330	{
331	vint texel_partition(pi.partition_of_texel + i);
332
333	vmask lane_mask = lane_id < vint (texel_count);
334	lane_id += vint (ASTCENC_SIMD_WIDTH);
335
336	vmask p0_mask = lane_mask & (texel_partition == vint (`0`));
337	vmask p1_mask = lane_mask & (texel_partition == vint (`1`));
338	vmask p2_mask = lane_mask & (texel_partition == vint (`2`));
339
340	vfloat data_r = loada(blk.data_r + i);
341	haccumulate(pp_avg_rgba[`0`][`0`], data_r, p0_mask);
342	haccumulate(pp_avg_rgba[`1`][`0`], data_r, p1_mask);
343	haccumulate(pp_avg_rgba[`2`][`0`], data_r, p2_mask);
344
345	vfloat data_g = loada(blk.data_g + i);
346	haccumulate(pp_avg_rgba[`0`][`1`], data_g, p0_mask);
347	haccumulate(pp_avg_rgba[`1`][`1`], data_g, p1_mask);
348	haccumulate(pp_avg_rgba[`2`][`1`], data_g, p2_mask);
349
350	vfloat data_b = loada(blk.data_b + i);
351	haccumulate(pp_avg_rgba[`0`][`2`], data_b, p0_mask);
352	haccumulate(pp_avg_rgba[`1`][`2`], data_b, p1_mask);
353	haccumulate(pp_avg_rgba[`2`][`2`], data_b, p2_mask);
354
355	vfloat data_a = loada(blk.data_a + i);
356	haccumulate(pp_avg_rgba[`0`][`3`], data_a, p0_mask);
357	haccumulate(pp_avg_rgba[`1`][`3`], data_a, p1_mask);
358	haccumulate(pp_avg_rgba[`2`][`3`], data_a, p2_mask);
359	}
360
361	vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
362
363	vfloat4 p0_total = vfloat4 (hadd_s(pp_avg_rgba[`0`][`0`]),
364	hadd_s(pp_avg_rgba[`0`][`1`]),
365	hadd_s(pp_avg_rgba[`0`][`2`]),
366	hadd_s(pp_avg_rgba[`0`][`3`]));
367
368	vfloat4 p1_total = vfloat4 (hadd_s(pp_avg_rgba[`1`][`0`]),
369	hadd_s(pp_avg_rgba[`1`][`1`]),
370	hadd_s(pp_avg_rgba[`1`][`2`]),
371	hadd_s(pp_avg_rgba[`1`][`3`]));
372
373	vfloat4 p2_total = vfloat4 (hadd_s(pp_avg_rgba[`2`][`0`]),
374	hadd_s(pp_avg_rgba[`2`][`1`]),
375	hadd_s(pp_avg_rgba[`2`][`2`]),
376	hadd_s(pp_avg_rgba[`2`][`3`]));
377
378	vfloat4 p3_total = block_total - p0_total - p1_total - p2_total;
379
380	averages[`0`] = p0_total / static_cast<float>(pi.partition_texel_count[`0`]);
381	averages[`1`] = p1_total / static_cast<float>(pi.partition_texel_count[`1`]);
382	averages[`2`] = p2_total / static_cast<float>(pi.partition_texel_count[`2`]);
383	averages[`3`] = p3_total / static_cast<float>(pi.partition_texel_count[`3`]);
384	}
385	}
386
387	/ See header for documentation. /
388	void compute_avgs_and_dirs_4_comp(
389	const partition_info& pi,
390	const image_block& blk,
391	partition_metrics pm[BLOCK_MAX_PARTITIONS]
392	) {
393	int partition_count = pi.partition_count;
394	promise(partition_count > `0`);
395
396	// Pre-compute partition_averages
397	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
398	compute_partition_averages_rgba(pi, blk, partition_averages);
399
400	for (int partition = `0`; partition < partition_count; partition++)
401	{
402	const uint8_t *texel_indexes = pi.texels_of_partition[partition];
403	unsigned int texel_count = pi.partition_texel_count[partition];
404	promise(texel_count > `0`);
405
406	vfloat4 average = partition_averages[partition];
407	pm[partition].avg = average;
408
409	vfloat4 sum_xp = vfloat4::zero();
410	vfloat4 sum_yp = vfloat4::zero();
411	vfloat4 sum_zp = vfloat4::zero();
412	vfloat4 sum_wp = vfloat4::zero();
413
414	for (unsigned int i = `0`; i < texel_count; i++)
415	{
416	unsigned int iwt = texel_indexes[i];
417	vfloat4 texel_datum = blk.texel(iwt);
418	texel_datum = texel_datum - average;
419
420	vfloat4 zero = vfloat4::zero();
421
422	vmask4 tdm0 = texel_datum.swz<`0`,`0`,`0`,`0`>() > zero;
423	sum_xp += select(zero, texel_datum, tdm0);
424
425	vmask4 tdm1 = texel_datum.swz<`1`,`1`,`1`,`1`>() > zero;
426	sum_yp += select(zero, texel_datum, tdm1);
427
428	vmask4 tdm2 = texel_datum.swz<`2`,`2`,`2`,`2`>() > zero;
429	sum_zp += select(zero, texel_datum, tdm2);
430
431	vmask4 tdm3 = texel_datum.swz<`3`,`3`,`3`,`3`>() > zero;
432	sum_wp += select(zero, texel_datum, tdm3);
433	}
434
435	vfloat4 prod_xp = dot(sum_xp, sum_xp);
436	vfloat4 prod_yp = dot(sum_yp, sum_yp);
437	vfloat4 prod_zp = dot(sum_zp, sum_zp);
438	vfloat4 prod_wp = dot(sum_wp, sum_wp);
439
440	vfloat4 best_vector = sum_xp;
441	vfloat4 best_sum = prod_xp;
442
443	vmask4 mask = prod_yp > best_sum;
444	best_vector = select(best_vector, sum_yp, mask);
445	best_sum = select(best_sum, prod_yp, mask);
446
447	mask = prod_zp > best_sum;
448	best_vector = select(best_vector, sum_zp, mask);
449	best_sum = select(best_sum, prod_zp, mask);
450
451	mask = prod_wp > best_sum;
452	best_vector = select(best_vector, sum_wp, mask);
453
454	pm[partition].dir = best_vector;
455	}
456	}
457
458	/ See header for documentation. /
459	void compute_avgs_and_dirs_3_comp(
460	const partition_info& pi,
461	const image_block& blk,
462	unsigned int omitted_component,
463	partition_metrics pm[BLOCK_MAX_PARTITIONS]
464	) {
465	// Pre-compute partition_averages
466	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
467	compute_partition_averages_rgba(pi, blk, partition_averages);
468
469	const float* data_vr = blk.data_r;
470	const float* data_vg = blk.data_g;
471	const float* data_vb = blk.data_b;
472
473	// TODO: Data-driven permute would be useful to avoid this ...
474	if (omitted_component == `0`)
475	{
476	partition_averages[`0`] = partition_averages[`0`].swz<`1`, `2`, `3`>();
477	partition_averages[`1`] = partition_averages[`1`].swz<`1`, `2`, `3`>();
478	partition_averages[`2`] = partition_averages[`2`].swz<`1`, `2`, `3`>();
479	partition_averages[`3`] = partition_averages[`3`].swz<`1`, `2`, `3`>();
480
481	data_vr = blk.data_g;
482	data_vg = blk.data_b;
483	data_vb = blk.data_a;
484	}
485	else if (omitted_component == `1`)
486	{
487	partition_averages[`0`] = partition_averages[`0`].swz<`0`, `2`, `3`>();
488	partition_averages[`1`] = partition_averages[`1`].swz<`0`, `2`, `3`>();
489	partition_averages[`2`] = partition_averages[`2`].swz<`0`, `2`, `3`>();
490	partition_averages[`3`] = partition_averages[`3`].swz<`0`, `2`, `3`>();
491
492	data_vg = blk.data_b;
493	data_vb = blk.data_a;
494	}
495	else if (omitted_component == `2`)
496	{
497	partition_averages[`0`] = partition_averages[`0`].swz<`0`, `1`, `3`>();
498	partition_averages[`1`] = partition_averages[`1`].swz<`0`, `1`, `3`>();
499	partition_averages[`2`] = partition_averages[`2`].swz<`0`, `1`, `3`>();
500	partition_averages[`3`] = partition_averages[`3`].swz<`0`, `1`, `3`>();
501
502	data_vb = blk.data_a;
503	}
504	else
505	{
506	partition_averages[`0`] = partition_averages[`0`].swz<`0`, `1`, `2`>();
507	partition_averages[`1`] = partition_averages[`1`].swz<`0`, `1`, `2`>();
508	partition_averages[`2`] = partition_averages[`2`].swz<`0`, `1`, `2`>();
509	partition_averages[`3`] = partition_averages[`3`].swz<`0`, `1`, `2`>();
510	}
511
512	unsigned int partition_count = pi.partition_count;
513	promise(partition_count > `0`);
514
515	for (unsigned int partition = `0`; partition < partition_count; partition++)
516	{
517	const uint8_t *texel_indexes = pi.texels_of_partition[partition];
518	unsigned int texel_count = pi.partition_texel_count[partition];
519	promise(texel_count > `0`);
520
521	vfloat4 average = partition_averages[partition];
522	pm[partition].avg = average;
523
524	vfloat4 sum_xp = vfloat4::zero();
525	vfloat4 sum_yp = vfloat4::zero();
526	vfloat4 sum_zp = vfloat4::zero();
527
528	for (unsigned int i = `0`; i < texel_count; i++)
529	{
530	unsigned int iwt = texel_indexes[i];
531
532	vfloat4 texel_datum = vfloat3(data_vr[iwt],
533	data_vg[iwt],
534	data_vb[iwt]);
535	texel_datum = texel_datum - average;
536
537	vfloat4 zero = vfloat4::zero();
538
539	vmask4 tdm0 = texel_datum.swz<`0`,`0`,`0`,`0`>() > zero;
540	sum_xp += select(zero, texel_datum, tdm0);
541
542	vmask4 tdm1 = texel_datum.swz<`1`,`1`,`1`,`1`>() > zero;
543	sum_yp += select(zero, texel_datum, tdm1);
544
545	vmask4 tdm2 = texel_datum.swz<`2`,`2`,`2`,`2`>() > zero;
546	sum_zp += select(zero, texel_datum, tdm2);
547	}
548
549	vfloat4 prod_xp = dot(sum_xp, sum_xp);
550	vfloat4 prod_yp = dot(sum_yp, sum_yp);
551	vfloat4 prod_zp = dot(sum_zp, sum_zp);
552
553	vfloat4 best_vector = sum_xp;
554	vfloat4 best_sum = prod_xp;
555
556	vmask4 mask = prod_yp > best_sum;
557	best_vector = select(best_vector, sum_yp, mask);
558	best_sum = select(best_sum, prod_yp, mask);
559
560	mask = prod_zp > best_sum;
561	best_vector = select(best_vector, sum_zp, mask);
562
563	pm[partition].dir = best_vector;
564	}
565	}
566
567	/ See header for documentation. /
568	void compute_avgs_and_dirs_3_comp_rgb(
569	const partition_info& pi,
570	const image_block& blk,
571	partition_metrics pm[BLOCK_MAX_PARTITIONS]
572	) {
573	unsigned int partition_count = pi.partition_count;
574	promise(partition_count > `0`);
575
576	// Pre-compute partition_averages
577	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
578	compute_partition_averages_rgb(pi, blk, partition_averages);
579
580	for (unsigned int partition = `0`; partition < partition_count; partition++)
581	{
582	const uint8_t *texel_indexes = pi.texels_of_partition[partition];
583	unsigned int texel_count = pi.partition_texel_count[partition];
584	promise(texel_count > `0`);
585
586	vfloat4 average = partition_averages[partition];
587	pm[partition].avg = average;
588
589	vfloat4 sum_xp = vfloat4::zero();
590	vfloat4 sum_yp = vfloat4::zero();
591	vfloat4 sum_zp = vfloat4::zero();
592
593	for (unsigned int i = `0`; i < texel_count; i++)
594	{
595	unsigned int iwt = texel_indexes[i];
596
597	vfloat4 texel_datum = blk.texel3(iwt);
598	texel_datum = texel_datum - average;
599
600	vfloat4 zero = vfloat4::zero();
601
602	vmask4 tdm0 = texel_datum.swz<`0`,`0`,`0`,`0`>() > zero;
603	sum_xp += select(zero, texel_datum, tdm0);
604
605	vmask4 tdm1 = texel_datum.swz<`1`,`1`,`1`,`1`>() > zero;
606	sum_yp += select(zero, texel_datum, tdm1);
607
608	vmask4 tdm2 = texel_datum.swz<`2`,`2`,`2`,`2`>() > zero;
609	sum_zp += select(zero, texel_datum, tdm2);
610	}
611
612	vfloat4 prod_xp = dot(sum_xp, sum_xp);
613	vfloat4 prod_yp = dot(sum_yp, sum_yp);
614	vfloat4 prod_zp = dot(sum_zp, sum_zp);
615
616	vfloat4 best_vector = sum_xp;
617	vfloat4 best_sum = prod_xp;
618
619	vmask4 mask = prod_yp > best_sum;
620	best_vector = select(best_vector, sum_yp, mask);
621	best_sum = select(best_sum, prod_yp, mask);
622
623	mask = prod_zp > best_sum;
624	best_vector = select(best_vector, sum_zp, mask);
625
626	pm[partition].dir = best_vector;
627	}
628	}
629
630	/ See header for documentation. /
631	void compute_avgs_and_dirs_2_comp(
632	const partition_info& pt,
633	const image_block& blk,
634	unsigned int component1,
635	unsigned int component2,
636	partition_metrics pm[BLOCK_MAX_PARTITIONS]
637	) {
638	vfloat4 average;
639
640	const float* data_vr = nullptr;
641	const float* data_vg = nullptr;
642
643	if (component1 == `0` && component2 == `1`)
644	{
645	average = blk.data_mean.swz<`0`, `1`>();
646
647	data_vr = blk.data_r;
648	data_vg = blk.data_g;
649	}
650	else if (component1 == `0` && component2 == `2`)
651	{
652	average = blk.data_mean.swz<`0`, `2`>();
653
654	data_vr = blk.data_r;
655	data_vg = blk.data_b;
656	}
657	else // (component1 == 1 && component2 == 2)
658	{
659	assert(component1 == `1` && component2 == `2`);
660
661	average = blk.data_mean.swz<`1`, `2`>();
662
663	data_vr = blk.data_g;
664	data_vg = blk.data_b;
665	}
666
667	unsigned int partition_count = pt.partition_count;
668	promise(partition_count > `0`);
669
670	for (unsigned int partition = `0`; partition < partition_count; partition++)
671	{
672	const uint8_t *texel_indexes = pt.texels_of_partition[partition];
673	unsigned int texel_count = pt.partition_texel_count[partition];
674	promise(texel_count > `0`);
675
676	// Only compute a partition mean if more than one partition
677	if (partition_count > `1`)
678	{
679	average = vfloat4::zero();
680	for (unsigned int i = `0`; i < texel_count; i++)
681	{
682	unsigned int iwt = texel_indexes[i];
683	average += vfloat2(data_vr[iwt], data_vg[iwt]);
684	}
685
686	average = average / static_cast<float>(texel_count);
687	}
688
689	pm[partition].avg = average;
690
691	vfloat4 sum_xp = vfloat4::zero();
692	vfloat4 sum_yp = vfloat4::zero();
693
694	for (unsigned int i = `0`; i < texel_count; i++)
695	{
696	unsigned int iwt = texel_indexes[i];
697	vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]);
698	texel_datum = texel_datum - average;
699
700	vfloat4 zero = vfloat4::zero();
701
702	vmask4 tdm0 = texel_datum.swz<`0`,`0`,`0`,`0`>() > zero;
703	sum_xp += select(zero, texel_datum, tdm0);
704
705	vmask4 tdm1 = texel_datum.swz<`1`,`1`,`1`,`1`>() > zero;
706	sum_yp += select(zero, texel_datum, tdm1);
707	}
708
709	vfloat4 prod_xp = dot(sum_xp, sum_xp);
710	vfloat4 prod_yp = dot(sum_yp, sum_yp);
711
712	vfloat4 best_vector = sum_xp;
713	vfloat4 best_sum = prod_xp;
714
715	vmask4 mask = prod_yp > best_sum;
716	best_vector = select(best_vector, sum_yp, mask);
717
718	pm[partition].dir = best_vector;
719	}
720	}
721
722	/ See header for documentation. /
723	void compute_error_squared_rgba(
724	const partition_info& pi,
725	const image_block& blk,
726	const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
727	const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
728	float line_lengths[BLOCK_MAX_PARTITIONS],
729	float& uncor_error,
730	float& samec_error
731	) {
732	unsigned int partition_count = pi.partition_count;
733	promise(partition_count > `0`);
734
735	vfloatacc uncor_errorsumv = vfloatacc::zero();
736	vfloatacc samec_errorsumv = vfloatacc::zero();
737
738	for (unsigned int partition = `0`; partition < partition_count; partition++)
739	{
740	const uint8_t *texel_indexes = pi.texels_of_partition[partition];
741
742	processed_line4 l_uncor = uncor_plines[partition];
743	processed_line4 l_samec = samec_plines[partition];
744
745	unsigned int texel_count = pi.partition_texel_count[partition];
746	promise(texel_count > `0`);
747
748	// Vectorize some useful scalar inputs
749	vfloat l_uncor_bs0(l_uncor.bs.lane<`0`>());
750	vfloat l_uncor_bs1(l_uncor.bs.lane<`1`>());
751	vfloat l_uncor_bs2(l_uncor.bs.lane<`2`>());
752	vfloat l_uncor_bs3(l_uncor.bs.lane<`3`>());
753
754	vfloat l_uncor_amod0(l_uncor.amod.lane<`0`>());
755	vfloat l_uncor_amod1(l_uncor.amod.lane<`1`>());
756	vfloat l_uncor_amod2(l_uncor.amod.lane<`2`>());
757	vfloat l_uncor_amod3(l_uncor.amod.lane<`3`>());
758
759	vfloat l_samec_bs0(l_samec.bs.lane<`0`>());
760	vfloat l_samec_bs1(l_samec.bs.lane<`1`>());
761	vfloat l_samec_bs2(l_samec.bs.lane<`2`>());
762	vfloat l_samec_bs3(l_samec.bs.lane<`3`>());
763
764	assert(all(l_samec.amod == vfloat4(`0.0f`)));
765
766	vfloat uncor_loparamv(`1e10f`);
767	vfloat uncor_hiparamv(-`1e10f`);
768
769	vfloat ew_r(blk.channel_weight.lane<`0`>());
770	vfloat ew_g(blk.channel_weight.lane<`1`>());
771	vfloat ew_b(blk.channel_weight.lane<`2`>());
772	vfloat ew_a(blk.channel_weight.lane<`3`>());
773
774	// This implementation over-shoots, but this is safe as we initialize the texel_indexes
775	// array to extend the last value. This means min/max are not impacted, but we need to mask
776	// out the dummy values when we compute the line weighting.
777	vint lane_ids = vint::lane_id();
778	for (unsigned int i = `0`; i < texel_count; i += ASTCENC_SIMD_WIDTH)
779	{
780	vmask mask = lane_ids < vint (texel_count);
781	vint texel_idxs(texel_indexes + i);
782
783	vfloat data_r = gatherf(blk.data_r, texel_idxs);
784	vfloat data_g = gatherf(blk.data_g, texel_idxs);
785	vfloat data_b = gatherf(blk.data_b, texel_idxs);
786	vfloat data_a = gatherf(blk.data_a, texel_idxs);
787
788	vfloat uncor_param = (data_r * l_uncor_bs0)
789	+ (data_g * l_uncor_bs1)
790	+ (data_b * l_uncor_bs2)
791	+ (data_a * l_uncor_bs3);
792
793	uncor_loparamv = min(uncor_param, uncor_loparamv);
794	uncor_hiparamv = max(uncor_param, uncor_hiparamv);
795
796	vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
797	+ (uncor_param * l_uncor_bs0);
798	vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
799	+ (uncor_param * l_uncor_bs1);
800	vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
801	+ (uncor_param * l_uncor_bs2);
802	vfloat uncor_dist3 = (l_uncor_amod3 - data_a)
803	+ (uncor_param * l_uncor_bs3);
804
805	vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
806	+ (ew_g * uncor_dist1 * uncor_dist1)
807	+ (ew_b * uncor_dist2 * uncor_dist2)
808	+ (ew_a * uncor_dist3 * uncor_dist3);
809
810	haccumulate(uncor_errorsumv, uncor_err, mask);
811
812	// Process samechroma data
813	vfloat samec_param = (data_r * l_samec_bs0)
814	+ (data_g * l_samec_bs1)
815	+ (data_b * l_samec_bs2)
816	+ (data_a * l_samec_bs3);
817
818	vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
819	vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
820	vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
821	vfloat samec_dist3 = samec_param * l_samec_bs3 - data_a;
822
823	vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
824	+ (ew_g * samec_dist1 * samec_dist1)
825	+ (ew_b * samec_dist2 * samec_dist2)
826	+ (ew_a * samec_dist3 * samec_dist3);
827
828	haccumulate(samec_errorsumv, samec_err, mask);
829
830	lane_ids += vint (ASTCENC_SIMD_WIDTH);
831	}
832
833	// Turn very small numbers and NaNs into a small number
834	float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
835	line_lengths[partition] = astc::max(uncor_linelen, `1e-7f`);
836	}
837
838	uncor_error = hadd_s(uncor_errorsumv);
839	samec_error = hadd_s(samec_errorsumv);
840	}
841
842	/ See header for documentation. /
843	void compute_error_squared_rgb(
844	const partition_info& pi,
845	const image_block& blk,
846	partition_lines3 plines[BLOCK_MAX_PARTITIONS],
847	float& uncor_error,
848	float& samec_error
849	) {
850	unsigned int partition_count = pi.partition_count;
851	promise(partition_count > `0`);
852
853	vfloatacc uncor_errorsumv = vfloatacc::zero();
854	vfloatacc samec_errorsumv = vfloatacc::zero();
855
856	for (unsigned int partition = `0`; partition < partition_count; partition++)
857	{
858	partition_lines3& pl = plines[partition];
859	const uint8_t *texel_indexes = pi.texels_of_partition[partition];
860	unsigned int texel_count = pi.partition_texel_count[partition];
861	promise(texel_count > `0`);
862
863	processed_line3 l_uncor = pl.uncor_pline;
864	processed_line3 l_samec = pl.samec_pline;
865
866	// Vectorize some useful scalar inputs
867	vfloat l_uncor_bs0(l_uncor.bs.lane<`0`>());
868	vfloat l_uncor_bs1(l_uncor.bs.lane<`1`>());
869	vfloat l_uncor_bs2(l_uncor.bs.lane<`2`>());
870
871	vfloat l_uncor_amod0(l_uncor.amod.lane<`0`>());
872	vfloat l_uncor_amod1(l_uncor.amod.lane<`1`>());
873	vfloat l_uncor_amod2(l_uncor.amod.lane<`2`>());
874
875	vfloat l_samec_bs0(l_samec.bs.lane<`0`>());
876	vfloat l_samec_bs1(l_samec.bs.lane<`1`>());
877	vfloat l_samec_bs2(l_samec.bs.lane<`2`>());
878
879	assert(all(l_samec.amod == vfloat4(`0.0f`)));
880
881	vfloat uncor_loparamv(`1e10f`);
882	vfloat uncor_hiparamv(-`1e10f`);
883
884	vfloat ew_r(blk.channel_weight.lane<`0`>());
885	vfloat ew_g(blk.channel_weight.lane<`1`>());
886	vfloat ew_b(blk.channel_weight.lane<`2`>());
887
888	// This implementation over-shoots, but this is safe as we initialize the weights array
889	// to extend the last value. This means min/max are not impacted, but we need to mask
890	// out the dummy values when we compute the line weighting.
891	vint lane_ids = vint::lane_id();
892	for (unsigned int i = `0`; i < texel_count; i += ASTCENC_SIMD_WIDTH)
893	{
894	vmask mask = lane_ids < vint (texel_count);
895	vint texel_idxs(texel_indexes + i);
896
897	vfloat data_r = gatherf(blk.data_r, texel_idxs);
898	vfloat data_g = gatherf(blk.data_g, texel_idxs);
899	vfloat data_b = gatherf(blk.data_b, texel_idxs);
900
901	vfloat uncor_param = (data_r * l_uncor_bs0)
902	+ (data_g * l_uncor_bs1)
903	+ (data_b * l_uncor_bs2);
904
905	uncor_loparamv = min(uncor_param, uncor_loparamv);
906	uncor_hiparamv = max(uncor_param, uncor_hiparamv);
907
908	vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
909	+ (uncor_param * l_uncor_bs0);
910	vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
911	+ (uncor_param * l_uncor_bs1);
912	vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
913	+ (uncor_param * l_uncor_bs2);
914
915	vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
916	+ (ew_g * uncor_dist1 * uncor_dist1)
917	+ (ew_b * uncor_dist2 * uncor_dist2);
918
919	haccumulate(uncor_errorsumv, uncor_err, mask);
920
921	// Process samechroma data
922	vfloat samec_param = (data_r * l_samec_bs0)
923	+ (data_g * l_samec_bs1)
924	+ (data_b * l_samec_bs2);
925
926	vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
927	vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
928	vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
929
930	vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
931	+ (ew_g * samec_dist1 * samec_dist1)
932	+ (ew_b * samec_dist2 * samec_dist2);
933
934	haccumulate(samec_errorsumv, samec_err, mask);
935
936	lane_ids += vint (ASTCENC_SIMD_WIDTH);
937	}
938
939	// Turn very small numbers and NaNs into a small number
940	float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
941	pl.line_length = astc::max(uncor_linelen, `1e-7f`);
942	}
943
944	uncor_error = hadd_s(uncor_errorsumv);
945	samec_error = hadd_s(samec_errorsumv);
946	}
947
948	#endif
949

Browse the source code of Godot/thirdparty/astcenc/astcenc_averages_and_directions.cpp