astcenc_block_sizes.cpp source code [Godot/thirdparty/astcenc/astcenc_block_sizes.cpp]

1	// SPDX-License-Identifier: Apache-2.0
2	// ----------------------------------------------------------------------------
3	// Copyright 2011-2023 Arm Limited
4	//
5	// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6	// use this file except in compliance with the License. You may obtain a copy
7	// of the License at:
8	//
9	// http://www.apache.org/licenses/LICENSE-2.0
10	//
11	// Unless required by applicable law or agreed to in writing, software
12	// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13	// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14	// License for the specific language governing permissions and limitations
15	// under the License.
16	// ----------------------------------------------------------------------------
17
18	/**
19	* @brief Functions to generate block size descriptor and decimation tables.
20	*/
21
22	#include "astcenc_internal.h"
23
24	/**
25	* @brief Decode the properties of an encoded 2D block mode.
26	*
27	* @param block_mode The encoded block mode.
28	* @param[out] x_weights The number of weights in the X dimension.
29	* @param[out] y_weights The number of weights in the Y dimension.
30	* @param[out] is_dual_plane True if this block mode has two weight planes.
31	* @param[out] quant_mode The quantization level for the weights.
32	* @param[out] weight_bits The storage bit count for the weights.
33	*
34	* @return Returns true if a valid mode, false otherwise.
35	*/
36	static bool decode_block_mode_2d(
37	unsigned int block_mode,
38	unsigned int& x_weights,
39	unsigned int& y_weights,
40	bool& is_dual_plane,
41	unsigned int& quant_mode,
42	unsigned int& weight_bits
43	) {
44	unsigned int base_quant_mode = (block_mode >> `4`) & `1`;
45	unsigned int H = (block_mode >> `9`) & `1`;
46	unsigned int D = (block_mode >> `10`) & `1`;
47	unsigned int A = (block_mode >> `5`) & `0x3`;
48
49	x_weights = `0`;
50	y_weights = `0`;
51
52	if ((block_mode & `3`) != `0`)
53	{
54	base_quant_mode \|= (block_mode & `3`) << `1`;
55	unsigned int B = (block_mode >> `7`) & `3`;
56	switch ((block_mode >> `2`) & `3`)
57	{
58	case `0`:
59	x_weights = B + `4`;
60	y_weights = A + `2`;
61	break;
62	case `1`:
63	x_weights = B + `8`;
64	y_weights = A + `2`;
65	break;
66	case `2`:
67	x_weights = A + `2`;
68	y_weights = B + `8`;
69	break;
70	case `3`:
71	B &= `1`;
72	if (block_mode & `0x100`)
73	{
74	x_weights = B + `2`;
75	y_weights = A + `2`;
76	}
77	else
78	{
79	x_weights = A + `2`;
80	y_weights = B + `6`;
81	}
82	break;
83	}
84	}
85	else
86	{
87	base_quant_mode \|= ((block_mode >> `2`) & `3`) << `1`;
88	if (((block_mode >> `2`) & `3`) == `0`)
89	{
90	return false;
91	}
92
93	unsigned int B = (block_mode >> `9`) & `3`;
94	switch ((block_mode >> `7`) & `3`)
95	{
96	case `0`:
97	x_weights = `12`;
98	y_weights = A + `2`;
99	break;
100	case `1`:
101	x_weights = A + `2`;
102	y_weights = `12`;
103	break;
104	case `2`:
105	x_weights = A + `6`;
106	y_weights = B + `6`;
107	D = `0`;
108	H = `0`;
109	break;
110	case `3`:
111	switch ((block_mode >> `5`) & `3`)
112	{
113	case `0`:
114	x_weights = `6`;
115	y_weights = `10`;
116	break;
117	case `1`:
118	x_weights = `10`;
119	y_weights = `6`;
120	break;
121	case `2`:
122	case `3`:
123	return false;
124	}
125	break;
126	}
127	}
128
129	unsigned int weight_count = x_weights * y_weights * (D + `1`);
130	quant_mode = (base_quant_mode - `2`) + `6` * H;
131	is_dual_plane = D != `0`;
132
133	weight_bits = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(quant_mode));
134	return (weight_count <= BLOCK_MAX_WEIGHTS &&
135	weight_bits >= BLOCK_MIN_WEIGHT_BITS &&
136	weight_bits <= BLOCK_MAX_WEIGHT_BITS);
137	}
138
139	/**
140	* @brief Decode the properties of an encoded 3D block mode.
141	*
142	* @param block_mode The encoded block mode.
143	* @param[out] x_weights The number of weights in the X dimension.
144	* @param[out] y_weights The number of weights in the Y dimension.
145	* @param[out] z_weights The number of weights in the Z dimension.
146	* @param[out] is_dual_plane True if this block mode has two weight planes.
147	* @param[out] quant_mode The quantization level for the weights.
148	* @param[out] weight_bits The storage bit count for the weights.
149	*
150	* @return Returns true if a valid mode, false otherwise.
151	*/
152	static bool decode_block_mode_3d(
153	unsigned int block_mode,
154	unsigned int& x_weights,
155	unsigned int& y_weights,
156	unsigned int& z_weights,
157	bool& is_dual_plane,
158	unsigned int& quant_mode,
159	unsigned int& weight_bits
160	) {
161	unsigned int base_quant_mode = (block_mode >> `4`) & `1`;
162	unsigned int H = (block_mode >> `9`) & `1`;
163	unsigned int D = (block_mode >> `10`) & `1`;
164	unsigned int A = (block_mode >> `5`) & `0x3`;
165
166	x_weights = `0`;
167	y_weights = `0`;
168	z_weights = `0`;
169
170	if ((block_mode & `3`) != `0`)
171	{
172	base_quant_mode \|= (block_mode & `3`) << `1`;
173	unsigned int B = (block_mode >> `7`) & `3`;
174	unsigned int C = (block_mode >> `2`) & `0x3`;
175	x_weights = A + `2`;
176	y_weights = B + `2`;
177	z_weights = C + `2`;
178	}
179	else
180	{
181	base_quant_mode \|= ((block_mode >> `2`) & `3`) << `1`;
182	if (((block_mode >> `2`) & `3`) == `0`)
183	{
184	return false;
185	}
186
187	int B = (block_mode >> `9`) & `3`;
188	if (((block_mode >> `7`) & `3`) != `3`)
189	{
190	D = `0`;
191	H = `0`;
192	}
193	switch ((block_mode >> `7`) & `3`)
194	{
195	case `0`:
196	x_weights = `6`;
197	y_weights = B + `2`;
198	z_weights = A + `2`;
199	break;
200	case `1`:
201	x_weights = A + `2`;
202	y_weights = `6`;
203	z_weights = B + `2`;
204	break;
205	case `2`:
206	x_weights = A + `2`;
207	y_weights = B + `2`;
208	z_weights = `6`;
209	break;
210	case `3`:
211	x_weights = `2`;
212	y_weights = `2`;
213	z_weights = `2`;
214	switch ((block_mode >> `5`) & `3`)
215	{
216	case `0`:
217	x_weights = `6`;
218	break;
219	case `1`:
220	y_weights = `6`;
221	break;
222	case `2`:
223	z_weights = `6`;
224	break;
225	case `3`:
226	return false;
227	}
228	break;
229	}
230	}
231
232	unsigned int weight_count = x_weights * y_weights * z_weights * (D + `1`);
233	quant_mode = (base_quant_mode - `2`) + `6` * H;
234	is_dual_plane = D != `0`;
235
236	weight_bits = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(quant_mode));
237	return (weight_count <= BLOCK_MAX_WEIGHTS &&
238	weight_bits >= BLOCK_MIN_WEIGHT_BITS &&
239	weight_bits <= BLOCK_MAX_WEIGHT_BITS);
240	}
241
242	/**
243	* @brief Create a 2D decimation entry for a block-size and weight-decimation pair.
244	*
245	* @param x_texels The number of texels in the X dimension.
246	* @param y_texels The number of texels in the Y dimension.
247	* @param x_weights The number of weights in the X dimension.
248	* @param y_weights The number of weights in the Y dimension.
249	* @param[out] di The decimation info structure to populate.
250	* @param[out] wb The decimation table init scratch working buffers.
251	*/
252	static void init_decimation_info_2d(
253	unsigned int x_texels,
254	unsigned int y_texels,
255	unsigned int x_weights,
256	unsigned int y_weights,
257	decimation_info& di,
258	dt_init_working_buffers& wb
259	) {
260	unsigned int texels_per_block = x_texels * y_texels;
261	unsigned int weights_per_block = x_weights * y_weights;
262
263	uint8_t max_texel_count_of_weight = `0`;
264
265	promise(weights_per_block > `0`);
266	promise(texels_per_block > `0`);
267	promise(x_texels > `0`);
268	promise(y_texels > `0`);
269
270	for (unsigned int i = `0`; i < weights_per_block; i++)
271	{
272	wb.texel_count_of_weight[i] = `0`;
273	}
274
275	for (unsigned int i = `0`; i < texels_per_block; i++)
276	{
277	wb.weight_count_of_texel[i] = `0`;
278	}
279
280	for (unsigned int y = `0`; y < y_texels; y++)
281	{
282	for (unsigned int x = `0`; x < x_texels; x++)
283	{
284	unsigned int texel = y * x_texels + x;
285
286	unsigned int x_weight = (((`1024` + x_texels / `2`) / (x_texels - `1`)) * x * (x_weights - `1`) + `32`) >> `6`;
287	unsigned int y_weight = (((`1024` + y_texels / `2`) / (y_texels - `1`)) * y * (y_weights - `1`) + `32`) >> `6`;
288
289	unsigned int x_weight_frac = x_weight & `0xF`;
290	unsigned int y_weight_frac = y_weight & `0xF`;
291	unsigned int x_weight_int = x_weight >> `4`;
292	unsigned int y_weight_int = y_weight >> `4`;
293
294	unsigned int qweight[`4`];
295	qweight[`0`] = x_weight_int + y_weight_int * x_weights;
296	qweight[`1`] = qweight[`0`] + `1`;
297	qweight[`2`] = qweight[`0`] + x_weights;
298	qweight[`3`] = qweight[`2`] + `1`;
299
300	// Truncated-precision bilinear interpolation
301	unsigned int prod = x_weight_frac * y_weight_frac;
302
303	unsigned int weight[`4`];
304	weight[`3`] = (prod + `8`) >> `4`;
305	weight[`1`] = x_weight_frac - weight[`3`];
306	weight[`2`] = y_weight_frac - weight[`3`];
307	weight[`0`] = `16` - x_weight_frac - y_weight_frac + weight[`3`];
308
309	for (unsigned int i = `0`; i < `4`; i++)
310	{
311	if (weight[i] != `0`)
312	{
313	wb.grid_weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(qweight[i]);
314	wb.weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(weight[i]);
315	wb.weight_count_of_texel[texel]++;
316	wb.texels_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(texel);
317	wb.texel_weights_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(weight[i]);
318	wb.texel_count_of_weight[qweight[i]]++;
319	max_texel_count_of_weight = astc::max(max_texel_count_of_weight, wb.texel_count_of_weight[qweight[i]]);
320	}
321	}
322	}
323	}
324
325	uint8_t max_texel_weight_count = `0`;
326	for (unsigned int i = `0`; i < texels_per_block; i++)
327	{
328	di.texel_weight_count[i] = wb.weight_count_of_texel[i];
329	max_texel_weight_count = astc::max(max_texel_weight_count, di.texel_weight_count[i]);
330
331	for (unsigned int j = `0`; j < wb.weight_count_of_texel[i]; j++)
332	{
333	di.texel_weight_contribs_int_tr[j][i] = wb.weights_of_texel[i][j];
334	di.texel_weight_contribs_float_tr[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (`1.0f` / WEIGHTS_TEXEL_SUM);
335	di.texel_weights_tr[j][i] = wb.grid_weights_of_texel[i][j];
336	}
337
338	// Init all 4 entries so we can rely on zeros for vectorization
339	for (unsigned int j = wb.weight_count_of_texel[i]; j < `4`; j++)
340	{
341	di.texel_weight_contribs_int_tr[j][i] = `0`;
342	di.texel_weight_contribs_float_tr[j][i] = `0.0f`;
343	di.texel_weights_tr[j][i] = `0`;
344	}
345	}
346
347	di.max_texel_weight_count = max_texel_weight_count;
348
349	for (unsigned int i = `0`; i < weights_per_block; i++)
350	{
351	unsigned int texel_count_wt = wb.texel_count_of_weight[i];
352	di.weight_texel_count[i] = static_cast<uint8_t>(texel_count_wt);
353
354	for (unsigned int j = `0`; j < texel_count_wt; j++)
355	{
356	uint8_t texel = wb.texels_of_weight[i][j];
357
358	// Create transposed versions of these for better vectorization
359	di.weight_texels_tr[j][i] = texel;
360	di.weights_texel_contribs_tr[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]);
361
362	// Store the per-texel contribution of this weight for each texel it contributes to
363	di.texel_contrib_for_weight[j][i] = `0.0f`;
364	for (unsigned int k = `0`; k < `4`; k++)
365	{
366	uint8_t dttw = di.texel_weights_tr[k][texel];
367	float dttwf = di.texel_weight_contribs_float_tr[k][texel];
368	if (dttw == i && dttwf != `0.0f`)
369	{
370	di.texel_contrib_for_weight[j][i] = di.texel_weight_contribs_float_tr[k][texel];
371	break;
372	}
373	}
374	}
375
376	// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
377	// Match last texel in active lane in SIMD group, for better gathers
378	uint8_t last_texel = di.weight_texels_tr[texel_count_wt - `1`][i];
379	for (unsigned int j = texel_count_wt; j < max_texel_count_of_weight; j++)
380	{
381	di.weight_texels_tr[j][i] = last_texel;
382	di.weights_texel_contribs_tr[j][i] = `0.0f`;
383	}
384	}
385
386	// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
387	unsigned int texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block);
388	for (unsigned int i = texels_per_block; i < texels_per_block_simd; i++)
389	{
390	di.texel_weight_count[i] = `0`;
391
392	for (unsigned int j = `0`; j < `4`; j++)
393	{
394	di.texel_weight_contribs_float_tr[j][i] = `0`;
395	di.texel_weights_tr[j][i] = `0`;
396	di.texel_weight_contribs_int_tr[j][i] = `0`;
397	}
398	}
399
400	// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
401	// Match last texel in active lane in SIMD group, for better gathers
402	unsigned int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - `1`];
403	uint8_t last_texel = di.weight_texels_tr[last_texel_count_wt - `1`][weights_per_block - `1`];
404
405	unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
406	for (unsigned int i = weights_per_block; i < weights_per_block_simd; i++)
407	{
408	di.weight_texel_count[i] = `0`;
409
410	for (unsigned int j = `0`; j < max_texel_count_of_weight; j++)
411	{
412	di.weight_texels_tr[j][i] = last_texel;
413	di.weights_texel_contribs_tr[j][i] = `0.0f`;
414	}
415	}
416
417	di.texel_count = static_cast<uint8_t>(texels_per_block);
418	di.weight_count = static_cast<uint8_t>(weights_per_block);
419	di.weight_x = static_cast<uint8_t>(x_weights);
420	di.weight_y = static_cast<uint8_t>(y_weights);
421	di.weight_z = `1`;
422	}
423
424	/**
425	* @brief Create a 3D decimation entry for a block-size and weight-decimation pair.
426	*
427	* @param x_texels The number of texels in the X dimension.
428	* @param y_texels The number of texels in the Y dimension.
429	* @param z_texels The number of texels in the Z dimension.
430	* @param x_weights The number of weights in the X dimension.
431	* @param y_weights The number of weights in the Y dimension.
432	* @param z_weights The number of weights in the Z dimension.
433	* @param[out] di The decimation info structure to populate.
434	@param[out] wb The decimation table init scratch working buffers.
435	*/
436	static void init_decimation_info_3d(
437	unsigned int x_texels,
438	unsigned int y_texels,
439	unsigned int z_texels,
440	unsigned int x_weights,
441	unsigned int y_weights,
442	unsigned int z_weights,
443	decimation_info& di,
444	dt_init_working_buffers& wb
445	) {
446	unsigned int texels_per_block = x_texels * y_texels * z_texels;
447	unsigned int weights_per_block = x_weights * y_weights * z_weights;
448
449	uint8_t max_texel_count_of_weight = `0`;
450
451	promise(weights_per_block > `0`);
452	promise(texels_per_block > `0`);
453
454	for (unsigned int i = `0`; i < weights_per_block; i++)
455	{
456	wb.texel_count_of_weight[i] = `0`;
457	}
458
459	for (unsigned int i = `0`; i < texels_per_block; i++)
460	{
461	wb.weight_count_of_texel[i] = `0`;
462	}
463
464	for (unsigned int z = `0`; z < z_texels; z++)
465	{
466	for (unsigned int y = `0`; y < y_texels; y++)
467	{
468	for (unsigned int x = `0`; x < x_texels; x++)
469	{
470	int texel = (z * y_texels + y) * x_texels + x;
471
472	int x_weight = (((`1024` + x_texels / `2`) / (x_texels - `1`)) * x * (x_weights - `1`) + `32`) >> `6`;
473	int y_weight = (((`1024` + y_texels / `2`) / (y_texels - `1`)) * y * (y_weights - `1`) + `32`) >> `6`;
474	int z_weight = (((`1024` + z_texels / `2`) / (z_texels - `1`)) * z * (z_weights - `1`) + `32`) >> `6`;
475
476	int x_weight_frac = x_weight & `0xF`;
477	int y_weight_frac = y_weight & `0xF`;
478	int z_weight_frac = z_weight & `0xF`;
479	int x_weight_int = x_weight >> `4`;
480	int y_weight_int = y_weight >> `4`;
481	int z_weight_int = z_weight >> `4`;
482	int qweight[`4`];
483	int weight[`4`];
484	qweight[`0`] = (z_weight_int * y_weights + y_weight_int) * x_weights + x_weight_int;
485	qweight[`3`] = ((z_weight_int + `1`) * y_weights + (y_weight_int + `1`)) * x_weights + (x_weight_int + `1`);
486
487	// simplex interpolation
488	int fs = x_weight_frac;
489	int ft = y_weight_frac;
490	int fp = z_weight_frac;
491
492	int cas = ((fs > ft) << `2`) + ((ft > fp) << `1`) + ((fs > fp));
493	int N = x_weights;
494	int NM = x_weights * y_weights;
495
496	int s1, s2, w0, w1, w2, w3;
497	switch (cas)
498	{
499	case `7`:
500	s1 = `1`;
501	s2 = N;
502	w0 = `16` - fs;
503	w1 = fs - ft;
504	w2 = ft - fp;
505	w3 = fp;
506	break;
507	case `3`:
508	s1 = N;
509	s2 = `1`;
510	w0 = `16` - ft;
511	w1 = ft - fs;
512	w2 = fs - fp;
513	w3 = fp;
514	break;
515	case `5`:
516	s1 = `1`;
517	s2 = NM;
518	w0 = `16` - fs;
519	w1 = fs - fp;
520	w2 = fp - ft;
521	w3 = ft;
522	break;
523	case `4`:
524	s1 = NM;
525	s2 = `1`;
526	w0 = `16` - fp;
527	w1 = fp - fs;
528	w2 = fs - ft;
529	w3 = ft;
530	break;
531	case `2`:
532	s1 = N;
533	s2 = NM;
534	w0 = `16` - ft;
535	w1 = ft - fp;
536	w2 = fp - fs;
537	w3 = fs;
538	break;
539	case `0`:
540	s1 = NM;
541	s2 = N;
542	w0 = `16` - fp;
543	w1 = fp - ft;
544	w2 = ft - fs;
545	w3 = fs;
546	break;
547	default:
548	s1 = NM;
549	s2 = N;
550	w0 = `16` - fp;
551	w1 = fp - ft;
552	w2 = ft - fs;
553	w3 = fs;
554	break;
555	}
556
557	qweight[`1`] = qweight[`0`] + s1;
558	qweight[`2`] = qweight[`1`] + s2;
559	weight[`0`] = w0;
560	weight[`1`] = w1;
561	weight[`2`] = w2;
562	weight[`3`] = w3;
563
564	for (unsigned int i = `0`; i < `4`; i++)
565	{
566	if (weight[i] != `0`)
567	{
568	wb.grid_weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(qweight[i]);
569	wb.weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(weight[i]);
570	wb.weight_count_of_texel[texel]++;
571	wb.texels_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(texel);
572	wb.texel_weights_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(weight[i]);
573	wb.texel_count_of_weight[qweight[i]]++;
574	max_texel_count_of_weight = astc::max(max_texel_count_of_weight, wb.texel_count_of_weight[qweight[i]]);
575	}
576	}
577	}
578	}
579	}
580
581	uint8_t max_texel_weight_count = `0`;
582	for (unsigned int i = `0`; i < texels_per_block; i++)
583	{
584	di.texel_weight_count[i] = wb.weight_count_of_texel[i];
585	max_texel_weight_count = astc::max(max_texel_weight_count, di.texel_weight_count[i]);
586
587	// Init all 4 entries so we can rely on zeros for vectorization
588	for (unsigned int j = `0`; j < `4`; j++)
589	{
590	di.texel_weight_contribs_int_tr[j][i] = `0`;
591	di.texel_weight_contribs_float_tr[j][i] = `0.0f`;
592	di.texel_weights_tr[j][i] = `0`;
593	}
594
595	for (unsigned int j = `0`; j < wb.weight_count_of_texel[i]; j++)
596	{
597	di.texel_weight_contribs_int_tr[j][i] = wb.weights_of_texel[i][j];
598	di.texel_weight_contribs_float_tr[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (`1.0f` / WEIGHTS_TEXEL_SUM);
599	di.texel_weights_tr[j][i] = wb.grid_weights_of_texel[i][j];
600	}
601	}
602
603	di.max_texel_weight_count = max_texel_weight_count;
604
605	for (unsigned int i = `0`; i < weights_per_block; i++)
606	{
607	unsigned int texel_count_wt = wb.texel_count_of_weight[i];
608	di.weight_texel_count[i] = static_cast<uint8_t>(texel_count_wt);
609
610	for (unsigned int j = `0`; j < texel_count_wt; j++)
611	{
612	unsigned int texel = wb.texels_of_weight[i][j];
613
614	// Create transposed versions of these for better vectorization
615	di.weight_texels_tr[j][i] = static_cast<uint8_t>(texel);
616	di.weights_texel_contribs_tr[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]);
617
618	// Store the per-texel contribution of this weight for each texel it contributes to
619	di.texel_contrib_for_weight[j][i] = `0.0f`;
620	for (unsigned int k = `0`; k < `4`; k++)
621	{
622	uint8_t dttw = di.texel_weights_tr[k][texel];
623	float dttwf = di.texel_weight_contribs_float_tr[k][texel];
624	if (dttw == i && dttwf != `0.0f`)
625	{
626	di.texel_contrib_for_weight[j][i] = di.texel_weight_contribs_float_tr[k][texel];
627	break;
628	}
629	}
630	}
631
632	// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
633	// Match last texel in active lane in SIMD group, for better gathers
634	uint8_t last_texel = di.weight_texels_tr[texel_count_wt - `1`][i];
635	for (unsigned int j = texel_count_wt; j < max_texel_count_of_weight; j++)
636	{
637	di.weight_texels_tr[j][i] = last_texel;
638	di.weights_texel_contribs_tr[j][i] = `0.0f`;
639	}
640	}
641
642	// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
643	unsigned int texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block);
644	for (unsigned int i = texels_per_block; i < texels_per_block_simd; i++)
645	{
646	di.texel_weight_count[i] = `0`;
647
648	for (unsigned int j = `0`; j < `4`; j++)
649	{
650	di.texel_weight_contribs_float_tr[j][i] = `0`;
651	di.texel_weights_tr[j][i] = `0`;
652	di.texel_weight_contribs_int_tr[j][i] = `0`;
653	}
654	}
655
656	// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
657	// Match last texel in active lane in SIMD group, for better gathers
658	int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - `1`];
659	uint8_t last_texel = di.weight_texels_tr[last_texel_count_wt - `1`][weights_per_block - `1`];
660
661	unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
662	for (unsigned int i = weights_per_block; i < weights_per_block_simd; i++)
663	{
664	di.weight_texel_count[i] = `0`;
665
666	for (int j = `0`; j < max_texel_count_of_weight; j++)
667	{
668	di.weight_texels_tr[j][i] = last_texel;
669	di.weights_texel_contribs_tr[j][i] = `0.0f`;
670	}
671	}
672
673	di.texel_count = static_cast<uint8_t>(texels_per_block);
674	di.weight_count = static_cast<uint8_t>(weights_per_block);
675	di.weight_x = static_cast<uint8_t>(x_weights);
676	di.weight_y = static_cast<uint8_t>(y_weights);
677	di.weight_z = static_cast<uint8_t>(z_weights);
678	}
679
680	/**
681	* @brief Assign the texels to use for kmeans clustering.
682	*
683	* The max limit is @c BLOCK_MAX_KMEANS_TEXELS; above this a random selection is used.
684	* The @c bsd.texel_count is an input and must be populated beforehand.
685	*
686	* @param[in,out] bsd The block size descriptor to populate.
687	*/
688	static void assign_kmeans_texels(
689	block_size_descriptor& bsd
690	) {
691	// Use all texels for kmeans on a small block
692	if (bsd.texel_count <= BLOCK_MAX_KMEANS_TEXELS)
693	{
694	for (uint8_t i = `0`; i < bsd.texel_count; i++)
695	{
696	bsd.kmeans_texels[i] = i;
697	}
698
699	return;
700	}
701
702	// Select a random subset of BLOCK_MAX_KMEANS_TEXELS for kmeans on a large block
703	uint64_t rng_state[`2`];
704	astc::rand_init(rng_state);
705
706	// Initialize array used for tracking used indices
707	bool seen[BLOCK_MAX_TEXELS];
708	for (uint8_t i = `0`; i < bsd.texel_count; i++)
709	{
710	seen[i] = false;
711	}
712
713	// Assign 64 random indices, retrying if we see repeats
714	unsigned int arr_elements_set = `0`;
715	while (arr_elements_set < BLOCK_MAX_KMEANS_TEXELS)
716	{
717	uint8_t texel = static_cast<uint8_t>(astc::rand(rng_state));
718	texel = texel % bsd.texel_count;
719	if (!seen[texel])
720	{
721	bsd.kmeans_texels[arr_elements_set++] = texel;
722	seen[texel] = true;
723	}
724	}
725	}
726
727	/**
728	* @brief Allocate a single 2D decimation table entry.
729	*
730	* @param x_texels The number of texels in the X dimension.
731	* @param y_texels The number of texels in the Y dimension.
732	* @param x_weights The number of weights in the X dimension.
733	* @param y_weights The number of weights in the Y dimension.
734	* @param bsd The block size descriptor we are populating.
735	* @param wb The decimation table init scratch working buffers.
736	* @param index The packed array index to populate.
737	*/
738	static void construct_dt_entry_2d(
739	unsigned int x_texels,
740	unsigned int y_texels,
741	unsigned int x_weights,
742	unsigned int y_weights,
743	block_size_descriptor& bsd,
744	dt_init_working_buffers& wb,
745	unsigned int index
746	) {
747	unsigned int weight_count = x_weights * y_weights;
748	assert(weight_count <= BLOCK_MAX_WEIGHTS);
749
750	bool try_2planes = (`2` * weight_count) <= BLOCK_MAX_WEIGHTS;
751
752	decimation_info& di = bsd.decimation_tables[index];
753	init_decimation_info_2d(x_texels, y_texels, x_weights, y_weights, di, wb);
754
755	int maxprec_1plane = -`1`;
756	int maxprec_2planes = -`1`;
757	for (int i = `0`; i < `12`; i++)
758	{
759	unsigned int bits_1plane = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(i));
760	if (bits_1plane >= BLOCK_MIN_WEIGHT_BITS && bits_1plane <= BLOCK_MAX_WEIGHT_BITS)
761	{
762	maxprec_1plane = i;
763	}
764
765	if (try_2planes)
766	{
767	unsigned int bits_2planes = get_ise_sequence_bitcount(`2` * weight_count, static_cast<quant_method>(i));
768	if (bits_2planes >= BLOCK_MIN_WEIGHT_BITS && bits_2planes <= BLOCK_MAX_WEIGHT_BITS)
769	{
770	maxprec_2planes = i;
771	}
772	}
773	}
774
775	// At least one of the two should be valid ...
776	assert(maxprec_1plane >= `0` \|\| maxprec_2planes >= `0`);
777	bsd.decimation_modes[index].maxprec_1plane = static_cast<int8_t>(maxprec_1plane);
778	bsd.decimation_modes[index].maxprec_2planes = static_cast<int8_t>(maxprec_2planes);
779	bsd.decimation_modes[index].refprec_1plane = `0`;
780	bsd.decimation_modes[index].refprec_2planes = `0`;
781	}
782
783	/**
784	* @brief Allocate block modes and decimation tables for a single 2D block size.
785	*
786	* @param x_texels The number of texels in the X dimension.
787	* @param y_texels The number of texels in the Y dimension.
788	* @param can_omit_modes Can we discard modes that astcenc won't use, even if legal?
789	* @param mode_cutoff Percentile cutoff in range [0,1]. Low values more likely to be used.
790	* @param[out] bsd The block size descriptor to populate.
791	*/
792	static void construct_block_size_descriptor_2d(
793	unsigned int x_texels,
794	unsigned int y_texels,
795	bool can_omit_modes,
796	float mode_cutoff,
797	block_size_descriptor& bsd
798	) {
799	// Store a remap table for storing packed decimation modes.
800	// Indexing uses [Y 16 + X] and max size for each axis is 12.*
801	static const unsigned int MAX_DMI = `12` * `16` + `12`;
802	int decimation_mode_index[MAX_DMI];
803
804	dt_init_working_buffers* wb = new dt_init_working_buffers;
805
806	bsd.xdim = static_cast<uint8_t>(x_texels);
807	bsd.ydim = static_cast<uint8_t>(y_texels);
808	bsd.zdim = `1`;
809	bsd.texel_count = static_cast<uint8_t>(x_texels * y_texels);
810
811	for (unsigned int i = `0`; i < MAX_DMI; i++)
812	{
813	decimation_mode_index[i] = -`1`;
814	}
815
816	// Gather all the decimation grids that can be used with the current block
817	#if !defined(ASTCENC_DECOMPRESS_ONLY)
818	const float *percentiles = get_2d_percentile_table(x_texels, y_texels);
819	float always_cutoff = `0.0f`;
820	#else
821	// Unused in decompress-only builds
822	(void)can_omit_modes;
823	(void)mode_cutoff;
824	#endif
825
826	// Construct the list of block formats referencing the decimation tables
827	unsigned int packed_bm_idx = `0`;
828	unsigned int packed_dm_idx = `0`;
829
830	// Trackers
831	unsigned int bm_counts[`4`] { `0` };
832	unsigned int dm_counts[`4`] { `0` };
833
834	// Clear the list to a known-bad value
835	for (unsigned int i = `0`; i < WEIGHTS_MAX_BLOCK_MODES; i++)
836	{
837	bsd.block_mode_packed_index[i] = BLOCK_BAD_BLOCK_MODE;
838	}
839
840	// Iterate four times to build a usefully ordered list:
841	// - Pass 0 - keep selected single plane "always" block modes
842	// - Pass 1 - keep selected single plane "non-always" block modes
843	// - Pass 2 - keep select dual plane block modes
844	// - Pass 3 - keep everything else that's legal
845	unsigned int limit = can_omit_modes ? `3` : `4`;
846	for (unsigned int j = `0`; j < limit; j ++)
847	{
848	for (unsigned int i = `0`; i < WEIGHTS_MAX_BLOCK_MODES; i++)
849	{
850	// Skip modes we've already included in a previous pass
851	if (bsd.block_mode_packed_index[i] != BLOCK_BAD_BLOCK_MODE)
852	{
853	continue;
854	}
855
856	// Decode parameters
857	unsigned int x_weights;
858	unsigned int y_weights;
859	bool is_dual_plane;
860	unsigned int quant_mode;
861	unsigned int weight_bits;
862	bool valid = decode_block_mode_2d(i, x_weights, y_weights, is_dual_plane, quant_mode, weight_bits);
863
864	// Always skip invalid encodings for the current block size
865	if (!valid \|\| (x_weights > x_texels) \|\| (y_weights > y_texels))
866	{
867	continue;
868	}
869
870	// Selectively skip dual plane encodings
871	if (((j <= `1`) && is_dual_plane) \|\| (j == `2` && !is_dual_plane))
872	{
873	continue;
874	}
875
876	// Always skip encodings we can't physically encode based on
877	// generic encoding bit availability
878	if (is_dual_plane)
879	{
880	// This is the only check we need as only support 1 partition
881	if ((`109` - weight_bits) <= `0`)
882	{
883	continue;
884	}
885	}
886	else
887	{
888	// This is conservative - fewer bits may be available for > 1 partition
889	if ((`111` - weight_bits) <= `0`)
890	{
891	continue;
892	}
893	}
894
895	// Selectively skip encodings based on percentile
896	bool percentile_hit = false;
897	#if !defined(ASTCENC_DECOMPRESS_ONLY)
898	if (j == `0`)
899	{
900	percentile_hit = percentiles[i] <= always_cutoff;
901	}
902	else
903	{
904	percentile_hit = percentiles[i] <= mode_cutoff;
905	}
906	#endif
907
908	if (j != `3` && !percentile_hit)
909	{
910	continue;
911	}
912
913	// Allocate and initialize the decimation table entry if we've not used it yet
914	int decimation_mode = decimation_mode_index[y_weights * `16` + x_weights];
915	if (decimation_mode < `0`)
916	{
917	construct_dt_entry_2d(x_texels, y_texels, x_weights, y_weights, bsd, *wb, packed_dm_idx);
918	decimation_mode_index[y_weights * `16` + x_weights] = packed_dm_idx;
919	decimation_mode = packed_dm_idx;
920
921	dm_counts[j]++;
922	packed_dm_idx++;
923	}
924
925	auto& bm = bsd.block_modes[packed_bm_idx];
926
927	bm.decimation_mode = static_cast<uint8_t>(decimation_mode);
928	bm.quant_mode = static_cast<uint8_t>(quant_mode);
929	bm.is_dual_plane = static_cast<uint8_t>(is_dual_plane);
930	bm.weight_bits = static_cast<uint8_t>(weight_bits);
931	bm.mode_index = static_cast<uint16_t>(i);
932
933	auto& dm = bsd.decimation_modes[decimation_mode];
934
935	if (is_dual_plane)
936	{
937	dm.set_ref_2plane(bm.get_weight_quant_mode());
938	}
939	else
940	{
941	dm.set_ref_1plane(bm.get_weight_quant_mode());
942	}
943
944	bsd.block_mode_packed_index[i] = static_cast<uint16_t>(packed_bm_idx);
945
946	packed_bm_idx++;
947	bm_counts[j]++;
948	}
949	}
950
951	bsd.block_mode_count_1plane_always = bm_counts[`0`];
952	bsd.block_mode_count_1plane_selected = bm_counts[`0`] + bm_counts[`1`];
953	bsd.block_mode_count_1plane_2plane_selected = bm_counts[`0`] + bm_counts[`1`] + bm_counts[`2`];
954	bsd.block_mode_count_all = bm_counts[`0`] + bm_counts[`1`] + bm_counts[`2`] + bm_counts[`3`];
955
956	bsd.decimation_mode_count_always = dm_counts[`0`];
957	bsd.decimation_mode_count_selected = dm_counts[`0`] + dm_counts[`1`] + dm_counts[`2`];
958	bsd.decimation_mode_count_all = dm_counts[`0`] + dm_counts[`1`] + dm_counts[`2`] + dm_counts[`3`];
959
960	#if !defined(ASTCENC_DECOMPRESS_ONLY)
961	assert(bsd.block_mode_count_1plane_always > `0`);
962	assert(bsd.decimation_mode_count_always > `0`);
963
964	delete[] percentiles;
965	#endif
966
967	// Ensure the end of the array contains valid data (should never get read)
968	for (unsigned int i = bsd.decimation_mode_count_all; i < WEIGHTS_MAX_DECIMATION_MODES; i++)
969	{
970	bsd.decimation_modes[i].maxprec_1plane = -`1`;
971	bsd.decimation_modes[i].maxprec_2planes = -`1`;
972	bsd.decimation_modes[i].refprec_1plane = `0`;
973	bsd.decimation_modes[i].refprec_2planes = `0`;
974	}
975
976	// Determine the texels to use for kmeans clustering.
977	assign_kmeans_texels(bsd);
978
979	delete wb;
980	}
981
982	/**
983	* @brief Allocate block modes and decimation tables for a single 3D block size.
984	*
985	* TODO: This function doesn't include all of the heuristics that we use for 2D block sizes such as
986	* the percentile mode cutoffs. If 3D becomes more widely used we should look at this.
987	*
988	* @param x_texels The number of texels in the X dimension.
989	* @param y_texels The number of texels in the Y dimension.
990	* @param z_texels The number of texels in the Z dimension.
991	* @param[out] bsd The block size descriptor to populate.
992	*/
993	static void construct_block_size_descriptor_3d(
994	unsigned int x_texels,
995	unsigned int y_texels,
996	unsigned int z_texels,
997	block_size_descriptor& bsd
998	) {
999	// Store a remap table for storing packed decimation modes.
1000	// Indexing uses [Z 64 + Y * 8 + X] and max size for each axis is 6.*
1001	static constexpr unsigned int MAX_DMI = `6` * `64` + `6` * `8` + `6`;
1002	int decimation_mode_index[MAX_DMI];
1003	unsigned int decimation_mode_count = `0`;
1004
1005	dt_init_working_buffers* wb = new dt_init_working_buffers;
1006
1007	bsd.xdim = static_cast<uint8_t>(x_texels);
1008	bsd.ydim = static_cast<uint8_t>(y_texels);
1009	bsd.zdim = static_cast<uint8_t>(z_texels);
1010	bsd.texel_count = static_cast<uint8_t>(x_texels * y_texels * z_texels);
1011
1012	for (unsigned int i = `0`; i < MAX_DMI; i++)
1013	{
1014	decimation_mode_index[i] = -`1`;
1015	}
1016
1017	// gather all the infill-modes that can be used with the current block size
1018	for (unsigned int x_weights = `2`; x_weights <= x_texels; x_weights++)
1019	{
1020	for (unsigned int y_weights = `2`; y_weights <= y_texels; y_weights++)
1021	{
1022	for (unsigned int z_weights = `2`; z_weights <= z_texels; z_weights++)
1023	{
1024	unsigned int weight_count = x_weights * y_weights * z_weights;
1025	if (weight_count > BLOCK_MAX_WEIGHTS)
1026	{
1027	continue;
1028	}
1029
1030	decimation_info& di = bsd.decimation_tables[decimation_mode_count];
1031	decimation_mode_index[z_weights * `64` + y_weights * `8` + x_weights] = decimation_mode_count;
1032	init_decimation_info_3d(x_texels, y_texels, z_texels, x_weights, y_weights, z_weights, di, *wb);
1033
1034	int maxprec_1plane = -`1`;
1035	int maxprec_2planes = -`1`;
1036	for (unsigned int i = `0`; i < `12`; i++)
1037	{
1038	unsigned int bits_1plane = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(i));
1039	if (bits_1plane >= BLOCK_MIN_WEIGHT_BITS && bits_1plane <= BLOCK_MAX_WEIGHT_BITS)
1040	{
1041	maxprec_1plane = i;
1042	}
1043
1044	unsigned int bits_2planes = get_ise_sequence_bitcount(`2` * weight_count, static_cast<quant_method>(i));
1045	if (bits_2planes >= BLOCK_MIN_WEIGHT_BITS && bits_2planes <= BLOCK_MAX_WEIGHT_BITS)
1046	{
1047	maxprec_2planes = i;
1048	}
1049	}
1050
1051	if ((`2` * weight_count) > BLOCK_MAX_WEIGHTS)
1052	{
1053	maxprec_2planes = -`1`;
1054	}
1055
1056	bsd.decimation_modes[decimation_mode_count].maxprec_1plane = static_cast<int8_t>(maxprec_1plane);
1057	bsd.decimation_modes[decimation_mode_count].maxprec_2planes = static_cast<int8_t>(maxprec_2planes);
1058	bsd.decimation_modes[decimation_mode_count].refprec_1plane = maxprec_1plane == -`1` ? `0` : `0xFFFF`;
1059	bsd.decimation_modes[decimation_mode_count].refprec_2planes = maxprec_2planes == -`1` ? `0` : `0xFFFF`;
1060	decimation_mode_count++;
1061	}
1062	}
1063	}
1064
1065	// Ensure the end of the array contains valid data (should never get read)
1066	for (unsigned int i = decimation_mode_count; i < WEIGHTS_MAX_DECIMATION_MODES; i++)
1067	{
1068	bsd.decimation_modes[i].maxprec_1plane = -`1`;
1069	bsd.decimation_modes[i].maxprec_2planes = -`1`;
1070	bsd.decimation_modes[i].refprec_1plane = `0`;
1071	bsd.decimation_modes[i].refprec_2planes = `0`;
1072	}
1073
1074	bsd.decimation_mode_count_always = `0`; // Skipped for 3D modes
1075	bsd.decimation_mode_count_selected = decimation_mode_count;
1076	bsd.decimation_mode_count_all = decimation_mode_count;
1077
1078	// Construct the list of block formats referencing the decimation tables
1079
1080	// Clear the list to a known-bad value
1081	for (unsigned int i = `0`; i < WEIGHTS_MAX_BLOCK_MODES; i++)
1082	{
1083	bsd.block_mode_packed_index[i] = BLOCK_BAD_BLOCK_MODE;
1084	}
1085
1086	unsigned int packed_idx = `0`;
1087	unsigned int bm_counts[`2`] { `0` };
1088
1089	// Iterate two times to build a usefully ordered list:
1090	// - Pass 0 - keep valid single plane block modes
1091	// - Pass 1 - keep valid dual plane block modes
1092	for (unsigned int j = `0`; j < `2`; j++)
1093	{
1094	for (unsigned int i = `0`; i < WEIGHTS_MAX_BLOCK_MODES; i++)
1095	{
1096	// Skip modes we've already included in a previous pass
1097	if (bsd.block_mode_packed_index[i] != BLOCK_BAD_BLOCK_MODE)
1098	{
1099	continue;
1100	}
1101
1102	unsigned int x_weights;
1103	unsigned int y_weights;
1104	unsigned int z_weights;
1105	bool is_dual_plane;
1106	unsigned int quant_mode;
1107	unsigned int weight_bits;
1108
1109	bool valid = decode_block_mode_3d(i, x_weights, y_weights, z_weights, is_dual_plane, quant_mode, weight_bits);
1110	// Skip invalid encodings
1111	if (!valid \|\| x_weights > x_texels \|\| y_weights > y_texels \|\| z_weights > z_texels)
1112	{
1113	continue;
1114	}
1115
1116	// Skip encodings in the wrong iteration
1117	if ((j == `0` && is_dual_plane) \|\| (j == `1` && !is_dual_plane))
1118	{
1119	continue;
1120	}
1121
1122	// Always skip encodings we can't physically encode based on bit availability
1123	if (is_dual_plane)
1124	{
1125	// This is the only check we need as only support 1 partition
1126	if ((`109` - weight_bits) <= `0`)
1127	{
1128	continue;
1129	}
1130	}
1131	else
1132	{
1133	// This is conservative - fewer bits may be available for > 1 partition
1134	if ((`111` - weight_bits) <= `0`)
1135	{
1136	continue;
1137	}
1138	}
1139
1140	int decimation_mode = decimation_mode_index[z_weights * `64` + y_weights * `8` + x_weights];
1141	bsd.block_modes[packed_idx].decimation_mode = static_cast<uint8_t>(decimation_mode);
1142	bsd.block_modes[packed_idx].quant_mode = static_cast<uint8_t>(quant_mode);
1143	bsd.block_modes[packed_idx].weight_bits = static_cast<uint8_t>(weight_bits);
1144	bsd.block_modes[packed_idx].is_dual_plane = static_cast<uint8_t>(is_dual_plane);
1145	bsd.block_modes[packed_idx].mode_index = static_cast<uint16_t>(i);
1146
1147	bsd.block_mode_packed_index[i] = static_cast<uint16_t>(packed_idx);
1148	bm_counts[j]++;
1149	packed_idx++;
1150	}
1151	}
1152
1153	bsd.block_mode_count_1plane_always = `0`; // Skipped for 3D modes
1154	bsd.block_mode_count_1plane_selected = bm_counts[`0`];
1155	bsd.block_mode_count_1plane_2plane_selected = bm_counts[`0`] + bm_counts[`1`];
1156	bsd.block_mode_count_all = bm_counts[`0`] + bm_counts[`1`];
1157
1158	// Determine the texels to use for kmeans clustering.
1159	assign_kmeans_texels(bsd);
1160
1161	delete wb;
1162	}
1163
1164	/ See header for documentation. /
1165	void init_block_size_descriptor(
1166	unsigned int x_texels,
1167	unsigned int y_texels,
1168	unsigned int z_texels,
1169	bool can_omit_modes,
1170	unsigned int partition_count_cutoff,
1171	float mode_cutoff,
1172	block_size_descriptor& bsd
1173	) {
1174	if (z_texels > `1`)
1175	{
1176	construct_block_size_descriptor_3d(x_texels, y_texels, z_texels, bsd);
1177	}
1178	else
1179	{
1180	construct_block_size_descriptor_2d(x_texels, y_texels, can_omit_modes, mode_cutoff, bsd);
1181	}
1182
1183	init_partition_tables(bsd, can_omit_modes, partition_count_cutoff);
1184	}
1185

Browse the source code of Godot/thirdparty/astcenc/astcenc_block_sizes.cpp