// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

#if !defined(ASTCENC_DECOMPRESS_ONLY)

/**
 * @brief Functions to calculate variance per component in an NxN footprint.
 *
 * We need N to be parametric, so the routines below use summed-area tables in order to execute in
 * O(1) time, independent of how big N is.
 *
 * The summation uses a Brent-Kung parallel-prefix adder. This uses a prefix tree to first perform
 * a binary reduction, and then distributes the results. This method means that there is no serial
 * dependency between a given element and the next one, and it also significantly improves
 * numerical stability, allowing us to use floats rather than doubles.
 */
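
/*
 * Worked example of the O(1) window lookup this enables: if S is the 2D
 * summed-area table of v, with S(y, x) holding the sum of every v(y', x')
 * for y' <= y and x' <= x, then the sum over a window with exclusive low
 * corner (y0, x0) and inclusive high corner (y1, x1) needs only four
 * lookups, whatever the window size:
 *
 *     sum = S(y1, x1) - S(y0, x1) - S(y1, x0) + S(y0, x0)
 *
 * The averaging kernels below use this identity (and its eight-corner 3D
 * extension), multiplying by the reciprocal sample count to get the mean.
 */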

#include "astcenc_internal.h"

#include <cassert>

/**
 * @brief Generate a prefix-sum array using the Brent-Kung algorithm.
 *
 * This will take an input array of the form:
 *     v0, v1, v2, ...
 * ... and modify it in place to produce a prefix-sum array of the form:
 *     v0, v0+v1, v0+v1+v2, ...
 *
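 * As a concrete trace for items = 4 and stride = 1, starting from
 * [a, b, c, d]: the reduction passes produce [a, a+b, c, c+d] and then
 * [a, a+b, c, a+b+c+d], and the expansion pass fills in the remaining
 * element to give [a, a+b, a+b+c, a+b+c+d]. Each pass combines only
 * independent pairs, which is what removes the serial carry chain.
 *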
 * @param d        The array to prefix-sum.
 * @param items    The number of items in the array.
 * @param stride   The item spacing in the array; i.e. dense arrays should use 1.
 */
static void brent_kung_prefix_sum(
    vfloat4* d,
    size_t items,
    int stride
) {
    if (items < 2)
    {
        return;
    }

    size_t lc_stride = 2;
    size_t log2_stride = 1;

    // The reduction-tree loop
    do {
        size_t step = lc_stride >> 1;
        size_t start = lc_stride - 1;
        size_t iters = items >> log2_stride;

        vfloat4 *da = d + (start * stride);
        ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
        size_t ofs_stride = stride << log2_stride;

        while (iters)
        {
            *da = *da + da[ofs];
            da += ofs_stride;
            iters--;
        }

        log2_stride += 1;
        lc_stride <<= 1;
    } while (lc_stride <= items);

    // The expansion-tree loop
    do {
        log2_stride -= 1;
        lc_stride >>= 1;

        size_t step = lc_stride >> 1;
        size_t start = step + lc_stride - 1;
        size_t iters = (items - step) >> log2_stride;

        vfloat4 *da = d + (start * stride);
        ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
        size_t ofs_stride = stride << log2_stride;

        while (iters)
        {
            *da = *da + da[ofs];
            da += ofs_stride;
            iters--;
        }
    } while (lc_stride > 2);
}
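
#if 0
// Reference-only sketch, not compiled into the codec: the same inclusive
// prefix sum written as the obvious serial loop. This is what the function
// above computes, but the serial form has a loop-carried dependency and its
// rounding error grows linearly with the element count, which is why the
// tree-based version is used instead.
static void serial_prefix_sum(
    vfloat4* d,
    size_t items,
    int stride
) {
    for (size_t i = 1; i < items; i++)
    {
        d[i * stride] = d[i * stride] + d[(i - 1) * stride];
    }
}
#endif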

/* See header for documentation. */
void compute_pixel_region_variance(
    astcenc_contexti& ctx,
    const pixel_region_args& arg
) {
    // Unpack the memory structure into local variables
    const astcenc_image* img = arg.img;
    astcenc_swizzle swz = arg.swz;
    bool have_z = arg.have_z;

    int size_x = arg.size_x;
    int size_y = arg.size_y;
    int size_z = arg.size_z;

    int offset_x = arg.offset_x;
    int offset_y = arg.offset_y;
    int offset_z = arg.offset_z;

    int alpha_kernel_radius = arg.alpha_kernel_radius;

    float* input_alpha_averages = ctx.input_alpha_averages;
    vfloat4* work_memory = arg.work_memory;

    // Compute memory sizes and dimensions that we need
    int kernel_radius = alpha_kernel_radius;
    int kerneldim = 2 * kernel_radius + 1;
    int kernel_radius_xy = kernel_radius;
    int kernel_radius_z = have_z ? kernel_radius : 0;

    int padsize_x = size_x + kerneldim;
    int padsize_y = size_y + kerneldim;
    int padsize_z = size_z + (have_z ? kerneldim : 0);
    int sizeprod = padsize_x * padsize_y * padsize_z;

    int zd_start = have_z ? 1 : 0;

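    // varbuf1 will become the SAT of the values and varbuf2 the SAT of their
    // squares; at load time they hold the raw v and v * v per texel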
    vfloat4 *varbuf1 = work_memory;
    vfloat4 *varbuf2 = work_memory + sizeprod;

    // Scaling factors to apply to Y and Z for accesses into the work buffers
    int yst = padsize_x;
    int zst = padsize_x * padsize_y;

    // Scaling factors to apply to Y and Z for accesses into result buffers
    int ydt = img->dim_x;
    int zdt = img->dim_x * img->dim_y;

    // Macros to act as accessor functions for the work-memory
#define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x]
#define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x]

    // Load the v and v^2 values into the work buffers
    if (img->data_type == ASTCENC_TYPE_U8)
    {
        // Swizzle data structure: 4 = ZERO, 5 = ONE
        uint8_t data[6];
        data[ASTCENC_SWZ_0] = 0;
        data[ASTCENC_SWZ_1] = 255;

        for (int z = zd_start; z < padsize_z; z++)
        {
            int z_src = (z - zd_start) + offset_z - kernel_radius_z;
            z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
            uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);

            for (int y = 1; y < padsize_y; y++)
            {
                int y_src = (y - 1) + offset_y - kernel_radius_xy;
                y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

                for (int x = 1; x < padsize_x; x++)
                {
                    int x_src = (x - 1) + offset_x - kernel_radius_xy;
                    x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

                    data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src    )];
                    data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
                    data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
                    data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

                    uint8_t r = data[swz.r];
                    uint8_t g = data[swz.g];
                    uint8_t b = data[swz.b];
                    uint8_t a = data[swz.a];

                    vfloat4 d(r * (1.0f / 255.0f),
                              g * (1.0f / 255.0f),
                              b * (1.0f / 255.0f),
                              a * (1.0f / 255.0f));

                    VARBUF1(z, y, x) = d;
                    VARBUF2(z, y, x) = d * d;
                }
            }
        }
    }
    else if (img->data_type == ASTCENC_TYPE_F16)
    {
        // Swizzle data structure: 4 = ZERO, 5 = ONE (in FP16)
        uint16_t data[6];
        data[ASTCENC_SWZ_0] = 0;
        data[ASTCENC_SWZ_1] = 0x3C00;

        for (int z = zd_start; z < padsize_z; z++)
        {
            int z_src = (z - zd_start) + offset_z - kernel_radius_z;
            z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
            uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);

            for (int y = 1; y < padsize_y; y++)
            {
                int y_src = (y - 1) + offset_y - kernel_radius_xy;
                y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

                for (int x = 1; x < padsize_x; x++)
                {
                    int x_src = (x - 1) + offset_x - kernel_radius_xy;
                    x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

                    data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src    )];
                    data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
                    data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
                    data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

                    vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
                    vfloat4 d = float16_to_float(di);

                    VARBUF1(z, y, x) = d;
                    VARBUF2(z, y, x) = d * d;
                }
            }
        }
    }
    else // if (img->data_type == ASTCENC_TYPE_F32)
    {
        assert(img->data_type == ASTCENC_TYPE_F32);

        // Swizzle data structure: 4 = ZERO, 5 = ONE (in FP32)
        float data[6];
        data[ASTCENC_SWZ_0] = 0.0f;
        data[ASTCENC_SWZ_1] = 1.0f;

        for (int z = zd_start; z < padsize_z; z++)
        {
            int z_src = (z - zd_start) + offset_z - kernel_radius_z;
            z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
            float* data32 = static_cast<float*>(img->data[z_src]);

            for (int y = 1; y < padsize_y; y++)
            {
                int y_src = (y - 1) + offset_y - kernel_radius_xy;
                y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

                for (int x = 1; x < padsize_x; x++)
                {
                    int x_src = (x - 1) + offset_x - kernel_radius_xy;
                    x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

                    data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src    )];
                    data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
                    data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
                    data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

                    float r = data[swz.r];
                    float g = data[swz.g];
                    float b = data[swz.b];
                    float a = data[swz.a];

                    vfloat4 d(r, g, b, a);

                    VARBUF1(z, y, x) = d;
                    VARBUF2(z, y, x) = d * d;
                }
            }
        }
    }

    // Pad with an extra layer of zeros; this forms the edge of the SATs
    vfloat4 vbz = vfloat4::zero();
    for (int z = 0; z < padsize_z; z++)
    {
        for (int y = 0; y < padsize_y; y++)
        {
            VARBUF1(z, y, 0) = vbz;
            VARBUF2(z, y, 0) = vbz;
        }

        for (int x = 0; x < padsize_x; x++)
        {
            VARBUF1(z, 0, x) = vbz;
            VARBUF2(z, 0, x) = vbz;
        }
    }

    if (have_z)
    {
        for (int y = 0; y < padsize_y; y++)
        {
            for (int x = 0; x < padsize_x; x++)
            {
                VARBUF1(0, y, x) = vbz;
                VARBUF2(0, y, x) = vbz;
            }
        }
    }

    // Generate summed-area tables for v and v^2; this is done in place, using
    // a Brent-Kung parallel-prefix-based algorithm to minimize precision loss
    for (int z = zd_start; z < padsize_z; z++)
    {
        for (int y = 1; y < padsize_y; y++)
        {
            brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
            brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
        }
    }

    for (int z = zd_start; z < padsize_z; z++)
    {
        for (int x = 1; x < padsize_x; x++)
        {
            brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
            brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
        }
    }

    if (have_z)
    {
        for (int y = 1; y < padsize_y; y++)
        {
            for (int x = 1; x < padsize_x; x++)
            {
                brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
                brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
            }
        }
    }
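
    // The separable 1D passes compose, so each SAT entry now holds the
    // inclusive sum over the whole axis-aligned region behind it, which is
    // what makes the O(1) window lookups below possible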

    // Compute a few constants used in the average calculation
    float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);
    float alpha_rsamples;

    if (have_z)
    {
        alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);
    }
    else
    {
        alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);
    }
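
    // For example, alpha_kernel_radius = 2 gives a 5x5 footprint in 2D mode
    // (25 samples, so alpha_rsamples = 1/25), or a 5x5x5 footprint in 3D mode
    // (125 samples)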

    // Use the summed-area tables to compute the alpha average for each neighborhood
    if (have_z)
    {
        for (int z = 0; z < size_z; z++)
        {
            int z_src = z + kernel_radius_z;
            int z_dst = z + offset_z;
            int z_low = z_src - alpha_kernel_radius;
            int z_high = z_src + alpha_kernel_radius + 1;

            for (int y = 0; y < size_y; y++)
            {
                int y_src = y + kernel_radius_xy;
                int y_dst = y + offset_y;
                int y_low = y_src - alpha_kernel_radius;
                int y_high = y_src + alpha_kernel_radius + 1;

                for (int x = 0; x < size_x; x++)
                {
                    int x_src = x + kernel_radius_xy;
                    int x_dst = x + offset_x;
                    int x_low = x_src - alpha_kernel_radius;
                    int x_high = x_src + alpha_kernel_radius + 1;

                    // Summed-area table lookups for alpha average
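                    // (inclusion-exclusion over the 8 corners of the box; the
                    // second bracket removes the partial sums for the volume
                    // in front of the z_low plane)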
                    float vasum = ( VARBUF1(z_high, y_low, x_low).lane<3>()
                                  - VARBUF1(z_high, y_low, x_high).lane<3>()
                                  - VARBUF1(z_high, y_high, x_low).lane<3>()
                                  + VARBUF1(z_high, y_high, x_high).lane<3>()) -
                                  ( VARBUF1(z_low, y_low, x_low).lane<3>()
                                  - VARBUF1(z_low, y_low, x_high).lane<3>()
                                  - VARBUF1(z_low, y_high, x_low).lane<3>()
                                  + VARBUF1(z_low, y_high, x_high).lane<3>());

                    int out_index = z_dst * zdt + y_dst * ydt + x_dst;
                    input_alpha_averages[out_index] = (vasum * alpha_rsamples);
                }
            }
        }
    }
    else
    {
        for (int y = 0; y < size_y; y++)
        {
            int y_src = y + kernel_radius_xy;
            int y_dst = y + offset_y;
            int y_low = y_src - alpha_kernel_radius;
            int y_high = y_src + alpha_kernel_radius + 1;

            for (int x = 0; x < size_x; x++)
            {
                int x_src = x + kernel_radius_xy;
                int x_dst = x + offset_x;
                int x_low = x_src - alpha_kernel_radius;
                int x_high = x_src + alpha_kernel_radius + 1;

                // Summed-area table lookups for alpha average
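                // (2D inclusion-exclusion over the 4 corners of the window)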
                float vasum = VARBUF1(0, y_low, x_low).lane<3>()
                            - VARBUF1(0, y_low, x_high).lane<3>()
                            - VARBUF1(0, y_high, x_low).lane<3>()
                            + VARBUF1(0, y_high, x_high).lane<3>();

                int out_index = y_dst * ydt + x_dst;
                input_alpha_averages[out_index] = (vasum * alpha_rsamples);
            }
        }
    }
}

/* See header for documentation. */
unsigned int init_compute_averages(
    const astcenc_image& img,
    unsigned int alpha_kernel_radius,
    const astcenc_swizzle& swz,
    avg_args& ag
) {
    unsigned int size_x = img.dim_x;
    unsigned int size_y = img.dim_y;
    unsigned int size_z = img.dim_z;

    // Compute the maximum block size, and from that the working memory buffer size
    unsigned int kernel_radius = alpha_kernel_radius;
    unsigned int kerneldim = 2 * kernel_radius + 1;

    bool have_z = (size_z > 1);
    unsigned int max_blk_size_xy = have_z ? 16 : 32;
    unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u);

    unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;
    unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);

    // Perform block-wise average calculations across the image
    // Initialize fields which are not populated until later
    ag.arg.size_x = 0;
    ag.arg.size_y = 0;
    ag.arg.size_z = 0;
    ag.arg.offset_x = 0;
    ag.arg.offset_y = 0;
    ag.arg.offset_z = 0;
    ag.arg.work_memory = nullptr;

    ag.arg.img = &img;
    ag.arg.swz = swz;
    ag.arg.have_z = have_z;
    ag.arg.alpha_kernel_radius = alpha_kernel_radius;

    ag.img_size_x = size_x;
    ag.img_size_y = size_y;
    ag.img_size_z = size_z;
    ag.blk_size_xy = max_blk_size_xy;
    ag.blk_size_z = max_blk_size_z;
    ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z;
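
    // For example, a 2D image with alpha_kernel_radius = 2 uses 32x32 blocks,
    // giving max_padsize_xy = 37 and a work buffer of 2 * 37 * 37 * 1 vfloat4
    // entries; the factor of two covers the v and v^2 SATs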

    // The parallel task count
    unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z;
    unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy;
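
    // For example, a 256x256 2D image with 32x32 blocks gives z_tasks = 1 and
    // y_tasks = 8, so eight independently schedulable tasks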
    return z_tasks * y_tasks;
}

#endif // !defined(ASTCENC_DECOMPRESS_ONLY)