// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

#if !defined(ASTCENC_DECOMPRESS_ONLY)

/**
 * @brief Functions to calculate variance per component in an NxN footprint.
 *
 * We need N to be parametric, so the routine below uses summed-area tables in order to execute in
 * O(1) time independent of how big N is.
 *
 * The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first
 * perform a binary reduction, and then distributes the results. This method means that there is no
 * serial dependency between a given element and the next one, and also significantly improves
 * numerical stability, allowing us to use floats rather than doubles.
 */
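
// As a reference for the summed-area lookups later in this file: once a table
// S has been built, the sum over any axis-aligned region can be recovered from
// a fixed number of corner reads. For a 2D region with exclusive low corner
// (x_low, y_low) and inclusive high corner (x_high, y_high) the sum is:
//
//     S(x_high, y_high) - S(x_low, y_high) - S(x_high, y_low) + S(x_low, y_low)
//
// The 3D case applies the same inclusion-exclusion pattern to eight corners,
// subtracting the z_low box from the z_high box.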

#include "astcenc_internal.h"

#include <cassert>

/**
 * @brief Generate a prefix-sum array using the Brent-Kung algorithm.
 *
 * This will take an input array of the form:
 *     v0, v1, v2, ...
 * ... and modify in-place to turn it into a prefix-sum array of the form:
 *     v0, v0+v1, v0+v1+v2, ...
 *
 * @param d          The array to prefix-sum.
 * @param items      The number of items in the array.
 * @param stride     The item spacing in the array; i.e. dense arrays should use 1.
 */
static void brent_kung_prefix_sum(
	vfloat4* d,
	size_t items,
	int stride
) {
	if (items < 2)
		return;

	size_t lc_stride = 2;
	size_t log2_stride = 1;

	// The reduction-tree loop
	do {
		size_t step = lc_stride >> 1;
		size_t start = lc_stride - 1;
		size_t iters = items >> log2_stride;

		vfloat4 *da = d + (start * stride);
		ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
		size_t ofs_stride = stride << log2_stride;

		while (iters)
		{
			*da = *da + da[ofs];
			da += ofs_stride;
			iters--;
		}

		log2_stride += 1;
		lc_stride <<= 1;
	} while (lc_stride <= items);

	// The expansion-tree loop
	do {
		log2_stride -= 1;
		lc_stride >>= 1;

		size_t step = lc_stride >> 1;
		size_t start = step + lc_stride - 1;
		size_t iters = (items - step) >> log2_stride;

		vfloat4 *da = d + (start * stride);
		ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
		size_t ofs_stride = stride << log2_stride;

		while (iters)
		{
			*da = *da + da[ofs];
			da += ofs_stride;
			iters--;
		}
	} while (lc_stride > 2);
}
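
// Worked example of the prefix sum above for a dense array of four values
// (stride 1), tracking a single lane for clarity:
//
//     input:             [ 1,  2,  3,  4 ]
//     reduction pass 1:   [ 1,  3,  3,  7 ]   (d[1] += d[0]; d[3] += d[2])
//     reduction pass 2:   [ 1,  3,  3, 10 ]   (d[3] += d[1])
//     expansion pass:     [ 1,  3,  6, 10 ]   (d[2] += d[1])
//
// A minimal usage sketch along the same lines, assuming a small scratch array:
//
//     vfloat4 data[4] {
//         vfloat4(1.0f, 1.0f, 1.0f, 1.0f), vfloat4(2.0f, 2.0f, 2.0f, 2.0f),
//         vfloat4(3.0f, 3.0f, 3.0f, 3.0f), vfloat4(4.0f, 4.0f, 4.0f, 4.0f)
//     };
//     brent_kung_prefix_sum(data, 4, 1);
//     // data now holds the running sums {1, 3, 6, 10} in every lane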

/* See header for documentation. */
void compute_pixel_region_variance(
	astcenc_contexti& ctx,
	const pixel_region_args& arg
) {
	// Unpack the memory structure into local variables
	const astcenc_image* img = arg.img;
	astcenc_swizzle swz = arg.swz;
	bool have_z = arg.have_z;

	int size_x = arg.size_x;
	int size_y = arg.size_y;
	int size_z = arg.size_z;

	int offset_x = arg.offset_x;
	int offset_y = arg.offset_y;
	int offset_z = arg.offset_z;

	int alpha_kernel_radius = arg.alpha_kernel_radius;

	float* input_alpha_averages = ctx.input_alpha_averages;
	vfloat4* work_memory = arg.work_memory;

	// Compute memory sizes and dimensions that we need
	int kernel_radius = alpha_kernel_radius;
	int kerneldim = 2 * kernel_radius + 1;
	int kernel_radius_xy = kernel_radius;
	int kernel_radius_z = have_z ? kernel_radius : 0;

	int padsize_x = size_x + kerneldim;
	int padsize_y = size_y + kerneldim;
	int padsize_z = size_z + (have_z ? kerneldim : 0);
	int sizeprod = padsize_x * padsize_y * padsize_z;

	int zd_start = have_z ? 1 : 0;

	vfloat4 *varbuf1 = work_memory;
	vfloat4 *varbuf2 = work_memory + sizeprod;

	// Scaling factors to apply to Y and Z for accesses into the work buffers
	int yst = padsize_x;
	int zst = padsize_x * padsize_y;

	// Scaling factors to apply to Y and Z for accesses into result buffers
	int ydt = img->dim_x;
	int zdt = img->dim_x * img->dim_y;
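
	// As a worked example of the sizing above: a 2D region of 32x32 texels with
	// an alpha_kernel_radius of 2 has kerneldim = 5, giving a 37x37 padded
	// working buffer - the two-texel kernel apron on each side plus the leading
	// row and column of zeros added below to form the summed-area table edge.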

	// Macros to act as accessor functions for the work-memory
#define VARBUF1(z, y, x) varbuf1[(z) * zst + (y) * yst + (x)]
#define VARBUF2(z, y, x) varbuf2[(z) * zst + (y) * yst + (x)]

	// Load N and N^2 values into the work buffers
	if (img->data_type == ASTCENC_TYPE_U8)
	{
		// Swizzle data structure 4 = ZERO, 5 = ONE
		uint8_t data[6];
		data[ASTCENC_SWZ_0] = 0;
		data[ASTCENC_SWZ_1] = 255;

		for (int z = zd_start; z < padsize_z; z++)
		{
			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
			uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);

			for (int y = 1; y < padsize_y; y++)
			{
				int y_src = (y - 1) + offset_y - kernel_radius_xy;
				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

				for (int x = 1; x < padsize_x; x++)
				{
					int x_src = (x - 1) + offset_x - kernel_radius_xy;
					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

					data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src    )];
					data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
					data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
					data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

					uint8_t r = data[swz.r];
					uint8_t g = data[swz.g];
					uint8_t b = data[swz.b];
					uint8_t a = data[swz.a];

					vfloat4 d = vfloat4 (r * (1.0f / 255.0f),
					                     g * (1.0f / 255.0f),
					                     b * (1.0f / 255.0f),
					                     a * (1.0f / 255.0f));

					VARBUF1(z, y, x) = d;
					VARBUF2(z, y, x) = d * d;
				}
			}
		}
	}
	else if (img->data_type == ASTCENC_TYPE_F16)
	{
		// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
		uint16_t data[6];
		data[ASTCENC_SWZ_0] = 0;
		data[ASTCENC_SWZ_1] = 0x3C00;

		for (int z = zd_start; z < padsize_z; z++)
		{
			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
			uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);

			for (int y = 1; y < padsize_y; y++)
			{
				int y_src = (y - 1) + offset_y - kernel_radius_xy;
				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

				for (int x = 1; x < padsize_x; x++)
				{
					int x_src = (x - 1) + offset_x - kernel_radius_xy;
					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

					data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src    )];
					data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
					data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
					data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

					vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
					vfloat4 d = float16_to_float(di);

					VARBUF1(z, y, x) = d;
					VARBUF2(z, y, x) = d * d;
				}
			}
		}
	}
	else // if (img->data_type == ASTCENC_TYPE_F32)
	{
		assert(img->data_type == ASTCENC_TYPE_F32);

		// Swizzle data structure 4 = ZERO, 5 = ONE (in FP32)
		float data[6];
		data[ASTCENC_SWZ_0] = 0.0f;
		data[ASTCENC_SWZ_1] = 1.0f;

		for (int z = zd_start; z < padsize_z; z++)
		{
			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
			float* data32 = static_cast<float*>(img->data[z_src]);

			for (int y = 1; y < padsize_y; y++)
			{
				int y_src = (y - 1) + offset_y - kernel_radius_xy;
				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

				for (int x = 1; x < padsize_x; x++)
				{
					int x_src = (x - 1) + offset_x - kernel_radius_xy;
					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

					data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src    )];
					data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
					data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
					data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

					float r = data[swz.r];
					float g = data[swz.g];
					float b = data[swz.b];
					float a = data[swz.a];

					vfloat4 d(r, g, b, a);

					VARBUF1(z, y, x) = d;
					VARBUF2(z, y, x) = d * d;
				}
			}
		}
	}

	// Pad with an extra layer of 0s; this forms the edge of the SAT tables
	vfloat4 vbz = vfloat4::zero();
	for (int z = 0; z < padsize_z; z++)
	{
		for (int y = 0; y < padsize_y; y++)
		{
			VARBUF1(z, y, 0) = vbz;
			VARBUF2(z, y, 0) = vbz;
		}

		for (int x = 0; x < padsize_x; x++)
		{
			VARBUF1(z, 0, x) = vbz;
			VARBUF2(z, 0, x) = vbz;
		}
	}

	if (have_z)
	{
		for (int y = 0; y < padsize_y; y++)
		{
			for (int x = 0; x < padsize_x; x++)
			{
				VARBUF1(0, y, x) = vbz;
				VARBUF2(0, y, x) = vbz;
			}
		}
	}
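
	// Because of this zero border the prefix sums below start at index 1 and
	// leave index 0 at zero, so box sums that touch the region edge need no
	// special-case handling in the lookups.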

	// Generate summed-area tables for N and N^2; this is done in-place, using
	// a Brent-Kung parallel-prefix based algorithm to minimize precision loss
	for (int z = zd_start; z < padsize_z; z++)
	{
		for (int y = 1; y < padsize_y; y++)
		{
			brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
			brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
		}
	}

	for (int z = zd_start; z < padsize_z; z++)
	{
		for (int x = 1; x < padsize_x; x++)
		{
			brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
			brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
		}
	}

	if (have_z)
	{
		for (int y = 1; y < padsize_y; y++)
		{
			for (int x = 1; x < padsize_x; x++)
			{
				brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
				brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
			}
		}
	}
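
	// After the X pass each row holds running sums along X; the Y pass then
	// accumulates those down each column, and the Z pass (3D images only)
	// accumulates across slices. Each element now holds the sum of all values
	// with lower-or-equal coordinates, i.e. a summed-area table.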

	// Compute a few constants used in the average calculation
	float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);
	float alpha_rsamples;

	if (have_z)
	{
		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);
	}
	else
	{
		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);
	}
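
	// For example, an alpha_kernel_radius of 2 covers a 5x5 = 25 texel
	// footprint in 2D (5x5x5 = 125 in 3D), so alpha_rsamples is 1/25 (or 1/125)
	// and converts the box sums below into averages.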

	// Use the summed-area tables to compute the alpha average for each neighborhood
	if (have_z)
	{
		for (int z = 0; z < size_z; z++)
		{
			int z_src = z + kernel_radius_z;
			int z_dst = z + offset_z;
			int z_low = z_src - alpha_kernel_radius;
			int z_high = z_src + alpha_kernel_radius + 1;

			for (int y = 0; y < size_y; y++)
			{
				int y_src = y + kernel_radius_xy;
				int y_dst = y + offset_y;
				int y_low = y_src - alpha_kernel_radius;
				int y_high = y_src + alpha_kernel_radius + 1;

				for (int x = 0; x < size_x; x++)
				{
					int x_src = x + kernel_radius_xy;
					int x_dst = x + offset_x;
					int x_low = x_src - alpha_kernel_radius;
					int x_high = x_src + alpha_kernel_radius + 1;

					// Summed-area table lookups for alpha average
					float vasum = ( VARBUF1(z_high, y_low, x_low).lane<3>()
					              - VARBUF1(z_high, y_low, x_high).lane<3>()
					              - VARBUF1(z_high, y_high, x_low).lane<3>()
					              + VARBUF1(z_high, y_high, x_high).lane<3>()) -
					              ( VARBUF1(z_low, y_low, x_low).lane<3>()
					              - VARBUF1(z_low, y_low, x_high).lane<3>()
					              - VARBUF1(z_low, y_high, x_low).lane<3>()
					              + VARBUF1(z_low, y_high, x_high).lane<3>());

					int out_index = z_dst * zdt + y_dst * ydt + x_dst;
					input_alpha_averages[out_index] = (vasum * alpha_rsamples);
				}
			}
		}
	}
	else
	{
		for (int y = 0; y < size_y; y++)
		{
			int y_src = y + kernel_radius_xy;
			int y_dst = y + offset_y;
			int y_low = y_src - alpha_kernel_radius;
			int y_high = y_src + alpha_kernel_radius + 1;

			for (int x = 0; x < size_x; x++)
			{
				int x_src = x + kernel_radius_xy;
				int x_dst = x + offset_x;
				int x_low = x_src - alpha_kernel_radius;
				int x_high = x_src + alpha_kernel_radius + 1;

				// Summed-area table lookups for alpha average
				float vasum = VARBUF1(0, y_low, x_low).lane<3>()
				            - VARBUF1(0, y_low, x_high).lane<3>()
				            - VARBUF1(0, y_high, x_low).lane<3>()
				            + VARBUF1(0, y_high, x_high).lane<3>();

				int out_index = y_dst * ydt + x_dst;
				input_alpha_averages[out_index] = (vasum * alpha_rsamples);
			}
		}
	}
}

/* See header for documentation. */
unsigned int init_compute_averages(
	const astcenc_image& img,
	unsigned int alpha_kernel_radius,
	const astcenc_swizzle& swz,
	avg_args& ag
) {
	unsigned int size_x = img.dim_x;
	unsigned int size_y = img.dim_y;
	unsigned int size_z = img.dim_z;

	// Compute maximum block size and from that the working memory buffer size
	unsigned int kernel_radius = alpha_kernel_radius;
	unsigned int kerneldim = 2 * kernel_radius + 1;

	bool have_z = (size_z > 1);
	unsigned int max_blk_size_xy = have_z ? 16 : 32;
	unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u);

	unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;
	unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);

	// Perform block-wise average calculations across the image
	// Initialize fields which are not populated until later
	ag.arg.size_x = 0;
	ag.arg.size_y = 0;
	ag.arg.size_z = 0;
	ag.arg.offset_x = 0;
	ag.arg.offset_y = 0;
	ag.arg.offset_z = 0;
	ag.arg.work_memory = nullptr;

	ag.arg.img = &img;
	ag.arg.swz = swz;
	ag.arg.have_z = have_z;
	ag.arg.alpha_kernel_radius = alpha_kernel_radius;

	ag.img_size_x = size_x;
	ag.img_size_y = size_y;
	ag.img_size_z = size_z;
	ag.blk_size_xy = max_blk_size_xy;
	ag.blk_size_z = max_blk_size_z;
	ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z;

	// The parallel task count
	unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z;
	unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy;
	return z_tasks * y_tasks;
}
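
// As a worked example of the sizing above, assuming a 2D 256x256 image with an
// alpha_kernel_radius of 2: kerneldim is 5 and the maximum block is 32x32, so
// max_padsize_xy is 37 and work_memory_size is 2 * 37 * 37 * 1 = 2738 vfloat4
// entries per task. The image is split into ceil(256 / 32) = 8 row bands in a
// single Z slice, giving 8 parallel tasks.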

#endif