// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

#if !defined(ASTCENC_DECOMPRESS_ONLY)

/**
 * @brief Functions to calculate variance per component in an NxN footprint.
 *
 * We need N to be parametric, so the routines below use summed-area tables in order to execute in
 * O(1) time, independent of how big N is.
 *
 * The summation uses a Brent-Kung parallel-prefix adder. This uses a prefix tree to first perform
 * a binary reduction, and then distributes the results. This method means that there is no serial
 * dependency between a given element and the next one, and it also significantly improves
 * numerical stability, allowing us to use floats rather than doubles.
 */
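
/*
 * Worked example of the O(1) window lookup this enables: if S is the 2D
 * summed-area table of v, with S(y, x) holding the sum of every v(y', x')
 * for y' <= y and x' <= x, then the sum over a window with exclusive low
 * corner (y0, x0) and inclusive high corner (y1, x1) needs only four
 * lookups, whatever the window size:
 *
 *     sum = S(y1, x1) - S(y0, x1) - S(y1, x0) + S(y0, x0)
 *
 * The averaging kernels below use this identity (and its eight-corner 3D
 * extension), multiplying by the reciprocal sample count to get the mean.
 */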

#include "astcenc_internal.h"

#include <cassert>

/**
 * @brief Generate a prefix-sum array using the Brent-Kung algorithm.
 *
 * This will take an input array of the form:
 *     v0, v1, v2, ...
 * ... and modify it in place to produce a prefix-sum array of the form:
 *     v0, v0+v1, v0+v1+v2, ...
 *
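 * As a concrete trace for items = 4 and stride = 1, starting from
 * [a, b, c, d]: the reduction passes produce [a, a+b, c, c+d] and then
 * [a, a+b, c, a+b+c+d], and the expansion pass fills in the remaining
 * element to give [a, a+b, a+b+c, a+b+c+d]. Each pass combines only
 * independent pairs, which is what removes the serial carry chain.
 *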
 * @param d        The array to prefix-sum.
 * @param items    The number of items in the array.
 * @param stride   The item spacing in the array; i.e. dense arrays should use 1.
 */
static void brent_kung_prefix_sum(
    vfloat4* d,
    size_t items,
    int stride
) {
    if (items < 2)
    {
        return;
    }

    size_t lc_stride = 2;
    size_t log2_stride = 1;

    // The reduction-tree loop
    do {
        size_t step = lc_stride >> 1;
        size_t start = lc_stride - 1;
        size_t iters = items >> log2_stride;

        vfloat4 *da = d + (start * stride);
        ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
        size_t ofs_stride = stride << log2_stride;

        while (iters)
        {
            *da = *da + da[ofs];
            da += ofs_stride;
            iters--;
        }

        log2_stride += 1;
        lc_stride <<= 1;
    } while (lc_stride <= items);

    // The expansion-tree loop
    do {
        log2_stride -= 1;
        lc_stride >>= 1;

        size_t step = lc_stride >> 1;
        size_t start = step + lc_stride - 1;
        size_t iters = (items - step) >> log2_stride;

        vfloat4 *da = d + (start * stride);
        ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
        size_t ofs_stride = stride << log2_stride;

        while (iters)
        {
            *da = *da + da[ofs];
            da += ofs_stride;
            iters--;
        }
    } while (lc_stride > 2);
}
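
#if 0
// Reference-only sketch, not compiled into the codec: the same inclusive
// prefix sum written as the obvious serial loop. This is what the function
// above computes, but the serial form has a loop-carried dependency and its
// rounding error grows linearly with the element count, which is why the
// tree-based version is used instead.
static void serial_prefix_sum(
    vfloat4* d,
    size_t items,
    int stride
) {
    for (size_t i = 1; i < items; i++)
    {
        d[i * stride] = d[i * stride] + d[(i - 1) * stride];
    }
}
#endif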

/* See header for documentation. */
void compute_pixel_region_variance(
    astcenc_contexti& ctx,
    const pixel_region_args& arg
) {
    // Unpack the memory structure into local variables
    const astcenc_image* img = arg.img;
    astcenc_swizzle swz = arg.swz;
    bool have_z = arg.have_z;

    int size_x = arg.size_x;
    int size_y = arg.size_y;
    int size_z = arg.size_z;

    int offset_x = arg.offset_x;
    int offset_y = arg.offset_y;
    int offset_z = arg.offset_z;

    int alpha_kernel_radius = arg.alpha_kernel_radius;

    float* input_alpha_averages = ctx.input_alpha_averages;
    vfloat4* work_memory = arg.work_memory;

    // Compute memory sizes and dimensions that we need
    int kernel_radius = alpha_kernel_radius;
    int kerneldim = 2 * kernel_radius + 1;
    int kernel_radius_xy = kernel_radius;
    int kernel_radius_z = have_z ? kernel_radius : 0;

    int padsize_x = size_x + kerneldim;
    int padsize_y = size_y + kerneldim;
    int padsize_z = size_z + (have_z ? kerneldim : 0);
    int sizeprod = padsize_x * padsize_y * padsize_z;

    int zd_start = have_z ? 1 : 0;

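    // varbuf1 will become the SAT of the values and varbuf2 the SAT of their
    // squares; at load time they hold the raw v and v * v per texel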
    vfloat4 *varbuf1 = work_memory;
    vfloat4 *varbuf2 = work_memory + sizeprod;

    // Scaling factors to apply to Y and Z for accesses into the work buffers
    int yst = padsize_x;
    int zst = padsize_x * padsize_y;

    // Scaling factors to apply to Y and Z for accesses into result buffers
    int ydt = img->dim_x;
    int zdt = img->dim_x * img->dim_y;

    // Macros to act as accessor functions for the work-memory
#define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x]
#define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x]

    // Load the v and v^2 values into the work buffers
    if (img->data_type == ASTCENC_TYPE_U8)
    {
        // Swizzle data structure: 4 = ZERO, 5 = ONE
        uint8_t data[6];
        data[ASTCENC_SWZ_0] = 0;
        data[ASTCENC_SWZ_1] = 255;

        for (int z = zd_start; z < padsize_z; z++)
        {
            int z_src = (z - zd_start) + offset_z - kernel_radius_z;
            z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
            uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);

            for (int y = 1; y < padsize_y; y++)
            {
                int y_src = (y - 1) + offset_y - kernel_radius_xy;
                y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

                for (int x = 1; x < padsize_x; x++)
                {
                    int x_src = (x - 1) + offset_x - kernel_radius_xy;
                    x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

                    data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src    )];
                    data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
                    data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
                    data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

                    uint8_t r = data[swz.r];
                    uint8_t g = data[swz.g];
                    uint8_t b = data[swz.b];
                    uint8_t a = data[swz.a];

                    vfloat4 d(r * (1.0f / 255.0f),
                              g * (1.0f / 255.0f),
                              b * (1.0f / 255.0f),
                              a * (1.0f / 255.0f));

                    VARBUF1(z, y, x) = d;
                    VARBUF2(z, y, x) = d * d;
                }
            }
        }
    }
    else if (img->data_type == ASTCENC_TYPE_F16)
    {
        // Swizzle data structure: 4 = ZERO, 5 = ONE (in FP16)
        uint16_t data[6];
        data[ASTCENC_SWZ_0] = 0;
        data[ASTCENC_SWZ_1] = 0x3C00;

        for (int z = zd_start; z < padsize_z; z++)
        {
            int z_src = (z - zd_start) + offset_z - kernel_radius_z;
            z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
            uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);

            for (int y = 1; y < padsize_y; y++)
            {
                int y_src = (y - 1) + offset_y - kernel_radius_xy;
                y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

                for (int x = 1; x < padsize_x; x++)
                {
                    int x_src = (x - 1) + offset_x - kernel_radius_xy;
                    x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

                    data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src    )];
                    data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
                    data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
                    data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

                    vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
                    vfloat4 d = float16_to_float(di);

                    VARBUF1(z, y, x) = d;
                    VARBUF2(z, y, x) = d * d;
                }
            }
        }
    }
    else // if (img->data_type == ASTCENC_TYPE_F32)
    {
        assert(img->data_type == ASTCENC_TYPE_F32);

        // Swizzle data structure: 4 = ZERO, 5 = ONE (in FP32)
        float data[6];
        data[ASTCENC_SWZ_0] = 0.0f;
        data[ASTCENC_SWZ_1] = 1.0f;

        for (int z = zd_start; z < padsize_z; z++)
        {
            int z_src = (z - zd_start) + offset_z - kernel_radius_z;
            z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
            float* data32 = static_cast<float*>(img->data[z_src]);

            for (int y = 1; y < padsize_y; y++)
            {
                int y_src = (y - 1) + offset_y - kernel_radius_xy;
                y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

                for (int x = 1; x < padsize_x; x++)
                {
                    int x_src = (x - 1) + offset_x - kernel_radius_xy;
                    x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

                    data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src    )];
                    data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
                    data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
                    data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

                    float r = data[swz.r];
                    float g = data[swz.g];
                    float b = data[swz.b];
                    float a = data[swz.a];

                    vfloat4 d(r, g, b, a);

                    VARBUF1(z, y, x) = d;
                    VARBUF2(z, y, x) = d * d;
                }
            }
        }
    }

    // Pad with an extra layer of zeros; this forms the edge of the SATs
    vfloat4 vbz = vfloat4::zero();
    for (int z = 0; z < padsize_z; z++)
    {
        for (int y = 0; y < padsize_y; y++)
        {
            VARBUF1(z, y, 0) = vbz;
            VARBUF2(z, y, 0) = vbz;
        }

        for (int x = 0; x < padsize_x; x++)
        {
            VARBUF1(z, 0, x) = vbz;
            VARBUF2(z, 0, x) = vbz;
        }
    }

    if (have_z)
    {
        for (int y = 0; y < padsize_y; y++)
        {
            for (int x = 0; x < padsize_x; x++)
            {
                VARBUF1(0, y, x) = vbz;
                VARBUF2(0, y, x) = vbz;
            }
        }
    }

    // Generate summed-area tables for v and v^2; this is done in place, using
    // a Brent-Kung parallel-prefix-based algorithm to minimize precision loss
    for (int z = zd_start; z < padsize_z; z++)
    {
        for (int y = 1; y < padsize_y; y++)
        {
            brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
            brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
        }
    }

    for (int z = zd_start; z < padsize_z; z++)
    {
        for (int x = 1; x < padsize_x; x++)
        {
            brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
            brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
        }
    }

    if (have_z)
    {
        for (int y = 1; y < padsize_y; y++)
        {
            for (int x = 1; x < padsize_x; x++)
            {
                brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
                brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
            }
        }
    }
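
    // The separable 1D passes compose, so each SAT entry now holds the
    // inclusive sum over the whole axis-aligned region behind it, which is
    // what makes the O(1) window lookups below possible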

    // Compute a few constants used in the average calculation
    float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);
    float alpha_rsamples;

    if (have_z)
    {
        alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);
    }
    else
    {
        alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);
    }
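
    // For example, alpha_kernel_radius = 2 gives a 5x5 footprint in 2D mode
    // (25 samples, so alpha_rsamples = 1/25), or a 5x5x5 footprint in 3D mode
    // (125 samples)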

    // Use the summed-area tables to compute the alpha average for each neighborhood
    if (have_z)
    {
        for (int z = 0; z < size_z; z++)
        {
            int z_src = z + kernel_radius_z;
            int z_dst = z + offset_z;
            int z_low = z_src - alpha_kernel_radius;
            int z_high = z_src + alpha_kernel_radius + 1;

            for (int y = 0; y < size_y; y++)
            {
                int y_src = y + kernel_radius_xy;
                int y_dst = y + offset_y;
                int y_low = y_src - alpha_kernel_radius;
                int y_high = y_src + alpha_kernel_radius + 1;

                for (int x = 0; x < size_x; x++)
                {
                    int x_src = x + kernel_radius_xy;
                    int x_dst = x + offset_x;
                    int x_low = x_src - alpha_kernel_radius;
                    int x_high = x_src + alpha_kernel_radius + 1;

                    // Summed-area table lookups for alpha average
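                    // (inclusion-exclusion over the 8 corners of the box; the
                    // second bracket removes the partial sums for the volume
                    // in front of the z_low plane)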
                    float vasum = ( VARBUF1(z_high, y_low, x_low).lane<3>()
                                  - VARBUF1(z_high, y_low, x_high).lane<3>()
                                  - VARBUF1(z_high, y_high, x_low).lane<3>()
                                  + VARBUF1(z_high, y_high, x_high).lane<3>()) -
                                  ( VARBUF1(z_low, y_low, x_low).lane<3>()
                                  - VARBUF1(z_low, y_low, x_high).lane<3>()
                                  - VARBUF1(z_low, y_high, x_low).lane<3>()
                                  + VARBUF1(z_low, y_high, x_high).lane<3>());

                    int out_index = z_dst * zdt + y_dst * ydt + x_dst;
                    input_alpha_averages[out_index] = (vasum * alpha_rsamples);
                }
            }
        }
    }
    else
    {
        for (int y = 0; y < size_y; y++)
        {
            int y_src = y + kernel_radius_xy;
            int y_dst = y + offset_y;
            int y_low = y_src - alpha_kernel_radius;
            int y_high = y_src + alpha_kernel_radius + 1;

            for (int x = 0; x < size_x; x++)
            {
                int x_src = x + kernel_radius_xy;
                int x_dst = x + offset_x;
                int x_low = x_src - alpha_kernel_radius;
                int x_high = x_src + alpha_kernel_radius + 1;

                // Summed-area table lookups for alpha average
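                // (2D inclusion-exclusion over the 4 corners of the window)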
                float vasum = VARBUF1(0, y_low, x_low).lane<3>()
                            - VARBUF1(0, y_low, x_high).lane<3>()
                            - VARBUF1(0, y_high, x_low).lane<3>()
                            + VARBUF1(0, y_high, x_high).lane<3>();

                int out_index = y_dst * ydt + x_dst;
                input_alpha_averages[out_index] = (vasum * alpha_rsamples);
            }
        }
    }
}

/* See header for documentation. */
unsigned int init_compute_averages(
    const astcenc_image& img,
    unsigned int alpha_kernel_radius,
    const astcenc_swizzle& swz,
    avg_args& ag
) {
    unsigned int size_x = img.dim_x;
    unsigned int size_y = img.dim_y;
    unsigned int size_z = img.dim_z;

    // Compute the maximum block size, and from that the working memory buffer size
    unsigned int kernel_radius = alpha_kernel_radius;
    unsigned int kerneldim = 2 * kernel_radius + 1;

    bool have_z = (size_z > 1);
    unsigned int max_blk_size_xy = have_z ? 16 : 32;
    unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u);

    unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;
    unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);

    // Perform block-wise average calculations across the image
    // Initialize fields which are not populated until later
    ag.arg.size_x = 0;
    ag.arg.size_y = 0;
    ag.arg.size_z = 0;
    ag.arg.offset_x = 0;
    ag.arg.offset_y = 0;
    ag.arg.offset_z = 0;
    ag.arg.work_memory = nullptr;

    ag.arg.img = &img;
    ag.arg.swz = swz;
    ag.arg.have_z = have_z;
    ag.arg.alpha_kernel_radius = alpha_kernel_radius;

    ag.img_size_x = size_x;
    ag.img_size_y = size_y;
    ag.img_size_z = size_z;
    ag.blk_size_xy = max_blk_size_xy;
    ag.blk_size_z = max_blk_size_z;
    ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z;
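
    // For example, a 2D image with alpha_kernel_radius = 2 uses 32x32 blocks,
    // giving max_padsize_xy = 37 and a work buffer of 2 * 37 * 37 * 1 vfloat4
    // entries; the factor of two covers the v and v^2 SATs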

    // The parallel task count
    unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z;
    unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy;
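
    // For example, a 256x256 2D image with 32x32 blocks gives z_tasks = 1 and
    // y_tasks = 8, so eight independently schedulable tasks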
    return z_tasks * y_tasks;
}

#endif // !defined(ASTCENC_DECOMPRESS_ONLY)