| 1 | // SPDX-License-Identifier: Apache-2.0 |
| 2 | // ---------------------------------------------------------------------------- |
| 3 | // Copyright 2011-2023 Arm Limited |
| 4 | // |
| 5 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
| 6 | // use this file except in compliance with the License. You may obtain a copy |
| 7 | // of the License at: |
| 8 | // |
| 9 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | // |
| 11 | // Unless required by applicable law or agreed to in writing, software |
| 12 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
| 13 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
| 14 | // License for the specific language governing permissions and limitations |
| 15 | // under the License. |
| 16 | // ---------------------------------------------------------------------------- |
| 17 | |
| 18 | #if !defined(ASTCENC_DECOMPRESS_ONLY) |
| 19 | |
| 20 | /** |
| 21 | * @brief Functions for finding best partition for a block. |
| 22 | * |
| 23 | * The partition search operates in two stages. The first pass uses kmeans clustering to group |
| 24 | * texels into an ideal partitioning for the requested partition count, and then compares that |
| 25 | * against the 1024 partitionings generated by the ASTC partition hash function. The generated |
| 26 | * partitions are then ranked by the number of texels in the wrong partition, compared to the ideal |
| 27 | * clustering. All 1024 partitions are tested for similarity and ranked, apart from duplicates and |
| 28 | * partitionings that actually generate fewer than the requested partition count, but only the top |
| 29 | * N candidates are actually put through a more detailed search. N is determined by the compressor |
| 30 | * quality preset. |
| 31 | * |
| 32 | * For the detailed search, each candidate is checked against two possible encoding methods: |
| 33 | * |
| 34 | * - The best partitioning assuming different chroma colors (RGB + RGB or RGB + delta endpoints). |
| 35 | * - The best partitioning assuming same chroma colors (RGB + scale endpoints). |
| 36 | * |
| 37 | * This is implemented by computing the mean color and dominant direction for each |
| 38 | * partition. This defines two lines, both of which go through the mean color value. |
| 39 | * |
| 40 | * - One line has a direction defined by the dominant direction; this is used to assess the error |
| 41 | * from using an uncorrelated color representation. |
| 42 | * - The other line goes through (0,0,0,1) and is used to assess the error from using a same chroma |
| 43 | * (RGB + scale) color representation. |
| 44 | * |
| 45 | * The best candidate is selected by computing the squared-errors that result from using these |
| 46 | * lines for endpoint selection. |
| 47 | */ |
| 48 | |
| 49 | #include <limits> |
| 50 | #include "astcenc_internal.h" |
| 51 | |
| 52 | /** |
| 53 | * @brief Pick some initial kmeans cluster centers. |
| 54 | * |
| 55 | * @param blk The image block color data to compress. |
| 56 | * @param texel_count The number of texels in the block. |
| 57 | * @param partition_count The number of partitions in the block. |
| 58 | * @param[out] cluster_centers The initial partition cluster center colors. |
| 59 | */ |
| 60 | static void kmeans_init( |
| 61 | const image_block& blk, |
| 62 | unsigned int texel_count, |
| 63 | unsigned int partition_count, |
| 64 | vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS] |
| 65 | ) { |
| 66 | promise(texel_count > 0); |
| 67 | promise(partition_count > 0); |
| 68 | |
| 69 | unsigned int clusters_selected = 0; |
| 70 | float distances[BLOCK_MAX_TEXELS]; |
| 71 | |
| 72 | // Pick a random sample as first cluster center; 145897 from random.org |
| 73 | unsigned int sample = 145897 % texel_count; |
| 74 | vfloat4 center_color = blk.texel(sample); |
| 75 | cluster_centers[clusters_selected] = center_color; |
| 76 | clusters_selected++; |
| 77 | |
| 78 | // Compute the distance to the first cluster center |
| 79 | float distance_sum = 0.0f; |
| 80 | for (unsigned int i = 0; i < texel_count; i++) |
| 81 | { |
| 82 | vfloat4 color = blk.texel(i); |
| 83 | vfloat4 diff = color - center_color; |
| 84 | float distance = dot_s(diff * diff, blk.channel_weight); |
| 85 | distance_sum += distance; |
| 86 | distances[i] = distance; |
| 87 | } |
| 88 | |
| 89 | // More numbers from random.org for weighted-random center selection |
| 90 | const float cluster_cutoffs[9] { |
| 91 | 0.626220f, 0.932770f, 0.275454f, |
| 92 | 0.318558f, 0.240113f, 0.009190f, |
| 93 | 0.347661f, 0.731960f, 0.156391f |
| 94 | }; |
| 95 | |
| 96 | unsigned int cutoff = (clusters_selected - 1) + 3 * (partition_count - 2); |
| 97 | |
| 98 | // Pick the remaining samples as needed |
| 99 | while (true) |
| 100 | { |
| 101 | // Pick the next center in a weighted-random fashion. |
| 102 | float summa = 0.0f; |
| 103 | float distance_cutoff = distance_sum * cluster_cutoffs[cutoff++]; |
| 104 | for (sample = 0; sample < texel_count; sample++) |
| 105 | { |
| 106 | summa += distances[sample]; |
| 107 | if (summa >= distance_cutoff) |
| 108 | { |
| 109 | break; |
| 110 | } |
| 111 | } |
| 112 | |
| 113 | // Clamp to a valid range and store the selected cluster center |
| 114 | sample = astc::min(sample, texel_count - 1); |
| 115 | |
| 116 | center_color = blk.texel(sample); |
| 117 | cluster_centers[clusters_selected++] = center_color; |
| 118 | if (clusters_selected >= partition_count) |
| 119 | { |
| 120 | break; |
| 121 | } |
| 122 | |
| 123 | // Compute the distance to the new cluster center, keep the min dist |
| 124 | distance_sum = 0.0f; |
| 125 | for (unsigned int i = 0; i < texel_count; i++) |
| 126 | { |
| 127 | vfloat4 color = blk.texel(i); |
| 128 | vfloat4 diff = color - center_color; |
| 129 | float distance = dot_s(diff * diff, blk.channel_weight); |
| 130 | distance = astc::min(distance, distances[i]); |
| 131 | distance_sum += distance; |
| 132 | distances[i] = distance; |
| 133 | } |
| 134 | } |
| 135 | } |
| 136 | |
| 137 | /** |
| 138 | * @brief Assign texels to clusters, based on a set of chosen center points. |
| 139 | * |
| 140 | * @param blk The image block color data to compress. |
| 141 | * @param texel_count The number of texels in the block. |
| 142 | * @param partition_count The number of partitions in the block. |
| 143 | * @param cluster_centers The partition cluster center colors. |
| 144 | * @param[out] partition_of_texel The partition assigned for each texel. |
| 145 | */ |
| 146 | static void kmeans_assign( |
| 147 | const image_block& blk, |
| 148 | unsigned int texel_count, |
| 149 | unsigned int partition_count, |
| 150 | const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS], |
| 151 | uint8_t partition_of_texel[BLOCK_MAX_TEXELS] |
| 152 | ) { |
| 153 | promise(texel_count > 0); |
| 154 | promise(partition_count > 0); |
| 155 | |
| 156 | uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 }; |
| 157 | |
| 158 | // Find the best partition for every texel |
| 159 | for (unsigned int i = 0; i < texel_count; i++) |
| 160 | { |
| 161 | float best_distance = std::numeric_limits<float>::max(); |
| 162 | unsigned int best_partition = 0; |
| 163 | |
| 164 | vfloat4 color = blk.texel(i); |
| 165 | for (unsigned int j = 0; j < partition_count; j++) |
| 166 | { |
| 167 | vfloat4 diff = color - cluster_centers[j]; |
| 168 | float distance = dot_s(diff * diff, blk.channel_weight); |
| 169 | if (distance < best_distance) |
| 170 | { |
| 171 | best_distance = distance; |
| 172 | best_partition = j; |
| 173 | } |
| 174 | } |
| 175 | |
| 176 | partition_of_texel[i] = static_cast<uint8_t>(best_partition); |
| 177 | partition_texel_count[best_partition]++; |
| 178 | } |
| 179 | |
| 180 | // It is possible to get a situation where a partition ends up without any texels. In this case, |
| 181 | // assign texel N to partition N. This is silly, but ensures that every partition retains at |
| 182 | // least one texel. Reassigning a texel in this manner may cause another partition to go empty, |
| 183 | // so if we actually did a reassignment, run the whole loop over again. |
| 184 | bool problem_case; |
| 185 | do |
| 186 | { |
| 187 | problem_case = false; |
| 188 | for (unsigned int i = 0; i < partition_count; i++) |
| 189 | { |
| 190 | if (partition_texel_count[i] == 0) |
| 191 | { |
| 192 | partition_texel_count[partition_of_texel[i]]--; |
| 193 | partition_texel_count[i]++; |
| 194 | partition_of_texel[i] = static_cast<uint8_t>(i); |
| 195 | problem_case = true; |
| 196 | } |
| 197 | } |
| 198 | } while (problem_case); |
| 199 | } |
| 200 | |
| 201 | /** |
| 202 | * @brief Compute new cluster centers based on their center of gravity. |
| 203 | * |
| 204 | * @param blk The image block color data to compress. |
| 205 | * @param texel_count The number of texels in the block. |
| 206 | * @param partition_count The number of partitions in the block. |
| 207 | * @param[out] cluster_centers The new cluster center colors. |
| 208 | * @param partition_of_texel The partition assigned for each texel. |
| 209 | */ |
| 210 | static void kmeans_update( |
| 211 | const image_block& blk, |
| 212 | unsigned int texel_count, |
| 213 | unsigned int partition_count, |
| 214 | vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS], |
| 215 | const uint8_t partition_of_texel[BLOCK_MAX_TEXELS] |
| 216 | ) { |
| 217 | promise(texel_count > 0); |
| 218 | promise(partition_count > 0); |
| 219 | |
| 220 | vfloat4 color_sum[BLOCK_MAX_PARTITIONS] { |
| 221 | vfloat4::zero(), |
| 222 | vfloat4::zero(), |
| 223 | vfloat4::zero(), |
| 224 | vfloat4::zero() |
| 225 | }; |
| 226 | |
| 227 | uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 }; |
| 228 | |
| 229 | // Find the center-of-gravity in each cluster |
| 230 | for (unsigned int i = 0; i < texel_count; i++) |
| 231 | { |
| 232 | uint8_t partition = partition_of_texel[i]; |
| 233 | color_sum[partition] += blk.texel(i); |
| 234 | partition_texel_count[partition]++; |
| 235 | } |
| 236 | |
| 237 | // Set the center of gravity to be the new cluster center |
| 238 | for (unsigned int i = 0; i < partition_count; i++) |
| 239 | { |
| 240 | float scale = 1.0f / static_cast<float>(partition_texel_count[i]); |
| 241 | cluster_centers[i] = color_sum[i] * scale; |
| 242 | } |
| 243 | } |
| 244 | |
| 245 | /** |
| 246 | * @brief Compute bit-mismatch for partitioning in 2-partition mode. |
| 247 | * |
| 248 | * @param a The texel assignment bitvector for the block. |
| 249 | * @param b The texel assignment bitvector for the partition table. |
| 250 | * |
| 251 | * @return The number of bit mismatches. |
| 252 | */ |
| 253 | static inline unsigned int partition_mismatch2( |
| 254 | const uint64_t a[2], |
| 255 | const uint64_t b[2] |
| 256 | ) { |
| 257 | int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]); |
| 258 | int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]); |
| 259 | return astc::min(v1, v2); |
| 260 | } |
| 261 | |
| 262 | /** |
| 263 | * @brief Compute bit-mismatch for partitioning in 3-partition mode. |
| 264 | * |
| 265 | * @param a The texel assignment bitvector for the block. |
| 266 | * @param b The texel assignment bitvector for the partition table. |
| 267 | * |
| 268 | * @return The number of bit mismatches. |
| 269 | */ |
| 270 | static inline unsigned int partition_mismatch3( |
| 271 | const uint64_t a[3], |
| 272 | const uint64_t b[3] |
| 273 | ) { |
| 274 | int p00 = popcount(a[0] ^ b[0]); |
| 275 | int p01 = popcount(a[0] ^ b[1]); |
| 276 | int p02 = popcount(a[0] ^ b[2]); |
| 277 | |
| 278 | int p10 = popcount(a[1] ^ b[0]); |
| 279 | int p11 = popcount(a[1] ^ b[1]); |
| 280 | int p12 = popcount(a[1] ^ b[2]); |
| 281 | |
| 282 | int p20 = popcount(a[2] ^ b[0]); |
| 283 | int p21 = popcount(a[2] ^ b[1]); |
| 284 | int p22 = popcount(a[2] ^ b[2]); |
| 285 | |
| 286 | int s0 = p11 + p22; |
| 287 | int s1 = p12 + p21; |
| 288 | int v0 = astc::min(s0, s1) + p00; |
| 289 | |
| 290 | int s2 = p10 + p22; |
| 291 | int s3 = p12 + p20; |
| 292 | int v1 = astc::min(s2, s3) + p01; |
| 293 | |
| 294 | int s4 = p10 + p21; |
| 295 | int s5 = p11 + p20; |
| 296 | int v2 = astc::min(s4, s5) + p02; |
| 297 | |
| 298 | return astc::min(v0, v1, v2); |
| 299 | } |
| 300 | |
| 301 | /** |
| 302 | * @brief Compute bit-mismatch for partitioning in 4-partition mode. |
| 303 | * |
| 304 | * @param a The texel assignment bitvector for the block. |
| 305 | * @param b The texel assignment bitvector for the partition table. |
| 306 | * |
| 307 | * @return The number of bit mismatches. |
| 308 | */ |
| 309 | static inline unsigned int partition_mismatch4( |
| 310 | const uint64_t a[4], |
| 311 | const uint64_t b[4] |
| 312 | ) { |
| 313 | int p00 = popcount(a[0] ^ b[0]); |
| 314 | int p01 = popcount(a[0] ^ b[1]); |
| 315 | int p02 = popcount(a[0] ^ b[2]); |
| 316 | int p03 = popcount(a[0] ^ b[3]); |
| 317 | |
| 318 | int p10 = popcount(a[1] ^ b[0]); |
| 319 | int p11 = popcount(a[1] ^ b[1]); |
| 320 | int p12 = popcount(a[1] ^ b[2]); |
| 321 | int p13 = popcount(a[1] ^ b[3]); |
| 322 | |
| 323 | int p20 = popcount(a[2] ^ b[0]); |
| 324 | int p21 = popcount(a[2] ^ b[1]); |
| 325 | int p22 = popcount(a[2] ^ b[2]); |
| 326 | int p23 = popcount(a[2] ^ b[3]); |
| 327 | |
| 328 | int p30 = popcount(a[3] ^ b[0]); |
| 329 | int p31 = popcount(a[3] ^ b[1]); |
| 330 | int p32 = popcount(a[3] ^ b[2]); |
| 331 | int p33 = popcount(a[3] ^ b[3]); |
| 332 | |
| 333 | int mx23 = astc::min(p22 + p33, p23 + p32); |
| 334 | int mx13 = astc::min(p21 + p33, p23 + p31); |
| 335 | int mx12 = astc::min(p21 + p32, p22 + p31); |
| 336 | int mx03 = astc::min(p20 + p33, p23 + p30); |
| 337 | int mx02 = astc::min(p20 + p32, p22 + p30); |
| 338 | int mx01 = astc::min(p21 + p30, p20 + p31); |
| 339 | |
| 340 | int v0 = p00 + astc::min(p11 + mx23, p12 + mx13, p13 + mx12); |
| 341 | int v1 = p01 + astc::min(p10 + mx23, p12 + mx03, p13 + mx02); |
| 342 | int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01); |
| 343 | int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12); |
| 344 | |
| 345 | return astc::min(v0, v1, v2, v3); |
| 346 | } |
| 347 | |
/**
 * @brief Function pointer type with the same signature as the partition_mismatchN() helpers:
 *        two bitmap arrays in, a mismatch count out.
 */
using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*);
| 349 | |
| 350 | /** |
| 351 | * @brief Count the partition table mismatches vs the data clustering. |
| 352 | * |
| 353 | * @param bsd The block size information. |
| 354 | * @param partition_count The number of partitions in the block. |
| 355 | * @param bitmaps The block texel partition assignment patterns. |
| 356 | * @param[out] mismatch_counts The array storing per partitioning mismatch counts. |
| 357 | */ |
| 358 | static void count_partition_mismatch_bits( |
| 359 | const block_size_descriptor& bsd, |
| 360 | unsigned int partition_count, |
| 361 | const uint64_t bitmaps[BLOCK_MAX_PARTITIONS], |
| 362 | unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS] |
| 363 | ) { |
| 364 | unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1]; |
| 365 | promise(active_count > 0); |
| 366 | |
| 367 | if (partition_count == 2) |
| 368 | { |
| 369 | for (unsigned int i = 0; i < active_count; i++) |
| 370 | { |
| 371 | mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]); |
| 372 | } |
| 373 | } |
| 374 | else if (partition_count == 3) |
| 375 | { |
| 376 | for (unsigned int i = 0; i < active_count; i++) |
| 377 | { |
| 378 | mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]); |
| 379 | } |
| 380 | } |
| 381 | else |
| 382 | { |
| 383 | for (unsigned int i = 0; i < active_count; i++) |
| 384 | { |
| 385 | mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]); |
| 386 | } |
| 387 | } |
| 388 | } |
| 389 | |
| 390 | /** |
| 391 | * @brief Use counting sort on the mismatch array to sort partition candidates. |
| 392 | * |
| 393 | * @param partitioning_count The number of packed partitionings. |
| 394 | * @param mismatch_count Partitioning mismatch counts, in index order. |
| 395 | * @param[out] partition_ordering Partition index values, in mismatch order. |
| 396 | * |
| 397 | * @return The number of active partitions in this selection. |
| 398 | */ |
| 399 | static unsigned int get_partition_ordering_by_mismatch_bits( |
| 400 | unsigned int partitioning_count, |
| 401 | const unsigned int mismatch_count[BLOCK_MAX_PARTITIONINGS], |
| 402 | unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS] |
| 403 | ) { |
| 404 | promise(partitioning_count > 0); |
| 405 | unsigned int mscount[256] { 0 }; |
| 406 | |
| 407 | // Create the histogram of mismatch counts |
| 408 | for (unsigned int i = 0; i < partitioning_count; i++) |
| 409 | { |
| 410 | mscount[mismatch_count[i]]++; |
| 411 | } |
| 412 | |
| 413 | unsigned int active_count = partitioning_count - mscount[255]; |
| 414 | |
| 415 | // Create a running sum from the histogram array |
| 416 | // Cells store previous values only; i.e. exclude self after sum |
| 417 | unsigned int summa = 0; |
| 418 | for (unsigned int i = 0; i < 256; i++) |
| 419 | { |
| 420 | unsigned int cnt = mscount[i]; |
| 421 | mscount[i] = summa; |
| 422 | summa += cnt; |
| 423 | } |
| 424 | |
| 425 | // Use the running sum as the index, incrementing after read to allow |
| 426 | // sequential entries with the same count |
| 427 | for (unsigned int i = 0; i < partitioning_count; i++) |
| 428 | { |
| 429 | unsigned int idx = mscount[mismatch_count[i]]++; |
| 430 | partition_ordering[idx] = i; |
| 431 | } |
| 432 | |
| 433 | return active_count; |
| 434 | } |
| 435 | |
| 436 | /** |
| 437 | * @brief Use k-means clustering to compute a partition ordering for a block.. |
| 438 | * |
| 439 | * @param bsd The block size information. |
| 440 | * @param blk The image block color data to compress. |
| 441 | * @param partition_count The desired number of partitions in the block. |
| 442 | * @param[out] partition_ordering The list of recommended partition indices, in priority order. |
| 443 | * |
| 444 | * @return The number of active partitionings in this selection. |
| 445 | */ |
| 446 | static unsigned int compute_kmeans_partition_ordering( |
| 447 | const block_size_descriptor& bsd, |
| 448 | const image_block& blk, |
| 449 | unsigned int partition_count, |
| 450 | unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS] |
| 451 | ) { |
| 452 | vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]; |
| 453 | uint8_t texel_partitions[BLOCK_MAX_TEXELS]; |
| 454 | |
| 455 | // Use three passes of k-means clustering to partition the block data |
| 456 | for (unsigned int i = 0; i < 3; i++) |
| 457 | { |
| 458 | if (i == 0) |
| 459 | { |
| 460 | kmeans_init(blk, bsd.texel_count, partition_count, cluster_centers); |
| 461 | } |
| 462 | else |
| 463 | { |
| 464 | kmeans_update(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions); |
| 465 | } |
| 466 | |
| 467 | kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions); |
| 468 | } |
| 469 | |
| 470 | // Construct the block bitmaps of texel assignments to each partition |
| 471 | uint64_t bitmaps[BLOCK_MAX_PARTITIONS] { 0 }; |
| 472 | unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS); |
| 473 | promise(texels_to_process > 0); |
| 474 | for (unsigned int i = 0; i < texels_to_process; i++) |
| 475 | { |
| 476 | unsigned int idx = bsd.kmeans_texels[i]; |
| 477 | bitmaps[texel_partitions[idx]] |= 1ULL << i; |
| 478 | } |
| 479 | |
| 480 | // Count the mismatch between the block and the format's partition tables |
| 481 | unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS]; |
| 482 | count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts); |
| 483 | |
| 484 | // Sort the partitions based on the number of mismatched bits |
| 485 | return get_partition_ordering_by_mismatch_bits( |
| 486 | bsd.partitioning_count_selected[partition_count - 1], |
| 487 | mismatch_counts, partition_ordering); |
| 488 | } |
| 489 | |
| 490 | /** |
| 491 | * @brief Insert a partitioning into an order list of results, sorted by error. |
| 492 | * |
| 493 | * @param max_values The max number of entries in the best result arrays. |
| 494 | * @param this_error The error of the new entry. |
| 495 | * @param this_partition The partition ID of the new entry. |
| 496 | * @param[out] best_errors The array of best error values. |
| 497 | * @param[out] best_partitions The array of best partition values. |
| 498 | */ |
| 499 | static void insert_result( |
| 500 | unsigned int max_values, |
| 501 | float this_error, |
| 502 | unsigned int this_partition, |
| 503 | float* best_errors, |
| 504 | unsigned int* best_partitions) |
| 505 | { |
| 506 | promise(max_values > 0); |
| 507 | |
| 508 | // Don't bother searching if the current worst error beats the new error |
| 509 | if (this_error >= best_errors[max_values - 1]) |
| 510 | { |
| 511 | return; |
| 512 | } |
| 513 | |
| 514 | // Else insert into the list in error-order |
| 515 | for (unsigned int i = 0; i < max_values; i++) |
| 516 | { |
| 517 | // Existing result is better - move on ... |
| 518 | if (this_error > best_errors[i]) |
| 519 | { |
| 520 | continue; |
| 521 | } |
| 522 | |
| 523 | // Move existing results down one |
| 524 | for (unsigned int j = max_values - 1; j > i; j--) |
| 525 | { |
| 526 | best_errors[j] = best_errors[j - 1]; |
| 527 | best_partitions[j] = best_partitions[j - 1]; |
| 528 | } |
| 529 | |
| 530 | // Insert new result |
| 531 | best_errors[i] = this_error; |
| 532 | best_partitions[i] = this_partition; |
| 533 | break; |
| 534 | } |
| 535 | } |
| 536 | |
| 537 | /* See header for documentation. */ |
| 538 | unsigned int find_best_partition_candidates( |
| 539 | const block_size_descriptor& bsd, |
| 540 | const image_block& blk, |
| 541 | unsigned int partition_count, |
| 542 | unsigned int partition_search_limit, |
| 543 | unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES], |
| 544 | unsigned int requested_candidates |
| 545 | ) { |
| 546 | // Constant used to estimate quantization error for a given partitioning; the optimal value for |
| 547 | // this depends on bitrate. These values have been determined empirically. |
| 548 | unsigned int texels_per_block = bsd.texel_count; |
| 549 | float weight_imprecision_estim = 0.055f; |
| 550 | if (texels_per_block <= 20) |
| 551 | { |
| 552 | weight_imprecision_estim = 0.03f; |
| 553 | } |
| 554 | else if (texels_per_block <= 31) |
| 555 | { |
| 556 | weight_imprecision_estim = 0.04f; |
| 557 | } |
| 558 | else if (texels_per_block <= 41) |
| 559 | { |
| 560 | weight_imprecision_estim = 0.05f; |
| 561 | } |
| 562 | |
| 563 | promise(partition_count > 0); |
| 564 | promise(partition_search_limit > 0); |
| 565 | |
| 566 | weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim; |
| 567 | |
| 568 | unsigned int partition_sequence[BLOCK_MAX_PARTITIONINGS]; |
| 569 | unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence); |
| 570 | partition_search_limit = astc::min(partition_search_limit, sequence_len); |
| 571 | requested_candidates = astc::min(partition_search_limit, requested_candidates); |
| 572 | |
| 573 | bool uses_alpha = !blk.is_constant_channel(3); |
| 574 | |
| 575 | // Partitioning errors assuming uncorrelated-chrominance endpoints |
| 576 | float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES]; |
| 577 | unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES]; |
| 578 | |
| 579 | // Partitioning errors assuming same-chrominance endpoints |
| 580 | float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES]; |
| 581 | unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES]; |
| 582 | |
| 583 | for (unsigned int i = 0; i < requested_candidates; i++) |
| 584 | { |
| 585 | uncor_best_errors[i] = ERROR_CALC_DEFAULT; |
| 586 | samec_best_errors[i] = ERROR_CALC_DEFAULT; |
| 587 | } |
| 588 | |
| 589 | if (uses_alpha) |
| 590 | { |
| 591 | for (unsigned int i = 0; i < partition_search_limit; i++) |
| 592 | { |
| 593 | unsigned int partition = partition_sequence[i]; |
| 594 | const auto& pi = bsd.get_raw_partition_info(partition_count, partition); |
| 595 | |
| 596 | // Compute weighting to give to each component in each partition |
| 597 | partition_metrics pms[BLOCK_MAX_PARTITIONS]; |
| 598 | |
| 599 | compute_avgs_and_dirs_4_comp(pi, blk, pms); |
| 600 | |
| 601 | line4 uncor_lines[BLOCK_MAX_PARTITIONS]; |
| 602 | line4 samec_lines[BLOCK_MAX_PARTITIONS]; |
| 603 | |
| 604 | processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS]; |
| 605 | processed_line4 samec_plines[BLOCK_MAX_PARTITIONS]; |
| 606 | |
| 607 | float line_lengths[BLOCK_MAX_PARTITIONS]; |
| 608 | |
| 609 | for (unsigned int j = 0; j < partition_count; j++) |
| 610 | { |
| 611 | partition_metrics& pm = pms[j]; |
| 612 | |
| 613 | uncor_lines[j].a = pm.avg; |
| 614 | uncor_lines[j].b = normalize_safe(pm.dir, unit4()); |
| 615 | |
| 616 | uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b); |
| 617 | uncor_plines[j].bs = uncor_lines[j].b; |
| 618 | |
| 619 | samec_lines[j].a = vfloat4::zero(); |
| 620 | samec_lines[j].b = normalize_safe(pm.avg, unit4()); |
| 621 | |
| 622 | samec_plines[j].amod = vfloat4::zero(); |
| 623 | samec_plines[j].bs = samec_lines[j].b; |
| 624 | } |
| 625 | |
| 626 | float uncor_error = 0.0f; |
| 627 | float samec_error = 0.0f; |
| 628 | |
| 629 | compute_error_squared_rgba(pi, |
| 630 | blk, |
| 631 | uncor_plines, |
| 632 | samec_plines, |
| 633 | line_lengths, |
| 634 | uncor_error, |
| 635 | samec_error); |
| 636 | |
| 637 | // Compute an estimate of error introduced by weight quantization imprecision. |
| 638 | // This error is computed as follows, for each partition |
| 639 | // 1: compute the principal-axis vector (full length) in error-space |
| 640 | // 2: convert the principal-axis vector to regular RGB-space |
| 641 | // 3: scale the vector by a constant that estimates average quantization error |
| 642 | // 4: for each texel, square the vector, then do a dot-product with the texel's |
| 643 | // error weight; sum up the results across all texels. |
| 644 | // 4(optimized): square the vector once, then do a dot-product with the average |
| 645 | // texel error, then multiply by the number of texels. |
| 646 | |
| 647 | for (unsigned int j = 0; j < partition_count; j++) |
| 648 | { |
| 649 | float tpp = static_cast<float>(pi.partition_texel_count[j]); |
| 650 | vfloat4 error_weights(tpp * weight_imprecision_estim); |
| 651 | |
| 652 | vfloat4 uncor_vector = uncor_lines[j].b * line_lengths[j]; |
| 653 | vfloat4 samec_vector = samec_lines[j].b * line_lengths[j]; |
| 654 | |
| 655 | uncor_error += dot_s(uncor_vector * uncor_vector, error_weights); |
| 656 | samec_error += dot_s(samec_vector * samec_vector, error_weights); |
| 657 | } |
| 658 | |
| 659 | insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions); |
| 660 | insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions); |
| 661 | } |
| 662 | } |
| 663 | else |
| 664 | { |
| 665 | for (unsigned int i = 0; i < partition_search_limit; i++) |
| 666 | { |
| 667 | unsigned int partition = partition_sequence[i]; |
| 668 | const auto& pi = bsd.get_raw_partition_info(partition_count, partition); |
| 669 | |
| 670 | // Compute weighting to give to each component in each partition |
| 671 | partition_metrics pms[BLOCK_MAX_PARTITIONS]; |
| 672 | compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); |
| 673 | |
| 674 | partition_lines3 plines[BLOCK_MAX_PARTITIONS]; |
| 675 | |
| 676 | for (unsigned int j = 0; j < partition_count; j++) |
| 677 | { |
| 678 | partition_metrics& pm = pms[j]; |
| 679 | partition_lines3& pl = plines[j]; |
| 680 | |
| 681 | pl.uncor_line.a = pm.avg; |
| 682 | pl.uncor_line.b = normalize_safe(pm.dir, unit3()); |
| 683 | |
| 684 | pl.samec_line.a = vfloat4::zero(); |
| 685 | pl.samec_line.b = normalize_safe(pm.avg, unit3()); |
| 686 | |
| 687 | pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b); |
| 688 | pl.uncor_pline.bs = pl.uncor_line.b; |
| 689 | |
| 690 | pl.samec_pline.amod = vfloat4::zero(); |
| 691 | pl.samec_pline.bs = pl.samec_line.b; |
| 692 | } |
| 693 | |
| 694 | float uncor_error = 0.0f; |
| 695 | float samec_error = 0.0f; |
| 696 | |
| 697 | compute_error_squared_rgb(pi, |
| 698 | blk, |
| 699 | plines, |
| 700 | uncor_error, |
| 701 | samec_error); |
| 702 | |
| 703 | // Compute an estimate of error introduced by weight quantization imprecision. |
| 704 | // This error is computed as follows, for each partition |
| 705 | // 1: compute the principal-axis vector (full length) in error-space |
| 706 | // 2: convert the principal-axis vector to regular RGB-space |
| 707 | // 3: scale the vector by a constant that estimates average quantization error |
| 708 | // 4: for each texel, square the vector, then do a dot-product with the texel's |
| 709 | // error weight; sum up the results across all texels. |
| 710 | // 4(optimized): square the vector once, then do a dot-product with the average |
| 711 | // texel error, then multiply by the number of texels. |
| 712 | |
| 713 | for (unsigned int j = 0; j < partition_count; j++) |
| 714 | { |
| 715 | partition_lines3& pl = plines[j]; |
| 716 | |
| 717 | float tpp = static_cast<float>(pi.partition_texel_count[j]); |
| 718 | vfloat4 error_weights(tpp * weight_imprecision_estim); |
| 719 | |
| 720 | vfloat4 uncor_vector = pl.uncor_line.b * pl.line_length; |
| 721 | vfloat4 samec_vector = pl.samec_line.b * pl.line_length; |
| 722 | |
| 723 | uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights); |
| 724 | samec_error += dot3_s(samec_vector * samec_vector, error_weights); |
| 725 | } |
| 726 | |
| 727 | insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions); |
| 728 | insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions); |
| 729 | } |
| 730 | } |
| 731 | |
| 732 | unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES]; |
| 733 | for (unsigned int i = 0; i < requested_candidates; i++) |
| 734 | { |
| 735 | interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index; |
| 736 | interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index; |
| 737 | } |
| 738 | |
| 739 | uint64_t bitmasks[1024/64] { 0 }; |
| 740 | unsigned int emitted = 0; |
| 741 | |
| 742 | // Deduplicate the first "requested" entries |
| 743 | for (unsigned int i = 0; i < requested_candidates * 2; i++) |
| 744 | { |
| 745 | unsigned int partition = interleave[i]; |
| 746 | |
| 747 | unsigned int word = partition / 64; |
| 748 | unsigned int bit = partition % 64; |
| 749 | |
| 750 | bool written = bitmasks[word] & (1ull << bit); |
| 751 | |
| 752 | if (!written) |
| 753 | { |
| 754 | best_partitions[emitted] = partition; |
| 755 | bitmasks[word] |= 1ull << bit; |
| 756 | emitted++; |
| 757 | |
| 758 | if (emitted == requested_candidates) |
| 759 | { |
| 760 | break; |
| 761 | } |
| 762 | } |
| 763 | } |
| 764 | |
| 765 | return emitted; |
| 766 | } |
| 767 | |
| 768 | #endif |
| 769 | |