1// SPDX-License-Identifier: Apache-2.0
2// ----------------------------------------------------------------------------
3// Copyright 2011-2023 Arm Limited
4//
5// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6// use this file except in compliance with the License. You may obtain a copy
7// of the License at:
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14// License for the specific language governing permissions and limitations
15// under the License.
16// ----------------------------------------------------------------------------
17
18#if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20/**
21 * @brief Functions for finding best partition for a block.
22 *
23 * The partition search operates in two stages. The first pass uses kmeans clustering to group
24 * texels into an ideal partitioning for the requested partition count, and then compares that
25 * against the 1024 partitionings generated by the ASTC partition hash function. The generated
26 * partitions are then ranked by the number of texels in the wrong partition, compared to the ideal
27 * clustering. All 1024 partitions are tested for similarity and ranked, apart from duplicates and
28 * partitionings that actually generate fewer than the requested partition count, but only the top
29 * N candidates are actually put through a more detailed search. N is determined by the compressor
30 * quality preset.
31 *
32 * For the detailed search, each candidate is checked against two possible encoding methods:
33 *
34 * - The best partitioning assuming different chroma colors (RGB + RGB or RGB + delta endpoints).
35 * - The best partitioning assuming same chroma colors (RGB + scale endpoints).
36 *
 * This is implemented by computing the mean color and dominant direction for each
 * partition. This defines two lines, both of which go through the mean color value.
39 *
40 * - One line has a direction defined by the dominant direction; this is used to assess the error
41 * from using an uncorrelated color representation.
42 * - The other line goes through (0,0,0,1) and is used to assess the error from using a same chroma
43 * (RGB + scale) color representation.
44 *
45 * The best candidate is selected by computing the squared-errors that result from using these
46 * lines for endpoint selection.
47 */
48
49#include <limits>
50#include "astcenc_internal.h"
51
52/**
53 * @brief Pick some initial kmeans cluster centers.
54 *
55 * @param blk The image block color data to compress.
56 * @param texel_count The number of texels in the block.
57 * @param partition_count The number of partitions in the block.
58 * @param[out] cluster_centers The initial partition cluster center colors.
59 */
60static void kmeans_init(
61 const image_block& blk,
62 unsigned int texel_count,
63 unsigned int partition_count,
64 vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]
65) {
66 promise(texel_count > 0);
67 promise(partition_count > 0);
68
69 unsigned int clusters_selected = 0;
70 float distances[BLOCK_MAX_TEXELS];
71
72 // Pick a random sample as first cluster center; 145897 from random.org
73 unsigned int sample = 145897 % texel_count;
74 vfloat4 center_color = blk.texel(sample);
75 cluster_centers[clusters_selected] = center_color;
76 clusters_selected++;
77
78 // Compute the distance to the first cluster center
79 float distance_sum = 0.0f;
80 for (unsigned int i = 0; i < texel_count; i++)
81 {
82 vfloat4 color = blk.texel(i);
83 vfloat4 diff = color - center_color;
84 float distance = dot_s(diff * diff, blk.channel_weight);
85 distance_sum += distance;
86 distances[i] = distance;
87 }
88
89 // More numbers from random.org for weighted-random center selection
90 const float cluster_cutoffs[9] {
91 0.626220f, 0.932770f, 0.275454f,
92 0.318558f, 0.240113f, 0.009190f,
93 0.347661f, 0.731960f, 0.156391f
94 };
95
96 unsigned int cutoff = (clusters_selected - 1) + 3 * (partition_count - 2);
97
98 // Pick the remaining samples as needed
99 while (true)
100 {
101 // Pick the next center in a weighted-random fashion.
102 float summa = 0.0f;
103 float distance_cutoff = distance_sum * cluster_cutoffs[cutoff++];
104 for (sample = 0; sample < texel_count; sample++)
105 {
106 summa += distances[sample];
107 if (summa >= distance_cutoff)
108 {
109 break;
110 }
111 }
112
113 // Clamp to a valid range and store the selected cluster center
114 sample = astc::min(sample, texel_count - 1);
115
116 center_color = blk.texel(sample);
117 cluster_centers[clusters_selected++] = center_color;
118 if (clusters_selected >= partition_count)
119 {
120 break;
121 }
122
123 // Compute the distance to the new cluster center, keep the min dist
124 distance_sum = 0.0f;
125 for (unsigned int i = 0; i < texel_count; i++)
126 {
127 vfloat4 color = blk.texel(i);
128 vfloat4 diff = color - center_color;
129 float distance = dot_s(diff * diff, blk.channel_weight);
130 distance = astc::min(distance, distances[i]);
131 distance_sum += distance;
132 distances[i] = distance;
133 }
134 }
135}
136
137/**
138 * @brief Assign texels to clusters, based on a set of chosen center points.
139 *
140 * @param blk The image block color data to compress.
141 * @param texel_count The number of texels in the block.
142 * @param partition_count The number of partitions in the block.
143 * @param cluster_centers The partition cluster center colors.
144 * @param[out] partition_of_texel The partition assigned for each texel.
145 */
146static void kmeans_assign(
147 const image_block& blk,
148 unsigned int texel_count,
149 unsigned int partition_count,
150 const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
151 uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
152) {
153 promise(texel_count > 0);
154 promise(partition_count > 0);
155
156 uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
157
158 // Find the best partition for every texel
159 for (unsigned int i = 0; i < texel_count; i++)
160 {
161 float best_distance = std::numeric_limits<float>::max();
162 unsigned int best_partition = 0;
163
164 vfloat4 color = blk.texel(i);
165 for (unsigned int j = 0; j < partition_count; j++)
166 {
167 vfloat4 diff = color - cluster_centers[j];
168 float distance = dot_s(diff * diff, blk.channel_weight);
169 if (distance < best_distance)
170 {
171 best_distance = distance;
172 best_partition = j;
173 }
174 }
175
176 partition_of_texel[i] = static_cast<uint8_t>(best_partition);
177 partition_texel_count[best_partition]++;
178 }
179
180 // It is possible to get a situation where a partition ends up without any texels. In this case,
181 // assign texel N to partition N. This is silly, but ensures that every partition retains at
182 // least one texel. Reassigning a texel in this manner may cause another partition to go empty,
183 // so if we actually did a reassignment, run the whole loop over again.
184 bool problem_case;
185 do
186 {
187 problem_case = false;
188 for (unsigned int i = 0; i < partition_count; i++)
189 {
190 if (partition_texel_count[i] == 0)
191 {
192 partition_texel_count[partition_of_texel[i]]--;
193 partition_texel_count[i]++;
194 partition_of_texel[i] = static_cast<uint8_t>(i);
195 problem_case = true;
196 }
197 }
198 } while (problem_case);
199}
200
201/**
202 * @brief Compute new cluster centers based on their center of gravity.
203 *
204 * @param blk The image block color data to compress.
205 * @param texel_count The number of texels in the block.
206 * @param partition_count The number of partitions in the block.
207 * @param[out] cluster_centers The new cluster center colors.
208 * @param partition_of_texel The partition assigned for each texel.
209 */
210static void kmeans_update(
211 const image_block& blk,
212 unsigned int texel_count,
213 unsigned int partition_count,
214 vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
215 const uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
216) {
217 promise(texel_count > 0);
218 promise(partition_count > 0);
219
220 vfloat4 color_sum[BLOCK_MAX_PARTITIONS] {
221 vfloat4::zero(),
222 vfloat4::zero(),
223 vfloat4::zero(),
224 vfloat4::zero()
225 };
226
227 uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
228
229 // Find the center-of-gravity in each cluster
230 for (unsigned int i = 0; i < texel_count; i++)
231 {
232 uint8_t partition = partition_of_texel[i];
233 color_sum[partition] += blk.texel(i);
234 partition_texel_count[partition]++;
235 }
236
237 // Set the center of gravity to be the new cluster center
238 for (unsigned int i = 0; i < partition_count; i++)
239 {
240 float scale = 1.0f / static_cast<float>(partition_texel_count[i]);
241 cluster_centers[i] = color_sum[i] * scale;
242 }
243}
244
245/**
246 * @brief Compute bit-mismatch for partitioning in 2-partition mode.
247 *
248 * @param a The texel assignment bitvector for the block.
249 * @param b The texel assignment bitvector for the partition table.
250 *
251 * @return The number of bit mismatches.
252 */
253static inline unsigned int partition_mismatch2(
254 const uint64_t a[2],
255 const uint64_t b[2]
256) {
257 int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]);
258 int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]);
259 return astc::min(v1, v2);
260}
261
262/**
263 * @brief Compute bit-mismatch for partitioning in 3-partition mode.
264 *
265 * @param a The texel assignment bitvector for the block.
266 * @param b The texel assignment bitvector for the partition table.
267 *
268 * @return The number of bit mismatches.
269 */
270static inline unsigned int partition_mismatch3(
271 const uint64_t a[3],
272 const uint64_t b[3]
273) {
274 int p00 = popcount(a[0] ^ b[0]);
275 int p01 = popcount(a[0] ^ b[1]);
276 int p02 = popcount(a[0] ^ b[2]);
277
278 int p10 = popcount(a[1] ^ b[0]);
279 int p11 = popcount(a[1] ^ b[1]);
280 int p12 = popcount(a[1] ^ b[2]);
281
282 int p20 = popcount(a[2] ^ b[0]);
283 int p21 = popcount(a[2] ^ b[1]);
284 int p22 = popcount(a[2] ^ b[2]);
285
286 int s0 = p11 + p22;
287 int s1 = p12 + p21;
288 int v0 = astc::min(s0, s1) + p00;
289
290 int s2 = p10 + p22;
291 int s3 = p12 + p20;
292 int v1 = astc::min(s2, s3) + p01;
293
294 int s4 = p10 + p21;
295 int s5 = p11 + p20;
296 int v2 = astc::min(s4, s5) + p02;
297
298 return astc::min(v0, v1, v2);
299}
300
301/**
302 * @brief Compute bit-mismatch for partitioning in 4-partition mode.
303 *
304 * @param a The texel assignment bitvector for the block.
305 * @param b The texel assignment bitvector for the partition table.
306 *
307 * @return The number of bit mismatches.
308 */
309static inline unsigned int partition_mismatch4(
310 const uint64_t a[4],
311 const uint64_t b[4]
312) {
313 int p00 = popcount(a[0] ^ b[0]);
314 int p01 = popcount(a[0] ^ b[1]);
315 int p02 = popcount(a[0] ^ b[2]);
316 int p03 = popcount(a[0] ^ b[3]);
317
318 int p10 = popcount(a[1] ^ b[0]);
319 int p11 = popcount(a[1] ^ b[1]);
320 int p12 = popcount(a[1] ^ b[2]);
321 int p13 = popcount(a[1] ^ b[3]);
322
323 int p20 = popcount(a[2] ^ b[0]);
324 int p21 = popcount(a[2] ^ b[1]);
325 int p22 = popcount(a[2] ^ b[2]);
326 int p23 = popcount(a[2] ^ b[3]);
327
328 int p30 = popcount(a[3] ^ b[0]);
329 int p31 = popcount(a[3] ^ b[1]);
330 int p32 = popcount(a[3] ^ b[2]);
331 int p33 = popcount(a[3] ^ b[3]);
332
333 int mx23 = astc::min(p22 + p33, p23 + p32);
334 int mx13 = astc::min(p21 + p33, p23 + p31);
335 int mx12 = astc::min(p21 + p32, p22 + p31);
336 int mx03 = astc::min(p20 + p33, p23 + p30);
337 int mx02 = astc::min(p20 + p32, p22 + p30);
338 int mx01 = astc::min(p21 + p30, p20 + p31);
339
340 int v0 = p00 + astc::min(p11 + mx23, p12 + mx13, p13 + mx12);
341 int v1 = p01 + astc::min(p10 + mx23, p12 + mx03, p13 + mx02);
342 int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01);
343 int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12);
344
345 return astc::min(v0, v1, v2, v3);
346}
347
/** @brief Function pointer type with the same call signature as the partition_mismatchN scorers. */
using mismatch_dispatch = unsigned int (*)(const uint64_t* a, const uint64_t* b);
349
350/**
351 * @brief Count the partition table mismatches vs the data clustering.
352 *
353 * @param bsd The block size information.
354 * @param partition_count The number of partitions in the block.
355 * @param bitmaps The block texel partition assignment patterns.
356 * @param[out] mismatch_counts The array storing per partitioning mismatch counts.
357 */
358static void count_partition_mismatch_bits(
359 const block_size_descriptor& bsd,
360 unsigned int partition_count,
361 const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],
362 unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS]
363) {
364 unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1];
365 promise(active_count > 0);
366
367 if (partition_count == 2)
368 {
369 for (unsigned int i = 0; i < active_count; i++)
370 {
371 mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]);
372 }
373 }
374 else if (partition_count == 3)
375 {
376 for (unsigned int i = 0; i < active_count; i++)
377 {
378 mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]);
379 }
380 }
381 else
382 {
383 for (unsigned int i = 0; i < active_count; i++)
384 {
385 mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]);
386 }
387 }
388}
389
390/**
391 * @brief Use counting sort on the mismatch array to sort partition candidates.
392 *
393 * @param partitioning_count The number of packed partitionings.
394 * @param mismatch_count Partitioning mismatch counts, in index order.
395 * @param[out] partition_ordering Partition index values, in mismatch order.
396 *
397 * @return The number of active partitions in this selection.
398 */
399static unsigned int get_partition_ordering_by_mismatch_bits(
400 unsigned int partitioning_count,
401 const unsigned int mismatch_count[BLOCK_MAX_PARTITIONINGS],
402 unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS]
403) {
404 promise(partitioning_count > 0);
405 unsigned int mscount[256] { 0 };
406
407 // Create the histogram of mismatch counts
408 for (unsigned int i = 0; i < partitioning_count; i++)
409 {
410 mscount[mismatch_count[i]]++;
411 }
412
413 unsigned int active_count = partitioning_count - mscount[255];
414
415 // Create a running sum from the histogram array
416 // Cells store previous values only; i.e. exclude self after sum
417 unsigned int summa = 0;
418 for (unsigned int i = 0; i < 256; i++)
419 {
420 unsigned int cnt = mscount[i];
421 mscount[i] = summa;
422 summa += cnt;
423 }
424
425 // Use the running sum as the index, incrementing after read to allow
426 // sequential entries with the same count
427 for (unsigned int i = 0; i < partitioning_count; i++)
428 {
429 unsigned int idx = mscount[mismatch_count[i]]++;
430 partition_ordering[idx] = i;
431 }
432
433 return active_count;
434}
435
436/**
437 * @brief Use k-means clustering to compute a partition ordering for a block..
438 *
439 * @param bsd The block size information.
440 * @param blk The image block color data to compress.
441 * @param partition_count The desired number of partitions in the block.
442 * @param[out] partition_ordering The list of recommended partition indices, in priority order.
443 *
444 * @return The number of active partitionings in this selection.
445 */
446static unsigned int compute_kmeans_partition_ordering(
447 const block_size_descriptor& bsd,
448 const image_block& blk,
449 unsigned int partition_count,
450 unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS]
451) {
452 vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS];
453 uint8_t texel_partitions[BLOCK_MAX_TEXELS];
454
455 // Use three passes of k-means clustering to partition the block data
456 for (unsigned int i = 0; i < 3; i++)
457 {
458 if (i == 0)
459 {
460 kmeans_init(blk, bsd.texel_count, partition_count, cluster_centers);
461 }
462 else
463 {
464 kmeans_update(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
465 }
466
467 kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
468 }
469
470 // Construct the block bitmaps of texel assignments to each partition
471 uint64_t bitmaps[BLOCK_MAX_PARTITIONS] { 0 };
472 unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
473 promise(texels_to_process > 0);
474 for (unsigned int i = 0; i < texels_to_process; i++)
475 {
476 unsigned int idx = bsd.kmeans_texels[i];
477 bitmaps[texel_partitions[idx]] |= 1ULL << i;
478 }
479
480 // Count the mismatch between the block and the format's partition tables
481 unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS];
482 count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts);
483
484 // Sort the partitions based on the number of mismatched bits
485 return get_partition_ordering_by_mismatch_bits(
486 bsd.partitioning_count_selected[partition_count - 1],
487 mismatch_counts, partition_ordering);
488}
489
490/**
491 * @brief Insert a partitioning into an order list of results, sorted by error.
492 *
493 * @param max_values The max number of entries in the best result arrays.
494 * @param this_error The error of the new entry.
495 * @param this_partition The partition ID of the new entry.
496 * @param[out] best_errors The array of best error values.
497 * @param[out] best_partitions The array of best partition values.
498 */
499static void insert_result(
500 unsigned int max_values,
501 float this_error,
502 unsigned int this_partition,
503 float* best_errors,
504 unsigned int* best_partitions)
505{
506 promise(max_values > 0);
507
508 // Don't bother searching if the current worst error beats the new error
509 if (this_error >= best_errors[max_values - 1])
510 {
511 return;
512 }
513
514 // Else insert into the list in error-order
515 for (unsigned int i = 0; i < max_values; i++)
516 {
517 // Existing result is better - move on ...
518 if (this_error > best_errors[i])
519 {
520 continue;
521 }
522
523 // Move existing results down one
524 for (unsigned int j = max_values - 1; j > i; j--)
525 {
526 best_errors[j] = best_errors[j - 1];
527 best_partitions[j] = best_partitions[j - 1];
528 }
529
530 // Insert new result
531 best_errors[i] = this_error;
532 best_partitions[i] = this_partition;
533 break;
534 }
535}
536
537/* See header for documentation. */
538unsigned int find_best_partition_candidates(
539 const block_size_descriptor& bsd,
540 const image_block& blk,
541 unsigned int partition_count,
542 unsigned int partition_search_limit,
543 unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],
544 unsigned int requested_candidates
545) {
546 // Constant used to estimate quantization error for a given partitioning; the optimal value for
547 // this depends on bitrate. These values have been determined empirically.
548 unsigned int texels_per_block = bsd.texel_count;
549 float weight_imprecision_estim = 0.055f;
550 if (texels_per_block <= 20)
551 {
552 weight_imprecision_estim = 0.03f;
553 }
554 else if (texels_per_block <= 31)
555 {
556 weight_imprecision_estim = 0.04f;
557 }
558 else if (texels_per_block <= 41)
559 {
560 weight_imprecision_estim = 0.05f;
561 }
562
563 promise(partition_count > 0);
564 promise(partition_search_limit > 0);
565
566 weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim;
567
568 unsigned int partition_sequence[BLOCK_MAX_PARTITIONINGS];
569 unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
570 partition_search_limit = astc::min(partition_search_limit, sequence_len);
571 requested_candidates = astc::min(partition_search_limit, requested_candidates);
572
573 bool uses_alpha = !blk.is_constant_channel(3);
574
575 // Partitioning errors assuming uncorrelated-chrominance endpoints
576 float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
577 unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
578
579 // Partitioning errors assuming same-chrominance endpoints
580 float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
581 unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
582
583 for (unsigned int i = 0; i < requested_candidates; i++)
584 {
585 uncor_best_errors[i] = ERROR_CALC_DEFAULT;
586 samec_best_errors[i] = ERROR_CALC_DEFAULT;
587 }
588
589 if (uses_alpha)
590 {
591 for (unsigned int i = 0; i < partition_search_limit; i++)
592 {
593 unsigned int partition = partition_sequence[i];
594 const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
595
596 // Compute weighting to give to each component in each partition
597 partition_metrics pms[BLOCK_MAX_PARTITIONS];
598
599 compute_avgs_and_dirs_4_comp(pi, blk, pms);
600
601 line4 uncor_lines[BLOCK_MAX_PARTITIONS];
602 line4 samec_lines[BLOCK_MAX_PARTITIONS];
603
604 processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS];
605 processed_line4 samec_plines[BLOCK_MAX_PARTITIONS];
606
607 float line_lengths[BLOCK_MAX_PARTITIONS];
608
609 for (unsigned int j = 0; j < partition_count; j++)
610 {
611 partition_metrics& pm = pms[j];
612
613 uncor_lines[j].a = pm.avg;
614 uncor_lines[j].b = normalize_safe(pm.dir, unit4());
615
616 uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b);
617 uncor_plines[j].bs = uncor_lines[j].b;
618
619 samec_lines[j].a = vfloat4::zero();
620 samec_lines[j].b = normalize_safe(pm.avg, unit4());
621
622 samec_plines[j].amod = vfloat4::zero();
623 samec_plines[j].bs = samec_lines[j].b;
624 }
625
626 float uncor_error = 0.0f;
627 float samec_error = 0.0f;
628
629 compute_error_squared_rgba(pi,
630 blk,
631 uncor_plines,
632 samec_plines,
633 line_lengths,
634 uncor_error,
635 samec_error);
636
637 // Compute an estimate of error introduced by weight quantization imprecision.
638 // This error is computed as follows, for each partition
639 // 1: compute the principal-axis vector (full length) in error-space
640 // 2: convert the principal-axis vector to regular RGB-space
641 // 3: scale the vector by a constant that estimates average quantization error
642 // 4: for each texel, square the vector, then do a dot-product with the texel's
643 // error weight; sum up the results across all texels.
644 // 4(optimized): square the vector once, then do a dot-product with the average
645 // texel error, then multiply by the number of texels.
646
647 for (unsigned int j = 0; j < partition_count; j++)
648 {
649 float tpp = static_cast<float>(pi.partition_texel_count[j]);
650 vfloat4 error_weights(tpp * weight_imprecision_estim);
651
652 vfloat4 uncor_vector = uncor_lines[j].b * line_lengths[j];
653 vfloat4 samec_vector = samec_lines[j].b * line_lengths[j];
654
655 uncor_error += dot_s(uncor_vector * uncor_vector, error_weights);
656 samec_error += dot_s(samec_vector * samec_vector, error_weights);
657 }
658
659 insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
660 insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
661 }
662 }
663 else
664 {
665 for (unsigned int i = 0; i < partition_search_limit; i++)
666 {
667 unsigned int partition = partition_sequence[i];
668 const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
669
670 // Compute weighting to give to each component in each partition
671 partition_metrics pms[BLOCK_MAX_PARTITIONS];
672 compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
673
674 partition_lines3 plines[BLOCK_MAX_PARTITIONS];
675
676 for (unsigned int j = 0; j < partition_count; j++)
677 {
678 partition_metrics& pm = pms[j];
679 partition_lines3& pl = plines[j];
680
681 pl.uncor_line.a = pm.avg;
682 pl.uncor_line.b = normalize_safe(pm.dir, unit3());
683
684 pl.samec_line.a = vfloat4::zero();
685 pl.samec_line.b = normalize_safe(pm.avg, unit3());
686
687 pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b);
688 pl.uncor_pline.bs = pl.uncor_line.b;
689
690 pl.samec_pline.amod = vfloat4::zero();
691 pl.samec_pline.bs = pl.samec_line.b;
692 }
693
694 float uncor_error = 0.0f;
695 float samec_error = 0.0f;
696
697 compute_error_squared_rgb(pi,
698 blk,
699 plines,
700 uncor_error,
701 samec_error);
702
703 // Compute an estimate of error introduced by weight quantization imprecision.
704 // This error is computed as follows, for each partition
705 // 1: compute the principal-axis vector (full length) in error-space
706 // 2: convert the principal-axis vector to regular RGB-space
707 // 3: scale the vector by a constant that estimates average quantization error
708 // 4: for each texel, square the vector, then do a dot-product with the texel's
709 // error weight; sum up the results across all texels.
710 // 4(optimized): square the vector once, then do a dot-product with the average
711 // texel error, then multiply by the number of texels.
712
713 for (unsigned int j = 0; j < partition_count; j++)
714 {
715 partition_lines3& pl = plines[j];
716
717 float tpp = static_cast<float>(pi.partition_texel_count[j]);
718 vfloat4 error_weights(tpp * weight_imprecision_estim);
719
720 vfloat4 uncor_vector = pl.uncor_line.b * pl.line_length;
721 vfloat4 samec_vector = pl.samec_line.b * pl.line_length;
722
723 uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights);
724 samec_error += dot3_s(samec_vector * samec_vector, error_weights);
725 }
726
727 insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
728 insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
729 }
730 }
731
732 unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES];
733 for (unsigned int i = 0; i < requested_candidates; i++)
734 {
735 interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
736 interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
737 }
738
739 uint64_t bitmasks[1024/64] { 0 };
740 unsigned int emitted = 0;
741
742 // Deduplicate the first "requested" entries
743 for (unsigned int i = 0; i < requested_candidates * 2; i++)
744 {
745 unsigned int partition = interleave[i];
746
747 unsigned int word = partition / 64;
748 unsigned int bit = partition % 64;
749
750 bool written = bitmasks[word] & (1ull << bit);
751
752 if (!written)
753 {
754 best_partitions[emitted] = partition;
755 bitmasks[word] |= 1ull << bit;
756 emitted++;
757
758 if (emitted == requested_candidates)
759 {
760 break;
761 }
762 }
763 }
764
765 return emitted;
766}
767
768#endif
769