// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief Functions for finding dominant direction of a set of colors.
 */
#if !defined(ASTCENC_DECOMPRESS_ONLY)

#include "astcenc_internal.h"

#include <cassert>

/**
 * @brief Compute the average RGB color of each partition.
 *
 * The algorithm here uses a vectorized sequential scan and per-partition
 * color accumulators, using select() to mask texel lanes in other partitions.
 *
 * We only accumulate sums for N-1 partitions during the scan; the value for
 * the last partition can be computed given that we know the block-wide average
 * already.
 *
 * Because of this we could reduce the loop iteration count so it "just" spans
 * the max texel index needed for the N-1 partitions, which could need fewer
 * iterations than the full block texel count. However, this makes the loop
 * count erratic and causes more branch mispredictions so is a net loss.
 *
 * @param      pi        The partitioning to use.
 * @param      blk       The block data to process.
 * @param[out] averages  The output averages. Unused partition indices will
 *                       not be initialized, and lane<3> will be zero.
 */
static void compute_partition_averages_rgb(
	const partition_info& pi,
	const image_block& blk,
	vfloat4 averages[BLOCK_MAX_PARTITIONS]
) {
	unsigned int partition_count = pi.partition_count;
	unsigned int texel_count = blk.texel_count;
	promise(texel_count > 0);

	// For 1 partition just use the precomputed mean
	if (partition_count == 1)
	{
		averages[0] = blk.data_mean.swz<0, 1, 2>();
	}
	// For 2 partitions scan results for partition 0, compute partition 1
	else if (partition_count == 2)
	{
		vfloatacc pp_avg_rgb[3] {};

		vint lane_id = vint::lane_id();
		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vint texel_partition(pi.partition_of_texel + i);

			vmask lane_mask = lane_id < vint(texel_count);
			lane_id += vint(ASTCENC_SIMD_WIDTH);

			vmask p0_mask = lane_mask & (texel_partition == vint(0));

			vfloat data_r = loada(blk.data_r + i);
			haccumulate(pp_avg_rgb[0], data_r, p0_mask);

			vfloat data_g = loada(blk.data_g + i);
			haccumulate(pp_avg_rgb[1], data_g, p0_mask);

			vfloat data_b = loada(blk.data_b + i);
			haccumulate(pp_avg_rgb[2], data_b, p0_mask);
		}

		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);

		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0]),
		                           hadd_s(pp_avg_rgb[1]),
		                           hadd_s(pp_avg_rgb[2]));

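		// The partition 1 sum can be derived from the block-wide sum, so the scan
		// above only needed to visit texels assigned to partition 0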
		vfloat4 p1_total = block_total - p0_total;

		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
	}
	// For 3 partitions scan results for partition 0/1, compute partition 2
	else if (partition_count == 3)
	{
		vfloatacc pp_avg_rgb[2][3] {};

		vint lane_id = vint::lane_id();
		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vint texel_partition(pi.partition_of_texel + i);

			vmask lane_mask = lane_id < vint(texel_count);
			lane_id += vint(ASTCENC_SIMD_WIDTH);

			vmask p0_mask = lane_mask & (texel_partition == vint(0));
			vmask p1_mask = lane_mask & (texel_partition == vint(1));

			vfloat data_r = loada(blk.data_r + i);
			haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
			haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);

			vfloat data_g = loada(blk.data_g + i);
			haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
			haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);

			vfloat data_b = loada(blk.data_b + i);
			haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
			haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
		}

		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);

		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
		                           hadd_s(pp_avg_rgb[0][1]),
		                           hadd_s(pp_avg_rgb[0][2]));

		vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
		                           hadd_s(pp_avg_rgb[1][1]),
		                           hadd_s(pp_avg_rgb[1][2]));

		vfloat4 p2_total = block_total - p0_total - p1_total;

		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
	}
	else
	{
		// For 4 partitions scan results for partition 0/1/2, compute partition 3
		vfloatacc pp_avg_rgb[3][3] {};

		vint lane_id = vint::lane_id();
		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vint texel_partition(pi.partition_of_texel + i);

			vmask lane_mask = lane_id < vint(texel_count);
			lane_id += vint(ASTCENC_SIMD_WIDTH);

			vmask p0_mask = lane_mask & (texel_partition == vint(0));
			vmask p1_mask = lane_mask & (texel_partition == vint(1));
			vmask p2_mask = lane_mask & (texel_partition == vint(2));

			vfloat data_r = loada(blk.data_r + i);
			haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
			haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
			haccumulate(pp_avg_rgb[2][0], data_r, p2_mask);

			vfloat data_g = loada(blk.data_g + i);
			haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
			haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
			haccumulate(pp_avg_rgb[2][1], data_g, p2_mask);

			vfloat data_b = loada(blk.data_b + i);
			haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
			haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
			haccumulate(pp_avg_rgb[2][2], data_b, p2_mask);
		}

		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);

		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
		                           hadd_s(pp_avg_rgb[0][1]),
		                           hadd_s(pp_avg_rgb[0][2]));

		vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
		                           hadd_s(pp_avg_rgb[1][1]),
		                           hadd_s(pp_avg_rgb[1][2]));

		vfloat4 p2_total = vfloat3(hadd_s(pp_avg_rgb[2][0]),
		                           hadd_s(pp_avg_rgb[2][1]),
		                           hadd_s(pp_avg_rgb[2][2]));

		vfloat4 p3_total = block_total - p0_total - p1_total - p2_total;

		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
		averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
	}
}

/**
 * @brief Compute the average RGBA color of each partition.
 *
 * The algorithm here uses a vectorized sequential scan and per-partition
 * color accumulators, using select() to mask texel lanes in other partitions.
 *
 * We only accumulate sums for N-1 partitions during the scan; the value for
 * the last partition can be computed given that we know the block-wide average
 * already.
 *
 * Because of this we could reduce the loop iteration count so it "just" spans
 * the max texel index needed for the N-1 partitions, which could need fewer
 * iterations than the full block texel count. However, this makes the loop
 * count erratic and causes more branch mispredictions so is a net loss.
 *
 * @param      pi        The partitioning to use.
 * @param      blk       The block data to process.
 * @param[out] averages  The output averages. Unused partition indices will
 *                       not be initialized.
 */
static void compute_partition_averages_rgba(
	const partition_info& pi,
	const image_block& blk,
	vfloat4 averages[BLOCK_MAX_PARTITIONS]
) {
	unsigned int partition_count = pi.partition_count;
	unsigned int texel_count = blk.texel_count;
	promise(texel_count > 0);

	// For 1 partition just use the precomputed mean
	if (partition_count == 1)
	{
		averages[0] = blk.data_mean;
	}
	// For 2 partitions scan results for partition 0, compute partition 1
	else if (partition_count == 2)
	{
		vfloatacc pp_avg_rgba[4] {};

		vint lane_id = vint::lane_id();
		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vint texel_partition(pi.partition_of_texel + i);

			vmask lane_mask = lane_id < vint(texel_count);
			lane_id += vint(ASTCENC_SIMD_WIDTH);

			vmask p0_mask = lane_mask & (texel_partition == vint(0));

			vfloat data_r = loada(blk.data_r + i);
			haccumulate(pp_avg_rgba[0], data_r, p0_mask);

			vfloat data_g = loada(blk.data_g + i);
			haccumulate(pp_avg_rgba[1], data_g, p0_mask);

			vfloat data_b = loada(blk.data_b + i);
			haccumulate(pp_avg_rgba[2], data_b, p0_mask);

			vfloat data_a = loada(blk.data_a + i);
			haccumulate(pp_avg_rgba[3], data_a, p0_mask);
		}

		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);

		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0]),
		                           hadd_s(pp_avg_rgba[1]),
		                           hadd_s(pp_avg_rgba[2]),
		                           hadd_s(pp_avg_rgba[3]));

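		// As in the RGB helper, derive the partition 1 sum from the block-wide total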
		vfloat4 p1_total = block_total - p0_total;

		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
	}
	// For 3 partitions scan results for partition 0/1, compute partition 2
	else if (partition_count == 3)
	{
		vfloatacc pp_avg_rgba[2][4] {};

		vint lane_id = vint::lane_id();
		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vint texel_partition(pi.partition_of_texel + i);

			vmask lane_mask = lane_id < vint(texel_count);
			lane_id += vint(ASTCENC_SIMD_WIDTH);

			vmask p0_mask = lane_mask & (texel_partition == vint(0));
			vmask p1_mask = lane_mask & (texel_partition == vint(1));

			vfloat data_r = loada(blk.data_r + i);
			haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
			haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);

			vfloat data_g = loada(blk.data_g + i);
			haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
			haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);

			vfloat data_b = loada(blk.data_b + i);
			haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
			haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);

			vfloat data_a = loada(blk.data_a + i);
			haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
			haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
		}

		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);

		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
		                           hadd_s(pp_avg_rgba[0][1]),
		                           hadd_s(pp_avg_rgba[0][2]),
		                           hadd_s(pp_avg_rgba[0][3]));

		vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
		                           hadd_s(pp_avg_rgba[1][1]),
		                           hadd_s(pp_avg_rgba[1][2]),
		                           hadd_s(pp_avg_rgba[1][3]));

		vfloat4 p2_total = block_total - p0_total - p1_total;

		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
	}
	else
	{
		// For 4 partitions scan results for partition 0/1/2, compute partition 3
		vfloatacc pp_avg_rgba[3][4] {};

		vint lane_id = vint::lane_id();
		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vint texel_partition(pi.partition_of_texel + i);

			vmask lane_mask = lane_id < vint(texel_count);
			lane_id += vint(ASTCENC_SIMD_WIDTH);

			vmask p0_mask = lane_mask & (texel_partition == vint(0));
			vmask p1_mask = lane_mask & (texel_partition == vint(1));
			vmask p2_mask = lane_mask & (texel_partition == vint(2));

			vfloat data_r = loada(blk.data_r + i);
			haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
			haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
			haccumulate(pp_avg_rgba[2][0], data_r, p2_mask);

			vfloat data_g = loada(blk.data_g + i);
			haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
			haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
			haccumulate(pp_avg_rgba[2][1], data_g, p2_mask);

			vfloat data_b = loada(blk.data_b + i);
			haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
			haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
			haccumulate(pp_avg_rgba[2][2], data_b, p2_mask);

			vfloat data_a = loada(blk.data_a + i);
			haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
			haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
			haccumulate(pp_avg_rgba[2][3], data_a, p2_mask);
		}

		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);

		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
		                           hadd_s(pp_avg_rgba[0][1]),
		                           hadd_s(pp_avg_rgba[0][2]),
		                           hadd_s(pp_avg_rgba[0][3]));

		vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
		                           hadd_s(pp_avg_rgba[1][1]),
		                           hadd_s(pp_avg_rgba[1][2]),
		                           hadd_s(pp_avg_rgba[1][3]));

		vfloat4 p2_total = vfloat4(hadd_s(pp_avg_rgba[2][0]),
		                           hadd_s(pp_avg_rgba[2][1]),
		                           hadd_s(pp_avg_rgba[2][2]),
		                           hadd_s(pp_avg_rgba[2][3]));

		vfloat4 p3_total = block_total - p0_total - p1_total - p2_total;

		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
		averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
	}
}

/* See header for documentation. */
void compute_avgs_and_dirs_4_comp(
	const partition_info& pi,
	const image_block& blk,
	partition_metrics pm[BLOCK_MAX_PARTITIONS]
) {
	int partition_count = pi.partition_count;
	promise(partition_count > 0);

	// Pre-compute partition_averages
	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
	compute_partition_averages_rgba(pi, blk, partition_averages);

	for (int partition = 0; partition < partition_count; partition++)
	{
		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
		unsigned int texel_count = pi.partition_texel_count[partition];
		promise(texel_count > 0);

		vfloat4 average = partition_averages[partition];
		pm[partition].avg = average;

		vfloat4 sum_xp = vfloat4::zero();
		vfloat4 sum_yp = vfloat4::zero();
		vfloat4 sum_zp = vfloat4::zero();
		vfloat4 sum_wp = vfloat4::zero();

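		// Build four candidate direction vectors, one per channel: each sums the
		// mean-relative offsets of only those texels whose offset is positive in
		// that channel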
		for (unsigned int i = 0; i < texel_count; i++)
		{
			unsigned int iwt = texel_indexes[i];
			vfloat4 texel_datum = blk.texel(iwt);
			texel_datum = texel_datum - average;

			vfloat4 zero = vfloat4::zero();

			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
			sum_xp += select(zero, texel_datum, tdm0);

			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
			sum_yp += select(zero, texel_datum, tdm1);

			vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
			sum_zp += select(zero, texel_datum, tdm2);

			vmask4 tdm3 = texel_datum.swz<3,3,3,3>() > zero;
			sum_wp += select(zero, texel_datum, tdm3);
		}

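		// Keep the candidate with the largest squared length as the dominant
		// direction estimate for this partition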
		vfloat4 prod_xp = dot(sum_xp, sum_xp);
		vfloat4 prod_yp = dot(sum_yp, sum_yp);
		vfloat4 prod_zp = dot(sum_zp, sum_zp);
		vfloat4 prod_wp = dot(sum_wp, sum_wp);

		vfloat4 best_vector = sum_xp;
		vfloat4 best_sum = prod_xp;

		vmask4 mask = prod_yp > best_sum;
		best_vector = select(best_vector, sum_yp, mask);
		best_sum = select(best_sum, prod_yp, mask);

		mask = prod_zp > best_sum;
		best_vector = select(best_vector, sum_zp, mask);
		best_sum = select(best_sum, prod_zp, mask);

		mask = prod_wp > best_sum;
		best_vector = select(best_vector, sum_wp, mask);

		pm[partition].dir = best_vector;
	}
}

/* See header for documentation. */
void compute_avgs_and_dirs_3_comp(
	const partition_info& pi,
	const image_block& blk,
	unsigned int omitted_component,
	partition_metrics pm[BLOCK_MAX_PARTITIONS]
) {
	// Pre-compute partition_averages
	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
	compute_partition_averages_rgba(pi, blk, partition_averages);

	const float* data_vr = blk.data_r;
	const float* data_vg = blk.data_g;
	const float* data_vb = blk.data_b;

	// TODO: Data-driven permute would be useful to avoid this ...
	if (omitted_component == 0)
	{
		partition_averages[0] = partition_averages[0].swz<1, 2, 3>();
		partition_averages[1] = partition_averages[1].swz<1, 2, 3>();
		partition_averages[2] = partition_averages[2].swz<1, 2, 3>();
		partition_averages[3] = partition_averages[3].swz<1, 2, 3>();

		data_vr = blk.data_g;
		data_vg = blk.data_b;
		data_vb = blk.data_a;
	}
	else if (omitted_component == 1)
	{
		partition_averages[0] = partition_averages[0].swz<0, 2, 3>();
		partition_averages[1] = partition_averages[1].swz<0, 2, 3>();
		partition_averages[2] = partition_averages[2].swz<0, 2, 3>();
		partition_averages[3] = partition_averages[3].swz<0, 2, 3>();

		data_vg = blk.data_b;
		data_vb = blk.data_a;
	}
	else if (omitted_component == 2)
	{
		partition_averages[0] = partition_averages[0].swz<0, 1, 3>();
		partition_averages[1] = partition_averages[1].swz<0, 1, 3>();
		partition_averages[2] = partition_averages[2].swz<0, 1, 3>();
		partition_averages[3] = partition_averages[3].swz<0, 1, 3>();

		data_vb = blk.data_a;
	}
	else
	{
		partition_averages[0] = partition_averages[0].swz<0, 1, 2>();
		partition_averages[1] = partition_averages[1].swz<0, 1, 2>();
		partition_averages[2] = partition_averages[2].swz<0, 1, 2>();
		partition_averages[3] = partition_averages[3].swz<0, 1, 2>();
	}

	unsigned int partition_count = pi.partition_count;
	promise(partition_count > 0);

	for (unsigned int partition = 0; partition < partition_count; partition++)
	{
		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
		unsigned int texel_count = pi.partition_texel_count[partition];
		promise(texel_count > 0);

		vfloat4 average = partition_averages[partition];
		pm[partition].avg = average;

		vfloat4 sum_xp = vfloat4::zero();
		vfloat4 sum_yp = vfloat4::zero();
		vfloat4 sum_zp = vfloat4::zero();

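		// Same positive-offset candidate scheme as compute_avgs_and_dirs_4_comp,
		// applied to the three remaining channels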
		for (unsigned int i = 0; i < texel_count; i++)
		{
			unsigned int iwt = texel_indexes[i];

			vfloat4 texel_datum = vfloat3(data_vr[iwt],
			                              data_vg[iwt],
			                              data_vb[iwt]);
			texel_datum = texel_datum - average;

			vfloat4 zero = vfloat4::zero();

			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
			sum_xp += select(zero, texel_datum, tdm0);

			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
			sum_yp += select(zero, texel_datum, tdm1);

			vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
			sum_zp += select(zero, texel_datum, tdm2);
		}

		vfloat4 prod_xp = dot(sum_xp, sum_xp);
		vfloat4 prod_yp = dot(sum_yp, sum_yp);
		vfloat4 prod_zp = dot(sum_zp, sum_zp);

		vfloat4 best_vector = sum_xp;
		vfloat4 best_sum = prod_xp;

		vmask4 mask = prod_yp > best_sum;
		best_vector = select(best_vector, sum_yp, mask);
		best_sum = select(best_sum, prod_yp, mask);

		mask = prod_zp > best_sum;
		best_vector = select(best_vector, sum_zp, mask);

		pm[partition].dir = best_vector;
	}
}

/* See header for documentation. */
void compute_avgs_and_dirs_3_comp_rgb(
	const partition_info& pi,
	const image_block& blk,
	partition_metrics pm[BLOCK_MAX_PARTITIONS]
) {
	unsigned int partition_count = pi.partition_count;
	promise(partition_count > 0);

	// Pre-compute partition_averages
	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
	compute_partition_averages_rgb(pi, blk, partition_averages);

	for (unsigned int partition = 0; partition < partition_count; partition++)
	{
		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
		unsigned int texel_count = pi.partition_texel_count[partition];
		promise(texel_count > 0);

		vfloat4 average = partition_averages[partition];
		pm[partition].avg = average;

		vfloat4 sum_xp = vfloat4::zero();
		vfloat4 sum_yp = vfloat4::zero();
		vfloat4 sum_zp = vfloat4::zero();

		for (unsigned int i = 0; i < texel_count; i++)
		{
			unsigned int iwt = texel_indexes[i];

			vfloat4 texel_datum = blk.texel3(iwt);
			texel_datum = texel_datum - average;

			vfloat4 zero = vfloat4::zero();

			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
			sum_xp += select(zero, texel_datum, tdm0);

			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
			sum_yp += select(zero, texel_datum, tdm1);

			vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
			sum_zp += select(zero, texel_datum, tdm2);
		}

		vfloat4 prod_xp = dot(sum_xp, sum_xp);
		vfloat4 prod_yp = dot(sum_yp, sum_yp);
		vfloat4 prod_zp = dot(sum_zp, sum_zp);

		vfloat4 best_vector = sum_xp;
		vfloat4 best_sum = prod_xp;

		vmask4 mask = prod_yp > best_sum;
		best_vector = select(best_vector, sum_yp, mask);
		best_sum = select(best_sum, prod_yp, mask);

		mask = prod_zp > best_sum;
		best_vector = select(best_vector, sum_zp, mask);

		pm[partition].dir = best_vector;
	}
}

/* See header for documentation. */
void compute_avgs_and_dirs_2_comp(
	const partition_info& pt,
	const image_block& blk,
	unsigned int component1,
	unsigned int component2,
	partition_metrics pm[BLOCK_MAX_PARTITIONS]
) {
	vfloat4 average;

	const float* data_vr = nullptr;
	const float* data_vg = nullptr;

	if (component1 == 0 && component2 == 1)
	{
		average = blk.data_mean.swz<0, 1>();

		data_vr = blk.data_r;
		data_vg = blk.data_g;
	}
	else if (component1 == 0 && component2 == 2)
	{
		average = blk.data_mean.swz<0, 2>();

		data_vr = blk.data_r;
		data_vg = blk.data_b;
	}
	else // (component1 == 1 && component2 == 2)
	{
		assert(component1 == 1 && component2 == 2);

		average = blk.data_mean.swz<1, 2>();

		data_vr = blk.data_g;
		data_vg = blk.data_b;
	}

	unsigned int partition_count = pt.partition_count;
	promise(partition_count > 0);

	for (unsigned int partition = 0; partition < partition_count; partition++)
	{
		const uint8_t *texel_indexes = pt.texels_of_partition[partition];
		unsigned int texel_count = pt.partition_texel_count[partition];
		promise(texel_count > 0);

		// Only compute a partition mean if more than one partition
		if (partition_count > 1)
		{
			average = vfloat4::zero();
			for (unsigned int i = 0; i < texel_count; i++)
			{
				unsigned int iwt = texel_indexes[i];
				average += vfloat2(data_vr[iwt], data_vg[iwt]);
			}

			average = average / static_cast<float>(texel_count);
		}

		pm[partition].avg = average;

		vfloat4 sum_xp = vfloat4::zero();
		vfloat4 sum_yp = vfloat4::zero();

		for (unsigned int i = 0; i < texel_count; i++)
		{
			unsigned int iwt = texel_indexes[i];
			vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]);
			texel_datum = texel_datum - average;

			vfloat4 zero = vfloat4::zero();

			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
			sum_xp += select(zero, texel_datum, tdm0);

			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
			sum_yp += select(zero, texel_datum, tdm1);
		}

		vfloat4 prod_xp = dot(sum_xp, sum_xp);
		vfloat4 prod_yp = dot(sum_yp, sum_yp);

		vfloat4 best_vector = sum_xp;
		vfloat4 best_sum = prod_xp;

		vmask4 mask = prod_yp > best_sum;
		best_vector = select(best_vector, sum_yp, mask);

		pm[partition].dir = best_vector;
	}
}

/* See header for documentation. */
void compute_error_squared_rgba(
	const partition_info& pi,
	const image_block& blk,
	const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
	const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
	float line_lengths[BLOCK_MAX_PARTITIONS],
	float& uncor_error,
	float& samec_error
) {
	unsigned int partition_count = pi.partition_count;
	promise(partition_count > 0);

	vfloatacc uncor_errorsumv = vfloatacc::zero();
	vfloatacc samec_errorsumv = vfloatacc::zero();

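	// For each partition accumulate the squared error of every texel against two
	// candidate lines: the uncorrelated line ("uncor") and the same-chroma line
	// ("samec"), the latter passing through the origin (zero amod, asserted below)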
	for (unsigned int partition = 0; partition < partition_count; partition++)
	{
		const uint8_t *texel_indexes = pi.texels_of_partition[partition];

		processed_line4 l_uncor = uncor_plines[partition];
		processed_line4 l_samec = samec_plines[partition];

		unsigned int texel_count = pi.partition_texel_count[partition];
		promise(texel_count > 0);

		// Vectorize some useful scalar inputs
		vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
		vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
		vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
		vfloat l_uncor_bs3(l_uncor.bs.lane<3>());

		vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
		vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
		vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
		vfloat l_uncor_amod3(l_uncor.amod.lane<3>());

		vfloat l_samec_bs0(l_samec.bs.lane<0>());
		vfloat l_samec_bs1(l_samec.bs.lane<1>());
		vfloat l_samec_bs2(l_samec.bs.lane<2>());
		vfloat l_samec_bs3(l_samec.bs.lane<3>());

		assert(all(l_samec.amod == vfloat4(0.0f)));

		vfloat uncor_loparamv(1e10f);
		vfloat uncor_hiparamv(-1e10f);

		vfloat ew_r(blk.channel_weight.lane<0>());
		vfloat ew_g(blk.channel_weight.lane<1>());
		vfloat ew_b(blk.channel_weight.lane<2>());
		vfloat ew_a(blk.channel_weight.lane<3>());

		// This implementation over-shoots, but this is safe as we initialize the texel_indexes
		// array to extend the last value. This means min/max are not impacted, but we need to mask
		// out the dummy values when we compute the line weighting.
		vint lane_ids = vint::lane_id();
		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vmask mask = lane_ids < vint(texel_count);
			vint texel_idxs(texel_indexes + i);

			vfloat data_r = gatherf(blk.data_r, texel_idxs);
			vfloat data_g = gatherf(blk.data_g, texel_idxs);
			vfloat data_b = gatherf(blk.data_b, texel_idxs);
			vfloat data_a = gatherf(blk.data_a, texel_idxs);

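			// Compute the projection parameter of the texel along the uncorrelated
			// line; its min/max range across the partition later gives the line length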
			vfloat uncor_param = (data_r * l_uncor_bs0)
			                   + (data_g * l_uncor_bs1)
			                   + (data_b * l_uncor_bs2)
			                   + (data_a * l_uncor_bs3);

			uncor_loparamv = min(uncor_param, uncor_loparamv);
			uncor_hiparamv = max(uncor_param, uncor_hiparamv);

			vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
			                   + (uncor_param * l_uncor_bs0);
			vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
			                   + (uncor_param * l_uncor_bs1);
			vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
			                   + (uncor_param * l_uncor_bs2);
			vfloat uncor_dist3 = (l_uncor_amod3 - data_a)
			                   + (uncor_param * l_uncor_bs3);

			vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
			                 + (ew_g * uncor_dist1 * uncor_dist1)
			                 + (ew_b * uncor_dist2 * uncor_dist2)
			                 + (ew_a * uncor_dist3 * uncor_dist3);

			haccumulate(uncor_errorsumv, uncor_err, mask);

			// Process samechroma data
			vfloat samec_param = (data_r * l_samec_bs0)
			                   + (data_g * l_samec_bs1)
			                   + (data_b * l_samec_bs2)
			                   + (data_a * l_samec_bs3);

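			// The same-chroma line has a zero amod offset (asserted above), so the
			// texel-to-line offset reduces to param * bs - data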
			vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
			vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
			vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
			vfloat samec_dist3 = samec_param * l_samec_bs3 - data_a;

			vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
			                 + (ew_g * samec_dist1 * samec_dist1)
			                 + (ew_b * samec_dist2 * samec_dist2)
			                 + (ew_a * samec_dist3 * samec_dist3);

			haccumulate(samec_errorsumv, samec_err, mask);

			lane_ids += vint(ASTCENC_SIMD_WIDTH);
		}

		// Turn very small numbers and NaNs into a small number
		float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
		line_lengths[partition] = astc::max(uncor_linelen, 1e-7f);
	}

	uncor_error = hadd_s(uncor_errorsumv);
	samec_error = hadd_s(samec_errorsumv);
}

/* See header for documentation. */
void compute_error_squared_rgb(
	const partition_info& pi,
	const image_block& blk,
	partition_lines3 plines[BLOCK_MAX_PARTITIONS],
	float& uncor_error,
	float& samec_error
) {
	unsigned int partition_count = pi.partition_count;
	promise(partition_count > 0);

	vfloatacc uncor_errorsumv = vfloatacc::zero();
	vfloatacc samec_errorsumv = vfloatacc::zero();

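	// Mirrors compute_error_squared_rgba, restricted to the three color channels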
	for (unsigned int partition = 0; partition < partition_count; partition++)
	{
		partition_lines3& pl = plines[partition];
		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
		unsigned int texel_count = pi.partition_texel_count[partition];
		promise(texel_count > 0);

		processed_line3 l_uncor = pl.uncor_pline;
		processed_line3 l_samec = pl.samec_pline;

		// Vectorize some useful scalar inputs
		vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
		vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
		vfloat l_uncor_bs2(l_uncor.bs.lane<2>());

		vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
		vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
		vfloat l_uncor_amod2(l_uncor.amod.lane<2>());

		vfloat l_samec_bs0(l_samec.bs.lane<0>());
		vfloat l_samec_bs1(l_samec.bs.lane<1>());
		vfloat l_samec_bs2(l_samec.bs.lane<2>());

		assert(all(l_samec.amod == vfloat4(0.0f)));

		vfloat uncor_loparamv(1e10f);
		vfloat uncor_hiparamv(-1e10f);

		vfloat ew_r(blk.channel_weight.lane<0>());
		vfloat ew_g(blk.channel_weight.lane<1>());
		vfloat ew_b(blk.channel_weight.lane<2>());

		// This implementation over-shoots, but this is safe as we initialize the texel_indexes
		// array to extend the last value. This means min/max are not impacted, but we need to mask
		// out the dummy values when we compute the line weighting.
		vint lane_ids = vint::lane_id();
		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vmask mask = lane_ids < vint(texel_count);
			vint texel_idxs(texel_indexes + i);

			vfloat data_r = gatherf(blk.data_r, texel_idxs);
			vfloat data_g = gatherf(blk.data_g, texel_idxs);
			vfloat data_b = gatherf(blk.data_b, texel_idxs);

			vfloat uncor_param = (data_r * l_uncor_bs0)
			                   + (data_g * l_uncor_bs1)
			                   + (data_b * l_uncor_bs2);

			uncor_loparamv = min(uncor_param, uncor_loparamv);
			uncor_hiparamv = max(uncor_param, uncor_hiparamv);

			vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
			                   + (uncor_param * l_uncor_bs0);
			vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
			                   + (uncor_param * l_uncor_bs1);
			vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
			                   + (uncor_param * l_uncor_bs2);

			vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
			                 + (ew_g * uncor_dist1 * uncor_dist1)
			                 + (ew_b * uncor_dist2 * uncor_dist2);

			haccumulate(uncor_errorsumv, uncor_err, mask);

			// Process samechroma data
			vfloat samec_param = (data_r * l_samec_bs0)
			                   + (data_g * l_samec_bs1)
			                   + (data_b * l_samec_bs2);

			vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
			vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
			vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;

			vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
			                 + (ew_g * samec_dist1 * samec_dist1)
			                 + (ew_b * samec_dist2 * samec_dist2);

			haccumulate(samec_errorsumv, samec_err, mask);

			lane_ids += vint(ASTCENC_SIMD_WIDTH);
		}

		// Turn very small numbers and NaNs into a small number
		float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
		pl.line_length = astc::max(uncor_linelen, 1e-7f);
	}

	uncor_error = hadd_s(uncor_errorsumv);
	samec_error = hadd_s(samec_errorsumv);
}

#endif