1 | // SPDX-License-Identifier: Apache-2.0 |
2 | // ---------------------------------------------------------------------------- |
3 | // Copyright 2011-2023 Arm Limited |
4 | // |
5 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
6 | // use this file except in compliance with the License. You may obtain a copy |
7 | // of the License at: |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, software |
12 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
13 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
14 | // License for the specific language governing permissions and limitations |
15 | // under the License. |
16 | // ---------------------------------------------------------------------------- |
17 | |
18 | #if !defined(ASTCENC_DECOMPRESS_ONLY) |
19 | |
20 | /** |
21 | * @brief Functions for computing color endpoints and texel weights. |
22 | */ |
23 | |
24 | #include <cassert> |
25 | |
26 | #include "astcenc_internal.h" |
27 | #include "astcenc_vecmathlib.h" |
28 | |
29 | /** |
30 | * @brief Compute the infilled weight for N texel indices in a decimated grid. |
31 | * |
32 | * @param di The weight grid decimation to use. |
33 | * @param weights The decimated weight values to use. |
34 | * @param index The first texel index to interpolate. |
35 | * |
36 | * @return The interpolated weight for the given set of SIMD_WIDTH texels. |
37 | */ |
38 | static vfloat bilinear_infill_vla( |
39 | const decimation_info& di, |
40 | const float* weights, |
41 | unsigned int index |
42 | ) { |
43 | // Load the bilinear filter texel weight indexes in the decimated grid |
44 | vint weight_idx0 = vint(di.texel_weights_tr[0] + index); |
45 | vint weight_idx1 = vint(di.texel_weights_tr[1] + index); |
46 | vint weight_idx2 = vint(di.texel_weights_tr[2] + index); |
47 | vint weight_idx3 = vint(di.texel_weights_tr[3] + index); |
48 | |
49 | // Load the bilinear filter weights from the decimated grid |
50 | vfloat weight_val0 = gatherf(weights, weight_idx0); |
51 | vfloat weight_val1 = gatherf(weights, weight_idx1); |
52 | vfloat weight_val2 = gatherf(weights, weight_idx2); |
53 | vfloat weight_val3 = gatherf(weights, weight_idx3); |
54 | |
55 | // Load the weight contribution factors for each decimated weight |
56 | vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index); |
57 | vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index); |
58 | vfloat tex_weight_float2 = loada(di.texel_weight_contribs_float_tr[2] + index); |
59 | vfloat tex_weight_float3 = loada(di.texel_weight_contribs_float_tr[3] + index); |
60 | |
61 | // Compute the bilinear interpolation to generate the per-texel weight |
62 | return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1) + |
63 | (weight_val2 * tex_weight_float2 + weight_val3 * tex_weight_float3); |
64 | } |
65 | |
66 | /** |
67 | * @brief Compute the infilled weight for N texel indices in a decimated grid. |
68 | * |
69 | * This is specialized version which computes only two weights per texel for |
70 | * encodings that are only decimated in a single axis. |
71 | * |
72 | * @param di The weight grid decimation to use. |
73 | * @param weights The decimated weight values to use. |
74 | * @param index The first texel index to interpolate. |
75 | * |
76 | * @return The interpolated weight for the given set of SIMD_WIDTH texels. |
77 | */ |
78 | static vfloat bilinear_infill_vla_2( |
79 | const decimation_info& di, |
80 | const float* weights, |
81 | unsigned int index |
82 | ) { |
83 | // Load the bilinear filter texel weight indexes in the decimated grid |
84 | vint weight_idx0 = vint(di.texel_weights_tr[0] + index); |
85 | vint weight_idx1 = vint(di.texel_weights_tr[1] + index); |
86 | |
87 | // Load the bilinear filter weights from the decimated grid |
88 | vfloat weight_val0 = gatherf(weights, weight_idx0); |
89 | vfloat weight_val1 = gatherf(weights, weight_idx1); |
90 | |
91 | // Load the weight contribution factors for each decimated weight |
92 | vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index); |
93 | vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index); |
94 | |
95 | // Compute the bilinear interpolation to generate the per-texel weight |
96 | return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1); |
97 | } |
98 | |
99 | /** |
100 | * @brief Compute the ideal endpoints and weights for 1 color component. |
101 | * |
102 | * @param blk The image block color data to compress. |
103 | * @param pi The partition info for the current trial. |
104 | * @param[out] ei The computed ideal endpoints and weights. |
105 | * @param component The color component to compute. |
106 | */ |
107 | static void compute_ideal_colors_and_weights_1_comp( |
108 | const image_block& blk, |
109 | const partition_info& pi, |
110 | endpoints_and_weights& ei, |
111 | unsigned int component |
112 | ) { |
113 | unsigned int partition_count = pi.partition_count; |
114 | ei.ep.partition_count = partition_count; |
115 | promise(partition_count > 0); |
116 | |
117 | unsigned int texel_count = blk.texel_count; |
118 | promise(texel_count > 0); |
119 | |
120 | float error_weight; |
121 | const float* data_vr = nullptr; |
122 | |
123 | assert(component < BLOCK_MAX_COMPONENTS); |
124 | switch (component) |
125 | { |
126 | case 0: |
127 | error_weight = blk.channel_weight.lane<0>(); |
128 | data_vr = blk.data_r; |
129 | break; |
130 | case 1: |
131 | error_weight = blk.channel_weight.lane<1>(); |
132 | data_vr = blk.data_g; |
133 | break; |
134 | case 2: |
135 | error_weight = blk.channel_weight.lane<2>(); |
136 | data_vr = blk.data_b; |
137 | break; |
138 | default: |
139 | assert(component == 3); |
140 | error_weight = blk.channel_weight.lane<3>(); |
141 | data_vr = blk.data_a; |
142 | break; |
143 | } |
144 | |
145 | vmask4 sep_mask = vint4::lane_id() == vint4(component); |
146 | bool is_constant_wes { true }; |
147 | float partition0_len_sq { 0.0f }; |
148 | |
149 | for (unsigned int i = 0; i < partition_count; i++) |
150 | { |
151 | float lowvalue { 1e10f }; |
152 | float highvalue { -1e10f }; |
153 | |
154 | unsigned int partition_texel_count = pi.partition_texel_count[i]; |
155 | for (unsigned int j = 0; j < partition_texel_count; j++) |
156 | { |
157 | unsigned int tix = pi.texels_of_partition[i][j]; |
158 | float value = data_vr[tix]; |
159 | lowvalue = astc::min(value, lowvalue); |
160 | highvalue = astc::max(value, highvalue); |
161 | } |
162 | |
163 | if (highvalue <= lowvalue) |
164 | { |
165 | lowvalue = 0.0f; |
166 | highvalue = 1e-7f; |
167 | } |
168 | |
169 | float length = highvalue - lowvalue; |
170 | float length_squared = length * length; |
171 | float scale = 1.0f / length; |
172 | |
173 | if (i == 0) |
174 | { |
175 | partition0_len_sq = length_squared; |
176 | } |
177 | else |
178 | { |
179 | is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; |
180 | } |
181 | |
182 | for (unsigned int j = 0; j < partition_texel_count; j++) |
183 | { |
184 | unsigned int tix = pi.texels_of_partition[i][j]; |
185 | float value = (data_vr[tix] - lowvalue) * scale; |
186 | value = astc::clamp1f(value); |
187 | |
188 | ei.weights[tix] = value; |
189 | ei.weight_error_scale[tix] = length_squared * error_weight; |
190 | assert(!astc::isnan(ei.weight_error_scale[tix])); |
191 | } |
192 | |
193 | ei.ep.endpt0[i] = select(blk.data_min, vfloat4(lowvalue), sep_mask); |
194 | ei.ep.endpt1[i] = select(blk.data_max, vfloat4(highvalue), sep_mask); |
195 | } |
196 | |
197 | // Zero initialize any SIMD over-fetch |
198 | unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count); |
199 | for (unsigned int i = texel_count; i < texel_count_simd; i++) |
200 | { |
201 | ei.weights[i] = 0.0f; |
202 | ei.weight_error_scale[i] = 0.0f; |
203 | } |
204 | |
205 | ei.is_constant_weight_error_scale = is_constant_wes; |
206 | } |
207 | |
208 | /** |
209 | * @brief Compute the ideal endpoints and weights for 2 color components. |
210 | * |
211 | * @param blk The image block color data to compress. |
212 | * @param pi The partition info for the current trial. |
213 | * @param[out] ei The computed ideal endpoints and weights. |
214 | * @param component1 The first color component to compute. |
215 | * @param component2 The second color component to compute. |
216 | */ |
217 | static void compute_ideal_colors_and_weights_2_comp( |
218 | const image_block& blk, |
219 | const partition_info& pi, |
220 | endpoints_and_weights& ei, |
221 | int component1, |
222 | int component2 |
223 | ) { |
224 | unsigned int partition_count = pi.partition_count; |
225 | ei.ep.partition_count = partition_count; |
226 | promise(partition_count > 0); |
227 | |
228 | unsigned int texel_count = blk.texel_count; |
229 | promise(texel_count > 0); |
230 | |
231 | partition_metrics pms[BLOCK_MAX_PARTITIONS]; |
232 | |
233 | float error_weight; |
234 | const float* data_vr = nullptr; |
235 | const float* data_vg = nullptr; |
236 | |
237 | if (component1 == 0 && component2 == 1) |
238 | { |
239 | error_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f; |
240 | |
241 | data_vr = blk.data_r; |
242 | data_vg = blk.data_g; |
243 | } |
244 | else if (component1 == 0 && component2 == 2) |
245 | { |
246 | error_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f; |
247 | |
248 | data_vr = blk.data_r; |
249 | data_vg = blk.data_b; |
250 | } |
251 | else // (component1 == 1 && component2 == 2) |
252 | { |
253 | assert(component1 == 1 && component2 == 2); |
254 | |
255 | error_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f; |
256 | |
257 | data_vr = blk.data_g; |
258 | data_vg = blk.data_b; |
259 | } |
260 | |
261 | compute_avgs_and_dirs_2_comp(pi, blk, component1, component2, pms); |
262 | |
263 | bool is_constant_wes { true }; |
264 | float partition0_len_sq { 0.0f }; |
265 | |
266 | vmask4 comp1_mask = vint4::lane_id() == vint4(component1); |
267 | vmask4 comp2_mask = vint4::lane_id() == vint4(component2); |
268 | |
269 | for (unsigned int i = 0; i < partition_count; i++) |
270 | { |
271 | vfloat4 dir = pms[i].dir; |
272 | if (hadd_s(dir) < 0.0f) |
273 | { |
274 | dir = vfloat4::zero() - dir; |
275 | } |
276 | |
277 | line2 line { pms[i].avg, normalize_safe(dir, unit2()) }; |
278 | float lowparam { 1e10f }; |
279 | float highparam { -1e10f }; |
280 | |
281 | unsigned int partition_texel_count = pi.partition_texel_count[i]; |
282 | for (unsigned int j = 0; j < partition_texel_count; j++) |
283 | { |
284 | unsigned int tix = pi.texels_of_partition[i][j]; |
285 | vfloat4 point = vfloat2(data_vr[tix], data_vg[tix]); |
286 | float param = dot_s(point - line.a, line.b); |
287 | ei.weights[tix] = param; |
288 | |
289 | lowparam = astc::min(param, lowparam); |
290 | highparam = astc::max(param, highparam); |
291 | } |
292 | |
293 | // It is possible for a uniform-color partition to produce length=0; |
294 | // this causes NaN issues so set to small value to avoid this problem |
295 | if (highparam <= lowparam) |
296 | { |
297 | lowparam = 0.0f; |
298 | highparam = 1e-7f; |
299 | } |
300 | |
301 | float length = highparam - lowparam; |
302 | float length_squared = length * length; |
303 | float scale = 1.0f / length; |
304 | |
305 | if (i == 0) |
306 | { |
307 | partition0_len_sq = length_squared; |
308 | } |
309 | else |
310 | { |
311 | is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; |
312 | } |
313 | |
314 | for (unsigned int j = 0; j < partition_texel_count; j++) |
315 | { |
316 | unsigned int tix = pi.texels_of_partition[i][j]; |
317 | float idx = (ei.weights[tix] - lowparam) * scale; |
318 | idx = astc::clamp1f(idx); |
319 | |
320 | ei.weights[tix] = idx; |
321 | ei.weight_error_scale[tix] = length_squared * error_weight; |
322 | assert(!astc::isnan(ei.weight_error_scale[tix])); |
323 | } |
324 | |
325 | vfloat4 lowvalue = line.a + line.b * lowparam; |
326 | vfloat4 highvalue = line.a + line.b * highparam; |
327 | |
328 | vfloat4 ep0 = select(blk.data_min, vfloat4(lowvalue.lane<0>()), comp1_mask); |
329 | vfloat4 ep1 = select(blk.data_max, vfloat4(highvalue.lane<0>()), comp1_mask); |
330 | |
331 | ei.ep.endpt0[i] = select(ep0, vfloat4(lowvalue.lane<1>()), comp2_mask); |
332 | ei.ep.endpt1[i] = select(ep1, vfloat4(highvalue.lane<1>()), comp2_mask); |
333 | } |
334 | |
335 | // Zero initialize any SIMD over-fetch |
336 | unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count); |
337 | for (unsigned int i = texel_count; i < texel_count_simd; i++) |
338 | { |
339 | ei.weights[i] = 0.0f; |
340 | ei.weight_error_scale[i] = 0.0f; |
341 | } |
342 | |
343 | ei.is_constant_weight_error_scale = is_constant_wes; |
344 | } |
345 | |
346 | /** |
347 | * @brief Compute the ideal endpoints and weights for 3 color components. |
348 | * |
349 | * @param blk The image block color data to compress. |
350 | * @param pi The partition info for the current trial. |
351 | * @param[out] ei The computed ideal endpoints and weights. |
352 | * @param omitted_component The color component excluded from the calculation. |
353 | */ |
354 | static void compute_ideal_colors_and_weights_3_comp( |
355 | const image_block& blk, |
356 | const partition_info& pi, |
357 | endpoints_and_weights& ei, |
358 | unsigned int omitted_component |
359 | ) { |
360 | unsigned int partition_count = pi.partition_count; |
361 | ei.ep.partition_count = partition_count; |
362 | promise(partition_count > 0); |
363 | |
364 | unsigned int texel_count = blk.texel_count; |
365 | promise(texel_count > 0); |
366 | |
367 | partition_metrics pms[BLOCK_MAX_PARTITIONS]; |
368 | |
369 | float error_weight; |
370 | const float* data_vr = nullptr; |
371 | const float* data_vg = nullptr; |
372 | const float* data_vb = nullptr; |
373 | if (omitted_component == 0) |
374 | { |
375 | error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()); |
376 | data_vr = blk.data_g; |
377 | data_vg = blk.data_b; |
378 | data_vb = blk.data_a; |
379 | } |
380 | else if (omitted_component == 1) |
381 | { |
382 | error_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>()); |
383 | data_vr = blk.data_r; |
384 | data_vg = blk.data_b; |
385 | data_vb = blk.data_a; |
386 | } |
387 | else if (omitted_component == 2) |
388 | { |
389 | error_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>()); |
390 | data_vr = blk.data_r; |
391 | data_vg = blk.data_g; |
392 | data_vb = blk.data_a; |
393 | } |
394 | else |
395 | { |
396 | assert(omitted_component == 3); |
397 | |
398 | error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()); |
399 | data_vr = blk.data_r; |
400 | data_vg = blk.data_g; |
401 | data_vb = blk.data_b; |
402 | } |
403 | |
404 | error_weight = error_weight * (1.0f / 3.0f); |
405 | |
406 | if (omitted_component == 3) |
407 | { |
408 | compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); |
409 | } |
410 | else |
411 | { |
412 | compute_avgs_and_dirs_3_comp(pi, blk, omitted_component, pms); |
413 | } |
414 | |
415 | bool is_constant_wes { true }; |
416 | float partition0_len_sq { 0.0f }; |
417 | |
418 | for (unsigned int i = 0; i < partition_count; i++) |
419 | { |
420 | vfloat4 dir = pms[i].dir; |
421 | if (hadd_rgb_s(dir) < 0.0f) |
422 | { |
423 | dir = vfloat4::zero() - dir; |
424 | } |
425 | |
426 | line3 line { pms[i].avg, normalize_safe(dir, unit3()) }; |
427 | float lowparam { 1e10f }; |
428 | float highparam { -1e10f }; |
429 | |
430 | unsigned int partition_texel_count = pi.partition_texel_count[i]; |
431 | for (unsigned int j = 0; j < partition_texel_count; j++) |
432 | { |
433 | unsigned int tix = pi.texels_of_partition[i][j]; |
434 | vfloat4 point = vfloat3(data_vr[tix], data_vg[tix], data_vb[tix]); |
435 | float param = dot3_s(point - line.a, line.b); |
436 | ei.weights[tix] = param; |
437 | |
438 | lowparam = astc::min(param, lowparam); |
439 | highparam = astc::max(param, highparam); |
440 | } |
441 | |
442 | // It is possible for a uniform-color partition to produce length=0; |
443 | // this causes NaN issues so set to small value to avoid this problem |
444 | if (highparam <= lowparam) |
445 | { |
446 | lowparam = 0.0f; |
447 | highparam = 1e-7f; |
448 | } |
449 | |
450 | float length = highparam - lowparam; |
451 | float length_squared = length * length; |
452 | float scale = 1.0f / length; |
453 | |
454 | if (i == 0) |
455 | { |
456 | partition0_len_sq = length_squared; |
457 | } |
458 | else |
459 | { |
460 | is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; |
461 | } |
462 | |
463 | for (unsigned int j = 0; j < partition_texel_count; j++) |
464 | { |
465 | unsigned int tix = pi.texels_of_partition[i][j]; |
466 | float idx = (ei.weights[tix] - lowparam) * scale; |
467 | idx = astc::clamp1f(idx); |
468 | |
469 | ei.weights[tix] = idx; |
470 | ei.weight_error_scale[tix] = length_squared * error_weight; |
471 | assert(!astc::isnan(ei.weight_error_scale[tix])); |
472 | } |
473 | |
474 | vfloat4 ep0 = line.a + line.b * lowparam; |
475 | vfloat4 ep1 = line.a + line.b * highparam; |
476 | |
477 | vfloat4 bmin = blk.data_min; |
478 | vfloat4 bmax = blk.data_max; |
479 | |
480 | assert(omitted_component < BLOCK_MAX_COMPONENTS); |
481 | switch (omitted_component) |
482 | { |
483 | case 0: |
484 | ei.ep.endpt0[i] = vfloat4(bmin.lane<0>(), ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>()); |
485 | ei.ep.endpt1[i] = vfloat4(bmax.lane<0>(), ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>()); |
486 | break; |
487 | case 1: |
488 | ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), bmin.lane<1>(), ep0.lane<1>(), ep0.lane<2>()); |
489 | ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), bmax.lane<1>(), ep1.lane<1>(), ep1.lane<2>()); |
490 | break; |
491 | case 2: |
492 | ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), bmin.lane<2>(), ep0.lane<2>()); |
493 | ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), bmax.lane<2>(), ep1.lane<2>()); |
494 | break; |
495 | default: |
496 | ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), bmin.lane<3>()); |
497 | ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>(), bmax.lane<3>()); |
498 | break; |
499 | } |
500 | } |
501 | |
502 | // Zero initialize any SIMD over-fetch |
503 | unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count); |
504 | for (unsigned int i = texel_count; i < texel_count_simd; i++) |
505 | { |
506 | ei.weights[i] = 0.0f; |
507 | ei.weight_error_scale[i] = 0.0f; |
508 | } |
509 | |
510 | ei.is_constant_weight_error_scale = is_constant_wes; |
511 | } |
512 | |
513 | /** |
514 | * @brief Compute the ideal endpoints and weights for 4 color components. |
515 | * |
516 | * @param blk The image block color data to compress. |
517 | * @param pi The partition info for the current trial. |
518 | * @param[out] ei The computed ideal endpoints and weights. |
519 | */ |
520 | static void compute_ideal_colors_and_weights_4_comp( |
521 | const image_block& blk, |
522 | const partition_info& pi, |
523 | endpoints_and_weights& ei |
524 | ) { |
525 | const float error_weight = hadd_s(blk.channel_weight) / 4.0f; |
526 | |
527 | unsigned int partition_count = pi.partition_count; |
528 | |
529 | unsigned int texel_count = blk.texel_count; |
530 | promise(texel_count > 0); |
531 | promise(partition_count > 0); |
532 | |
533 | partition_metrics pms[BLOCK_MAX_PARTITIONS]; |
534 | |
535 | compute_avgs_and_dirs_4_comp(pi, blk, pms); |
536 | |
537 | bool is_constant_wes { true }; |
538 | float partition0_len_sq { 0.0f }; |
539 | |
540 | for (unsigned int i = 0; i < partition_count; i++) |
541 | { |
542 | vfloat4 dir = pms[i].dir; |
543 | if (hadd_rgb_s(dir) < 0.0f) |
544 | { |
545 | dir = vfloat4::zero() - dir; |
546 | } |
547 | |
548 | line4 line { pms[i].avg, normalize_safe(dir, unit4()) }; |
549 | float lowparam { 1e10f }; |
550 | float highparam { -1e10f }; |
551 | |
552 | unsigned int partition_texel_count = pi.partition_texel_count[i]; |
553 | for (unsigned int j = 0; j < partition_texel_count; j++) |
554 | { |
555 | unsigned int tix = pi.texels_of_partition[i][j]; |
556 | vfloat4 point = blk.texel(tix); |
557 | float param = dot_s(point - line.a, line.b); |
558 | ei.weights[tix] = param; |
559 | |
560 | lowparam = astc::min(param, lowparam); |
561 | highparam = astc::max(param, highparam); |
562 | } |
563 | |
564 | // It is possible for a uniform-color partition to produce length=0; |
565 | // this causes NaN issues so set to small value to avoid this problem |
566 | if (highparam <= lowparam) |
567 | { |
568 | lowparam = 0.0f; |
569 | highparam = 1e-7f; |
570 | } |
571 | |
572 | float length = highparam - lowparam; |
573 | float length_squared = length * length; |
574 | float scale = 1.0f / length; |
575 | |
576 | if (i == 0) |
577 | { |
578 | partition0_len_sq = length_squared; |
579 | } |
580 | else |
581 | { |
582 | is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; |
583 | } |
584 | |
585 | ei.ep.endpt0[i] = line.a + line.b * lowparam; |
586 | ei.ep.endpt1[i] = line.a + line.b * highparam; |
587 | |
588 | for (unsigned int j = 0; j < partition_texel_count; j++) |
589 | { |
590 | unsigned int tix = pi.texels_of_partition[i][j]; |
591 | float idx = (ei.weights[tix] - lowparam) * scale; |
592 | idx = astc::clamp1f(idx); |
593 | |
594 | ei.weights[tix] = idx; |
595 | ei.weight_error_scale[tix] = length_squared * error_weight; |
596 | assert(!astc::isnan(ei.weight_error_scale[tix])); |
597 | } |
598 | } |
599 | |
600 | // Zero initialize any SIMD over-fetch |
601 | unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count); |
602 | for (unsigned int i = texel_count; i < texel_count_simd; i++) |
603 | { |
604 | ei.weights[i] = 0.0f; |
605 | ei.weight_error_scale[i] = 0.0f; |
606 | } |
607 | |
608 | ei.is_constant_weight_error_scale = is_constant_wes; |
609 | } |
610 | |
611 | /* See header for documentation. */ |
612 | void compute_ideal_colors_and_weights_1plane( |
613 | const image_block& blk, |
614 | const partition_info& pi, |
615 | endpoints_and_weights& ei |
616 | ) { |
617 | bool uses_alpha = !blk.is_constant_channel(3); |
618 | |
619 | if (uses_alpha) |
620 | { |
621 | compute_ideal_colors_and_weights_4_comp(blk, pi, ei); |
622 | } |
623 | else |
624 | { |
625 | compute_ideal_colors_and_weights_3_comp(blk, pi, ei, 3); |
626 | } |
627 | } |
628 | |
629 | /* See header for documentation. */ |
630 | void compute_ideal_colors_and_weights_2planes( |
631 | const block_size_descriptor& bsd, |
632 | const image_block& blk, |
633 | unsigned int plane2_component, |
634 | endpoints_and_weights& ei1, |
635 | endpoints_and_weights& ei2 |
636 | ) { |
637 | const auto& pi = bsd.get_partition_info(1, 0); |
638 | bool uses_alpha = !blk.is_constant_channel(3); |
639 | |
640 | assert(plane2_component < BLOCK_MAX_COMPONENTS); |
641 | switch (plane2_component) |
642 | { |
643 | case 0: // Separate weights for red |
644 | if (uses_alpha) |
645 | { |
646 | compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 0); |
647 | } |
648 | else |
649 | { |
650 | compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 1, 2); |
651 | } |
652 | compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 0); |
653 | break; |
654 | |
655 | case 1: // Separate weights for green |
656 | if (uses_alpha) |
657 | { |
658 | compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 1); |
659 | } |
660 | else |
661 | { |
662 | compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 2); |
663 | } |
664 | compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 1); |
665 | break; |
666 | |
667 | case 2: // Separate weights for blue |
668 | if (uses_alpha) |
669 | { |
670 | compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 2); |
671 | } |
672 | else |
673 | { |
674 | compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 1); |
675 | } |
676 | compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 2); |
677 | break; |
678 | |
679 | default: // Separate weights for alpha |
680 | assert(uses_alpha); |
681 | compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 3); |
682 | compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 3); |
683 | break; |
684 | } |
685 | } |
686 | |
687 | /* See header for documentation. */ |
688 | float compute_error_of_weight_set_1plane( |
689 | const endpoints_and_weights& eai, |
690 | const decimation_info& di, |
691 | const float* dec_weight_quant_uvalue |
692 | ) { |
693 | vfloatacc error_summav = vfloatacc::zero(); |
694 | unsigned int texel_count = di.texel_count; |
695 | promise(texel_count > 0); |
696 | |
697 | // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized |
698 | if (di.max_texel_weight_count > 2) |
699 | { |
700 | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
701 | { |
702 | // Compute the bilinear interpolation of the decimated weight grid |
703 | vfloat current_values = bilinear_infill_vla(di, dec_weight_quant_uvalue, i); |
704 | |
705 | // Compute the error between the computed value and the ideal weight |
706 | vfloat actual_values = loada(eai.weights + i); |
707 | vfloat diff = current_values - actual_values; |
708 | vfloat significance = loada(eai.weight_error_scale + i); |
709 | vfloat error = diff * diff * significance; |
710 | |
711 | haccumulate(error_summav, error); |
712 | } |
713 | } |
714 | else if (di.max_texel_weight_count > 1) |
715 | { |
716 | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
717 | { |
718 | // Compute the bilinear interpolation of the decimated weight grid |
719 | vfloat current_values = bilinear_infill_vla_2(di, dec_weight_quant_uvalue, i); |
720 | |
721 | // Compute the error between the computed value and the ideal weight |
722 | vfloat actual_values = loada(eai.weights + i); |
723 | vfloat diff = current_values - actual_values; |
724 | vfloat significance = loada(eai.weight_error_scale + i); |
725 | vfloat error = diff * diff * significance; |
726 | |
727 | haccumulate(error_summav, error); |
728 | } |
729 | } |
730 | else |
731 | { |
732 | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
733 | { |
734 | // Load the weight set directly, without interpolation |
735 | vfloat current_values = loada(dec_weight_quant_uvalue + i); |
736 | |
737 | // Compute the error between the computed value and the ideal weight |
738 | vfloat actual_values = loada(eai.weights + i); |
739 | vfloat diff = current_values - actual_values; |
740 | vfloat significance = loada(eai.weight_error_scale + i); |
741 | vfloat error = diff * diff * significance; |
742 | |
743 | haccumulate(error_summav, error); |
744 | } |
745 | } |
746 | |
747 | // Resolve the final scalar accumulator sum |
748 | return hadd_s(error_summav); |
749 | } |
750 | |
751 | /* See header for documentation. */ |
752 | float compute_error_of_weight_set_2planes( |
753 | const endpoints_and_weights& eai1, |
754 | const endpoints_and_weights& eai2, |
755 | const decimation_info& di, |
756 | const float* dec_weight_quant_uvalue_plane1, |
757 | const float* dec_weight_quant_uvalue_plane2 |
758 | ) { |
759 | vfloatacc error_summav = vfloatacc::zero(); |
760 | unsigned int texel_count = di.texel_count; |
761 | promise(texel_count > 0); |
762 | |
763 | // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized |
764 | if (di.max_texel_weight_count > 2) |
765 | { |
766 | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
767 | { |
768 | // Plane 1 |
769 | // Compute the bilinear interpolation of the decimated weight grid |
770 | vfloat current_values1 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane1, i); |
771 | |
772 | // Compute the error between the computed value and the ideal weight |
773 | vfloat actual_values1 = loada(eai1.weights + i); |
774 | vfloat diff = current_values1 - actual_values1; |
775 | vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i); |
776 | |
777 | // Plane 2 |
778 | // Compute the bilinear interpolation of the decimated weight grid |
779 | vfloat current_values2 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane2, i); |
780 | |
781 | // Compute the error between the computed value and the ideal weight |
782 | vfloat actual_values2 = loada(eai2.weights + i); |
783 | diff = current_values2 - actual_values2; |
784 | vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i); |
785 | |
786 | haccumulate(error_summav, error1 + error2); |
787 | } |
788 | } |
789 | else if (di.max_texel_weight_count > 1) |
790 | { |
791 | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
792 | { |
793 | // Plane 1 |
794 | // Compute the bilinear interpolation of the decimated weight grid |
795 | vfloat current_values1 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane1, i); |
796 | |
797 | // Compute the error between the computed value and the ideal weight |
798 | vfloat actual_values1 = loada(eai1.weights + i); |
799 | vfloat diff = current_values1 - actual_values1; |
800 | vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i); |
801 | |
802 | // Plane 2 |
803 | // Compute the bilinear interpolation of the decimated weight grid |
804 | vfloat current_values2 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane2, i); |
805 | |
806 | // Compute the error between the computed value and the ideal weight |
807 | vfloat actual_values2 = loada(eai2.weights + i); |
808 | diff = current_values2 - actual_values2; |
809 | vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i); |
810 | |
811 | haccumulate(error_summav, error1 + error2); |
812 | } |
813 | } |
814 | else |
815 | { |
816 | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
817 | { |
818 | // Plane 1 |
819 | // Load the weight set directly, without interpolation |
820 | vfloat current_values1 = loada(dec_weight_quant_uvalue_plane1 + i); |
821 | |
822 | // Compute the error between the computed value and the ideal weight |
823 | vfloat actual_values1 = loada(eai1.weights + i); |
824 | vfloat diff = current_values1 - actual_values1; |
825 | vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i); |
826 | |
827 | // Plane 2 |
828 | // Load the weight set directly, without interpolation |
829 | vfloat current_values2 = loada(dec_weight_quant_uvalue_plane2 + i); |
830 | |
831 | // Compute the error between the computed value and the ideal weight |
832 | vfloat actual_values2 = loada(eai2.weights + i); |
833 | diff = current_values2 - actual_values2; |
834 | vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i); |
835 | |
836 | haccumulate(error_summav, error1 + error2); |
837 | } |
838 | } |
839 | |
840 | // Resolve the final scalar accumulator sum |
841 | return hadd_s(error_summav); |
842 | } |
843 | |
844 | /* See header for documentation. */ |
845 | void compute_ideal_weights_for_decimation( |
846 | const endpoints_and_weights& ei, |
847 | const decimation_info& di, |
848 | float* dec_weight_ideal_value |
849 | ) { |
850 | unsigned int texel_count = di.texel_count; |
851 | unsigned int weight_count = di.weight_count; |
852 | bool is_direct = texel_count == weight_count; |
853 | promise(texel_count > 0); |
854 | promise(weight_count > 0); |
855 | |
856 | // Ensure that the end of the output arrays that are used for SIMD paths later are filled so we |
857 | // can safely run SIMD elsewhere without a loop tail. Note that this is always safe as weight |
858 | // arrays always contain space for 64 elements |
859 | unsigned int prev_weight_count_simd = round_down_to_simd_multiple_vla(weight_count - 1); |
860 | storea(vfloat::zero(), dec_weight_ideal_value + prev_weight_count_simd); |
861 | |
862 | // If we have a 1:1 mapping just shortcut the computation. Transfer enough to also copy the |
863 | // zero-initialized SIMD over-fetch region |
864 | if (is_direct) |
865 | { |
866 | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
867 | { |
868 | vfloat weight(ei.weights + i); |
869 | storea(weight, dec_weight_ideal_value + i); |
870 | } |
871 | |
872 | return; |
873 | } |
874 | |
875 | // Otherwise compute an estimate and perform single refinement iteration |
876 | alignas(ASTCENC_VECALIGN) float infilled_weights[BLOCK_MAX_TEXELS]; |
877 | |
878 | // Compute an initial average for each decimated weight |
879 | bool constant_wes = ei.is_constant_weight_error_scale; |
880 | vfloat weight_error_scale(ei.weight_error_scale[0]); |
881 | |
882 | // This overshoots - this is OK as we initialize the array tails in the |
883 | // decimation table structures to safe values ... |
884 | for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) |
885 | { |
886 | // Start with a small value to avoid div-by-zero later |
887 | vfloat weight_weight(1e-10f); |
888 | vfloat initial_weight = vfloat::zero(); |
889 | |
890 | // Accumulate error weighting of all the texels using this weight |
891 | vint weight_texel_count(di.weight_texel_count + i); |
892 | unsigned int max_texel_count = hmax(weight_texel_count).lane<0>(); |
893 | promise(max_texel_count > 0); |
894 | |
895 | for (unsigned int j = 0; j < max_texel_count; j++) |
896 | { |
897 | vint texel(di.weight_texels_tr[j] + i); |
898 | vfloat weight = loada(di.weights_texel_contribs_tr[j] + i); |
899 | |
900 | if (!constant_wes) |
901 | { |
902 | weight_error_scale = gatherf(ei.weight_error_scale, texel); |
903 | } |
904 | |
905 | vfloat contrib_weight = weight * weight_error_scale; |
906 | |
907 | weight_weight += contrib_weight; |
908 | initial_weight += gatherf(ei.weights, texel) * contrib_weight; |
909 | } |
910 | |
911 | storea(initial_weight / weight_weight, dec_weight_ideal_value + i); |
912 | } |
913 | |
914 | // Populate the interpolated weight grid based on the initial average |
915 | // Process SIMD-width texel coordinates at at time while we can. Safe to |
916 | // over-process full SIMD vectors - the tail is zeroed. |
917 | if (di.max_texel_weight_count <= 2) |
918 | { |
919 | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
920 | { |
921 | vfloat weight = bilinear_infill_vla_2(di, dec_weight_ideal_value, i); |
922 | storea(weight, infilled_weights + i); |
923 | } |
924 | } |
925 | else |
926 | { |
927 | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
928 | { |
929 | vfloat weight = bilinear_infill_vla(di, dec_weight_ideal_value, i); |
930 | storea(weight, infilled_weights + i); |
931 | } |
932 | } |
933 | |
934 | // Perform a single iteration of refinement |
935 | // Empirically determined step size; larger values don't help but smaller drops image quality |
936 | constexpr float stepsize = 0.25f; |
937 | constexpr float chd_scale = -WEIGHTS_TEXEL_SUM; |
938 | |
939 | for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) |
940 | { |
941 | vfloat weight_val = loada(dec_weight_ideal_value + i); |
942 | |
943 | // Accumulate error weighting of all the texels using this weight |
944 | // Start with a small value to avoid div-by-zero later |
945 | vfloat error_change0(1e-10f); |
946 | vfloat error_change1(0.0f); |
947 | |
948 | // Accumulate error weighting of all the texels using this weight |
949 | vint weight_texel_count(di.weight_texel_count + i); |
950 | unsigned int max_texel_count = hmax(weight_texel_count).lane<0>(); |
951 | promise(max_texel_count > 0); |
952 | |
953 | for (unsigned int j = 0; j < max_texel_count; j++) |
954 | { |
955 | vint texel(di.weight_texels_tr[j] + i); |
956 | vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i); |
957 | |
958 | if (!constant_wes) |
959 | { |
960 | weight_error_scale = gatherf(ei.weight_error_scale, texel); |
961 | } |
962 | |
963 | vfloat scale = weight_error_scale * contrib_weight; |
964 | vfloat old_weight = gatherf(infilled_weights, texel); |
965 | vfloat ideal_weight = gatherf(ei.weights, texel); |
966 | |
967 | error_change0 += contrib_weight * scale; |
968 | error_change1 += (old_weight - ideal_weight) * scale; |
969 | } |
970 | |
971 | vfloat step = (error_change1 * chd_scale) / error_change0; |
972 | step = clamp(-stepsize, stepsize, step); |
973 | |
974 | // Update the weight; note this can store negative values |
975 | storea(weight_val + step, dec_weight_ideal_value + i); |
976 | } |
977 | } |
978 | |
979 | /* See header for documentation. */ |
980 | void compute_quantized_weights_for_decimation( |
981 | const decimation_info& di, |
982 | float low_bound, |
983 | float high_bound, |
984 | const float* dec_weight_ideal_value, |
985 | float* weight_set_out, |
986 | uint8_t* quantized_weight_set, |
987 | quant_method quant_level |
988 | ) { |
989 | int weight_count = di.weight_count; |
990 | promise(weight_count > 0); |
991 | const quant_and_transfer_table& qat = quant_and_xfer_tables[quant_level]; |
992 | |
993 | // The available quant levels, stored with a minus 1 bias |
994 | static const float quant_levels_m1[12] { |
995 | 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, 9.0f, 11.0f, 15.0f, 19.0f, 23.0f, 31.0f |
996 | }; |
997 | |
998 | vint steps_m1(get_quant_level(quant_level) - 1); |
999 | float quant_level_m1 = quant_levels_m1[quant_level]; |
1000 | |
1001 | // Quantize the weight set using both the specified low/high bounds and standard 0..1 bounds |
1002 | |
1003 | // TODO: Oddity to investigate; triggered by test in issue #265. |
1004 | if (high_bound <= low_bound) |
1005 | { |
1006 | low_bound = 0.0f; |
1007 | high_bound = 1.0f; |
1008 | } |
1009 | |
1010 | float rscale = high_bound - low_bound; |
1011 | float scale = 1.0f / rscale; |
1012 | |
1013 | float scaled_low_bound = low_bound * scale; |
1014 | rscale *= 1.0f / 64.0f; |
1015 | |
1016 | vfloat scalev(scale); |
1017 | vfloat scaled_low_boundv(scaled_low_bound); |
1018 | vfloat quant_level_m1v(quant_level_m1); |
1019 | vfloat rscalev(rscale); |
1020 | vfloat low_boundv(low_bound); |
1021 | |
1022 | // This runs to the rounded-up SIMD size, which is safe as the loop tail is filled with known |
1023 | // safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements |
1024 | if (get_quant_level(quant_level) <= 16) |
1025 | { |
1026 | vint4 tab0(reinterpret_cast<const int*>(qat.quant_to_unquant)); |
1027 | vint tab0p; |
1028 | vtable_prepare(tab0, tab0p); |
1029 | |
1030 | for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) |
1031 | { |
1032 | vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv; |
1033 | ix = clampzo(ix); |
1034 | |
1035 | // Look up the two closest indexes and return the one that was closest |
1036 | vfloat ix1 = ix * quant_level_m1v; |
1037 | |
1038 | vint weightl = float_to_int(ix1); |
1039 | vint weighth = min(weightl + vint(1), steps_m1); |
1040 | |
1041 | vint ixli = vtable_8bt_32bi(tab0p, weightl); |
1042 | vint ixhi = vtable_8bt_32bi(tab0p, weighth); |
1043 | |
1044 | vfloat ixl = int_to_float(ixli); |
1045 | vfloat ixh = int_to_float(ixhi); |
1046 | |
1047 | vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix); |
1048 | vint weight = select(ixli, ixhi, mask); |
1049 | ixl = select(ixl, ixh, mask); |
1050 | |
1051 | // Invert the weight-scaling that was done initially |
1052 | storea(ixl * rscalev + low_boundv, weight_set_out + i); |
1053 | vint scn = pack_low_bytes(weight); |
1054 | store_nbytes(scn, quantized_weight_set + i); |
1055 | } |
1056 | } |
1057 | else |
1058 | { |
1059 | vint4 tab0(reinterpret_cast<const int*>(qat.quant_to_unquant)); |
1060 | vint4 tab1(reinterpret_cast<const int*>(qat.quant_to_unquant + 16)); |
1061 | vint tab0p, tab1p; |
1062 | vtable_prepare(tab0, tab1, tab0p, tab1p); |
1063 | |
1064 | for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) |
1065 | { |
1066 | vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv; |
1067 | ix = clampzo(ix); |
1068 | |
1069 | // Look up the two closest indexes and return the one that was closest |
1070 | vfloat ix1 = ix * quant_level_m1v; |
1071 | |
1072 | vint weightl = float_to_int(ix1); |
1073 | vint weighth = min(weightl + vint(1), steps_m1); |
1074 | |
1075 | vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl); |
1076 | vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth); |
1077 | |
1078 | vfloat ixl = int_to_float(ixli); |
1079 | vfloat ixh = int_to_float(ixhi); |
1080 | |
1081 | vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix); |
1082 | vint weight = select(ixli, ixhi, mask); |
1083 | ixl = select(ixl, ixh, mask); |
1084 | |
1085 | // Invert the weight-scaling that was done initially |
1086 | storea(ixl * rscalev + low_boundv, weight_set_out + i); |
1087 | vint scn = pack_low_bytes(weight); |
1088 | store_nbytes(scn, quantized_weight_set + i); |
1089 | } |
1090 | } |
1091 | } |
1092 | |
1093 | /** |
1094 | * @brief Compute the RGB + offset for a HDR endpoint mode #7. |
1095 | * |
1096 | * Since the matrix needed has a regular structure we can simplify the inverse calculation. This |
1097 | * gives us ~24 multiplications vs. 96 for a generic inverse. |
1098 | * |
1099 | * mat[0] = vfloat4(rgba_ws.x, 0.0f, 0.0f, wght_ws.x); |
1100 | * mat[1] = vfloat4( 0.0f, rgba_ws.y, 0.0f, wght_ws.y); |
1101 | * mat[2] = vfloat4( 0.0f, 0.0f, rgba_ws.z, wght_ws.z); |
1102 | * mat[3] = vfloat4(wght_ws.x, wght_ws.y, wght_ws.z, psum); |
1103 | * mat = invert(mat); |
1104 | * |
1105 | * @param rgba_weight_sum Sum of partition component error weights. |
1106 | * @param weight_weight_sum Sum of partition component error weights * texel weight. |
1107 | * @param rgbq_sum Sum of partition component error weights * texel weight * color data. |
1108 | * @param psum Sum of RGB color weights * texel weight^2. |
1109 | */ |
1110 | static inline vfloat4 compute_rgbo_vector( |
1111 | vfloat4 rgba_weight_sum, |
1112 | vfloat4 weight_weight_sum, |
1113 | vfloat4 rgbq_sum, |
1114 | float psum |
1115 | ) { |
1116 | float X = rgba_weight_sum.lane<0>(); |
1117 | float Y = rgba_weight_sum.lane<1>(); |
1118 | float Z = rgba_weight_sum.lane<2>(); |
1119 | float P = weight_weight_sum.lane<0>(); |
1120 | float Q = weight_weight_sum.lane<1>(); |
1121 | float R = weight_weight_sum.lane<2>(); |
1122 | float S = psum; |
1123 | |
1124 | float PP = P * P; |
1125 | float QQ = Q * Q; |
1126 | float RR = R * R; |
1127 | |
1128 | float SZmRR = S * Z - RR; |
1129 | float DT = SZmRR * Y - Z * QQ; |
1130 | float YP = Y * P; |
1131 | float QX = Q * X; |
1132 | float YX = Y * X; |
1133 | float mZYP = -Z * YP; |
1134 | float mZQX = -Z * QX; |
1135 | float mRYX = -R * YX; |
1136 | float ZQP = Z * Q * P; |
1137 | float RYP = R * YP; |
1138 | float RQX = R * QX; |
1139 | |
1140 | // Compute the reciprocal of matrix determinant |
1141 | float rdet = 1.0f / (DT * X + mZYP * P); |
1142 | |
1143 | // Actually compute the adjugate, and then apply 1/det separately |
1144 | vfloat4 mat0(DT, ZQP, RYP, mZYP); |
1145 | vfloat4 mat1(ZQP, SZmRR * X - Z * PP, RQX, mZQX); |
1146 | vfloat4 mat2(RYP, RQX, (S * Y - QQ) * X - Y * PP, mRYX); |
1147 | vfloat4 mat3(mZYP, mZQX, mRYX, Z * YX); |
1148 | vfloat4 vect = rgbq_sum * rdet; |
1149 | |
1150 | return vfloat4(dot_s(mat0, vect), |
1151 | dot_s(mat1, vect), |
1152 | dot_s(mat2, vect), |
1153 | dot_s(mat3, vect)); |
1154 | } |
1155 | |
/* See header for documentation. */
void recompute_ideal_colors_1plane(
	const image_block& blk,
	const partition_info& pi,
	const decimation_info& di,
	const uint8_t* dec_weights_uquant,
	endpoints& ep,
	vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS],
	vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS]
) {
	// Overview: for each partition, given the supplied weights, this updates ep.endpt0/endpt1
	// (where the solve is usable) and writes rgbs_vectors[i]. rgbo_vectors[i] is only written
	// when the block's first rgb_lns / alpha_lns flag is set.
	unsigned int weight_count = di.weight_count;
	unsigned int total_texel_count = blk.texel_count;
	unsigned int partition_count = pi.partition_count;

	promise(weight_count > 0);
	promise(total_texel_count > 0);
	promise(partition_count > 0);

	// Convert the integer weights to float, scaling by 1/64. The loop advances by a whole SIMD
	// vector each iteration, so it reads and writes up to the next SIMD multiple of weight_count.
	alignas(ASTCENC_VECALIGN) float dec_weight[BLOCK_MAX_WEIGHTS];
	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
	{
		vint unquant_value(dec_weights_uquant + i);
		vfloat unquant_valuef = int_to_float(unquant_value) * vfloat(1.0f / 64.0f);
		storea(unquant_valuef, dec_weight + i);
	}

	// Select the per-texel weight array. If no texel uses more than one weight the decimated
	// array is indexed directly by texel; otherwise the weights are infilled per texel using the
	// 2-weight filter or the general bilinear filter.
	alignas(ASTCENC_VECALIGN) float undec_weight[BLOCK_MAX_TEXELS];
	float* undec_weight_ref;
	if (di.max_texel_weight_count == 1)
	{
		undec_weight_ref = dec_weight;
	}
	else if (di.max_texel_weight_count <= 2)
	{
		for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vfloat weight = bilinear_infill_vla_2(di, dec_weight, i);
			storea(weight, undec_weight + i);
		}

		undec_weight_ref = undec_weight;
	}
	else
	{
		for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vfloat weight = bilinear_infill_vla(di, dec_weight, i);
			storea(weight, undec_weight + i);
		}

		undec_weight_ref = undec_weight;
	}

	// Whole-block color sum reconstructed from the block mean; this value is only used as-is
	// when there is a single partition, otherwise it is recomputed per partition below
	vfloat4 rgba_sum(blk.data_mean * static_cast<float>(blk.texel_count));

	for (unsigned int i = 0; i < partition_count; i++)
	{
		unsigned int texel_count = pi.partition_texel_count[i];
		const uint8_t *texel_indexes = pi.texels_of_partition[i];

		// Only compute a partition mean if more than one partition
		if (partition_count > 1)
		{
			rgba_sum = vfloat4::zero();
			promise(texel_count > 0);
			for (unsigned int j = 0; j < texel_count; j++)
			{
				unsigned int tix = texel_indexes[j];
				rgba_sum += blk.texel(tix);
			}
		}

		// Apply the channel weights to the color sum, then take the normalized RGB direction of
		// the weighted mean. The weight sum is floored at 1e-17 to avoid dividing by zero.
		rgba_sum = rgba_sum * blk.channel_weight;
		vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
		vfloat4 scale_dir = normalize((rgba_sum / rgba_weight_sum).swz<0, 1, 2>());

		// Range of the texel colors projected onto scale_dir
		float scale_max = 0.0f;
		float scale_min = 1e10f;

		// Range of the texel weights seen in this partition
		float wmin1 = 1.0f;
		float wmax1 = 0.0f;

		// Sums over texels of (1-w)^2, (1-w)*w, and w^2 respectively
		float left_sum_s = 0.0f;
		float middle_sum_s = 0.0f;
		float right_sum_s = 0.0f;

		// Sums over texels of color * (1-w) and color * w respectively
		vfloat4 color_vec_x = vfloat4::zero();
		vfloat4 color_vec_y = vfloat4::zero();

		// Sum over texels of ((1-w), w) * projected scale * ls_weight
		vfloat4 scale_vec = vfloat4::zero();

		// Sum of texel weights; seeded with a tiny value rather than zero
		float weight_weight_sum_s = 1e-17f;

		vfloat4 color_weight = blk.channel_weight;
		float ls_weight = hadd_rgb_s(color_weight);

		// Accumulate the statistics above across every texel in the partition
		for (unsigned int j = 0; j < texel_count; j++)
		{
			unsigned int tix = texel_indexes[j];
			vfloat4 rgba = blk.texel(tix);

			float idx0 = undec_weight_ref[tix];

			float om_idx0 = 1.0f - idx0;
			wmin1 = astc::min(idx0, wmin1);
			wmax1 = astc::max(idx0, wmax1);

			float scale = dot3_s(scale_dir, rgba);
			scale_min = astc::min(scale, scale_min);
			scale_max = astc::max(scale, scale_max);

			left_sum_s += om_idx0 * om_idx0;
			middle_sum_s += om_idx0 * idx0;
			right_sum_s += idx0 * idx0;
			weight_weight_sum_s += idx0;

			vfloat4 color_idx(idx0);
			vfloat4 cwprod = rgba;
			vfloat4 cwiprod = cwprod * color_idx;

			color_vec_y += cwiprod;
			color_vec_x += cwprod - cwiprod;

			scale_vec += vfloat2(om_idx0, idx0) * (scale * ls_weight);
		}

		// The channel weights are constant across texels, so they are applied once to the
		// accumulated scalar sums rather than per texel inside the loop
		vfloat4 left_sum = vfloat4(left_sum_s) * color_weight;
		vfloat4 middle_sum = vfloat4(middle_sum_s) * color_weight;
		vfloat4 right_sum = vfloat4(right_sum_s) * color_weight;
		vfloat4 lmrs_sum = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight;

		color_vec_x = color_vec_x * color_weight;
		color_vec_y = color_vec_y * color_weight;

		// Initialize the luminance and scale vectors with a reasonable default
		float scalediv = scale_min / astc::max(scale_max, 1e-10f);
		scalediv = astc::clamp1f(scalediv);

		vfloat4 sds = scale_dir * scale_max;

		rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);

		if (wmin1 >= wmax1 * 0.999f)
		{
			// If all weights in the partition were equal, then just take average of all colors in
			// the partition and use that as both endpoint colors
			vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;

			// A lane compares equal to itself only if it is not NaN; NaN lanes keep the
			// endpoint value that was already present
			vmask4 notnan_mask = avg == avg;
			ep.endpt0[i] = select(ep.endpt0[i], avg, notnan_mask);
			ep.endpt1[i] = select(ep.endpt1[i], avg, notnan_mask);

			rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f);
		}
		else
		{
			// Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
			// set of texel weights and pixel colors
			vfloat4 color_det1 = (left_sum * right_sum) - (middle_sum * middle_sum);
			vfloat4 color_rdet1 = 1.0f / color_det1;

			float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
			float ls_rdet1 = 1.0f / ls_det1;

			// Sum of squares of the 2x2 system's entries, used below as the magnitude that the
			// determinant is compared against
			vfloat4 color_mss1 = (left_sum * left_sum)
			                   + (2.0f * middle_sum * middle_sum)
			                   + (right_sum * right_sum);

			float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
			              + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
			              + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());

			vfloat4 ep0 = (right_sum * color_vec_x - middle_sum * color_vec_y) * color_rdet1;
			vfloat4 ep1 = (left_sum * color_vec_y - middle_sum * color_vec_x) * color_rdet1;

			// Per lane, only accept the solved endpoints if the determinant is not negligible
			// relative to the system magnitude and neither result is NaN
			vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f);
			vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
			vmask4 full_mask = det_mask & notnan_mask;

			ep.endpt0[i] = select(ep.endpt0[i], ep0, full_mask);
			ep.endpt1[i] = select(ep.endpt1[i], ep1, full_mask);

			float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
			float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;

			// Replace the default RGBS vector only if the scale solve has a usable determinant,
			// is NaN-free, and produced strictly ordered values
			if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
			{
				float scalediv2 = scale_ep0 / scale_ep1;
				vfloat4 sdsm = scale_dir * scale_ep1;
				rgbs_vectors[i] = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
			}
		}

		// Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
		if (blk.rgb_lns[0] || blk.alpha_lns[0])
		{
			vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight;
			float psum = right_sum_s * hadd_rgb_s(color_weight);

			// Lanes 0-2 hold the weighted color sums; lane 3 is replaced with the RGB sum of the
			// weight-scaled color sums
			vfloat4 rgbq_sum = color_vec_x + color_vec_y;
			rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));

			vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
			rgbo_vectors[i] = rgbovec;

			// We can get a failure due to the use of a singular (non-invertible) matrix
			// If it failed, compute rgbo_vectors[] with a different method ...
			if (astc::isnan(dot_s(rgbovec, rgbovec)))
			{
				vfloat4 v0 = ep.endpt0[i];
				vfloat4 v1 = ep.endpt1[i];

				// Offset is the mean RGB difference between the endpoints, clamped to be
				// non-negative
				float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
				avgdif = astc::max(avgdif, 0.0f);

				// Base color is the endpoint average minus half of the offset
				vfloat4 avg = (v0 + v1) * 0.5f;
				vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
				rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
			}
		}
	}
}
1378 | |
/* See header for documentation. */
void recompute_ideal_colors_2planes(
	const image_block& blk,
	const block_size_descriptor& bsd,
	const decimation_info& di,
	const uint8_t* dec_weights_uquant_plane1,
	const uint8_t* dec_weights_uquant_plane2,
	endpoints& ep,
	vfloat4& rgbs_vector,
	vfloat4& rgbo_vector,
	int plane2_component
) {
	// Overview: given the supplied weights for both planes, this updates ep.endpt0[0] and
	// ep.endpt1[0] (where the solve is usable) and writes rgbs_vector. The plane2_component lane
	// is solved with the plane 2 weights, all other lanes with the plane 1 weights. rgbo_vector
	// is only written when the block's first rgb_lns / alpha_lns flag is set.
	unsigned int weight_count = di.weight_count;
	unsigned int total_texel_count = blk.texel_count;

	promise(total_texel_count > 0);
	promise(weight_count > 0);

	alignas(ASTCENC_VECALIGN) float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
	alignas(ASTCENC_VECALIGN) float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];

	assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE);

	// Convert both planes' integer weights to float, scaling by 1/64. The loop advances by a
	// whole SIMD vector each iteration, so it processes up to the next SIMD multiple of
	// weight_count.
	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
	{
		vint unquant_value1(dec_weights_uquant_plane1 + i);
		vfloat unquant_value1f = int_to_float(unquant_value1) * vfloat(1.0f / 64.0f);
		storea(unquant_value1f, dec_weight_plane1 + i);

		vint unquant_value2(dec_weights_uquant_plane2 + i);
		vfloat unquant_value2f = int_to_float(unquant_value2) * vfloat(1.0f / 64.0f);
		storea(unquant_value2f, dec_weight_plane2 + i);
	}

	alignas(ASTCENC_VECALIGN) float undec_weight_plane1[BLOCK_MAX_TEXELS];
	alignas(ASTCENC_VECALIGN) float undec_weight_plane2[BLOCK_MAX_TEXELS];

	float* undec_weight_plane1_ref;
	float* undec_weight_plane2_ref;

	// Select the per-texel weight arrays. If no texel uses more than one weight the decimated
	// arrays are indexed directly by texel; otherwise the weights are infilled per texel using
	// the 2-weight filter or the general bilinear filter.
	if (di.max_texel_weight_count == 1)
	{
		undec_weight_plane1_ref = dec_weight_plane1;
		undec_weight_plane2_ref = dec_weight_plane2;
	}
	else if (di.max_texel_weight_count <= 2)
	{
		for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vfloat weight = bilinear_infill_vla_2(di, dec_weight_plane1, i);
			storea(weight, undec_weight_plane1 + i);

			weight = bilinear_infill_vla_2(di, dec_weight_plane2, i);
			storea(weight, undec_weight_plane2 + i);
		}

		undec_weight_plane1_ref = undec_weight_plane1;
		undec_weight_plane2_ref = undec_weight_plane2;
	}
	else
	{
		for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vfloat weight = bilinear_infill_vla(di, dec_weight_plane1, i);
			storea(weight, undec_weight_plane1 + i);

			weight = bilinear_infill_vla(di, dec_weight_plane2, i);
			storea(weight, undec_weight_plane2 + i);
		}

		undec_weight_plane1_ref = undec_weight_plane1;
		undec_weight_plane2_ref = undec_weight_plane2;
	}

	// NOTE(review): the texel count used from here on is read from bsd, whereas the infill above
	// used blk.texel_count; presumably these always agree - confirm against callers.
	unsigned int texel_count = bsd.texel_count;
	// The weight sum is floored at 1e-17 to avoid dividing by zero
	vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
	// Normalized RGB direction of the block mean color
	vfloat4 scale_dir = normalize(blk.data_mean.swz<0, 1, 2>());

	// Range of the texel colors projected onto scale_dir
	float scale_max = 0.0f;
	float scale_min = 1e10f;

	// Range of the plane 1 texel weights
	float wmin1 = 1.0f;
	float wmax1 = 0.0f;

	// Range of the plane 2 texel weights
	float wmin2 = 1.0f;
	float wmax2 = 0.0f;

	// Plane 1 sums over texels of (1-w)^2, (1-w)*w, and w^2 respectively
	float left1_sum_s = 0.0f;
	float middle1_sum_s = 0.0f;
	float right1_sum_s = 0.0f;

	// Plane 2 sums over texels of (1-w)^2, (1-w)*w, and w^2 respectively
	float left2_sum_s = 0.0f;
	float middle2_sum_s = 0.0f;
	float right2_sum_s = 0.0f;

	// Sums over texels of color * (1-w) and color * w, using each lane's own plane weight
	vfloat4 color_vec_x = vfloat4::zero();
	vfloat4 color_vec_y = vfloat4::zero();

	// Sum over texels of ((1-w), w) * projected scale * ls_weight, using plane 1 weights
	vfloat4 scale_vec = vfloat4::zero();

	// Per-lane sum of texel weights; seeded with a tiny value rather than zero
	vfloat4 weight_weight_sum = vfloat4(1e-17f);

	// Mask that is set only for the lane holding the plane 2 component
	vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component);
	vfloat4 color_weight = blk.channel_weight;
	float ls_weight = hadd_rgb_s(color_weight);

	// Accumulate the statistics above across every texel in the block
	for (unsigned int j = 0; j < texel_count; j++)
	{
		vfloat4 rgba = blk.texel(j);

		float idx0 = undec_weight_plane1_ref[j];

		float om_idx0 = 1.0f - idx0;
		wmin1 = astc::min(idx0, wmin1);
		wmax1 = astc::max(idx0, wmax1);

		float scale = dot3_s(scale_dir, rgba);
		scale_min = astc::min(scale, scale_min);
		scale_max = astc::max(scale, scale_max);

		left1_sum_s += om_idx0 * om_idx0;
		middle1_sum_s += om_idx0 * idx0;
		right1_sum_s += idx0 * idx0;

		float idx1 = undec_weight_plane2_ref[j];

		float om_idx1 = 1.0f - idx1;
		wmin2 = astc::min(idx1, wmin2);
		wmax2 = astc::max(idx1, wmax2);

		left2_sum_s += om_idx1 * om_idx1;
		middle2_sum_s += om_idx1 * idx1;
		right2_sum_s += idx1 * idx1;

		// Per-lane weight: the plane 2 weight in the plane 2 component lane, and the plane 1
		// weight in every other lane
		vfloat4 color_idx = select(vfloat4(idx0), vfloat4(idx1), p2_mask);

		vfloat4 cwprod = rgba;
		vfloat4 cwiprod = cwprod * color_idx;

		color_vec_y += cwiprod;
		color_vec_x += cwprod - cwiprod;

		scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale);
		weight_weight_sum += color_idx;
	}

	// The channel weights are constant across texels, so they are applied once to the
	// accumulated scalar sums rather than per texel inside the loop
	vfloat4 left1_sum = vfloat4(left1_sum_s) * color_weight;
	vfloat4 middle1_sum = vfloat4(middle1_sum_s) * color_weight;
	vfloat4 right1_sum = vfloat4(right1_sum_s) * color_weight;
	vfloat4 lmrs_sum = vfloat3(left1_sum_s, middle1_sum_s, right1_sum_s) * ls_weight;

	vfloat4 left2_sum = vfloat4(left2_sum_s) * color_weight;
	vfloat4 middle2_sum = vfloat4(middle2_sum_s) * color_weight;
	vfloat4 right2_sum = vfloat4(right2_sum_s) * color_weight;

	color_vec_x = color_vec_x * color_weight;
	color_vec_y = color_vec_y * color_weight;

	// Initialize the luminance and scale vectors with a reasonable default
	float scalediv = scale_min / astc::max(scale_max, 1e-10f);
	scalediv = astc::clamp1f(scalediv);

	vfloat4 sds = scale_dir * scale_max;

	rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);

	// Plane 1: update only the lanes that are not the plane 2 component
	if (wmin1 >= wmax1 * 0.999f)
	{
		// If all weights in the partition were equal, then just take average of all colors in
		// the partition and use that as both endpoint colors
		vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;

		// A lane compares equal to itself only if it is not NaN; NaN lanes and the plane 2 lane
		// keep the endpoint value that was already present
		vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
		vmask4 notnan_mask = avg == avg;
		vmask4 full_mask = p1_mask & notnan_mask;

		ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask);
		ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask);

		rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f);
	}
	else
	{
		// Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
		// set of texel weights and pixel colors
		vfloat4 color_det1 = (left1_sum * right1_sum) - (middle1_sum * middle1_sum);
		vfloat4 color_rdet1 = 1.0f / color_det1;

		float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
		float ls_rdet1 = 1.0f / ls_det1;

		// Sum of squares of the 2x2 system's entries, used below as the magnitude that the
		// determinant is compared against
		vfloat4 color_mss1 = (left1_sum * left1_sum)
		                   + (2.0f * middle1_sum * middle1_sum)
		                   + (right1_sum * right1_sum);

		float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
		              + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
		              + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());

		vfloat4 ep0 = (right1_sum * color_vec_x - middle1_sum * color_vec_y) * color_rdet1;
		vfloat4 ep1 = (left1_sum * color_vec_y - middle1_sum * color_vec_x) * color_rdet1;

		float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
		float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;

		// Per lane, only accept the solved endpoints for plane 1 lanes where the determinant is
		// not negligible relative to the system magnitude and neither result is NaN
		vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
		vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f);
		vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
		vmask4 full_mask = p1_mask & det_mask & notnan_mask;

		ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask);
		ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);

		// Replace the default RGBS vector only if the scale solve has a usable determinant, is
		// NaN-free, and produced strictly ordered values
		if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
		{
			float scalediv2 = scale_ep0 / scale_ep1;
			vfloat4 sdsm = scale_dir * scale_ep1;
			rgbs_vector = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
		}
	}

	// Plane 2: update only the plane 2 component lane
	if (wmin2 >= wmax2 * 0.999f)
	{
		// If all weights in the partition were equal, then just take average of all colors in
		// the partition and use that as both endpoint colors
		vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;

		vmask4 notnan_mask = avg == avg;
		vmask4 full_mask = p2_mask & notnan_mask;

		ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask);
		ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask);
	}
	else
	{
		// Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
		// set of texel weights and pixel colors
		vfloat4 color_det2 = (left2_sum * right2_sum) - (middle2_sum * middle2_sum);
		vfloat4 color_rdet2 = 1.0f / color_det2;

		vfloat4 color_mss2 = (left2_sum * left2_sum)
		                   + (2.0f * middle2_sum * middle2_sum)
		                   + (right2_sum * right2_sum);

		vfloat4 ep0 = (right2_sum * color_vec_x - middle2_sum * color_vec_y) * color_rdet2;
		vfloat4 ep1 = (left2_sum * color_vec_y - middle2_sum * color_vec_x) * color_rdet2;

		// Same acceptance test as plane 1, restricted to the plane 2 component lane
		vmask4 det_mask = abs(color_det2) > (color_mss2 * 1e-4f);
		vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
		vmask4 full_mask = p2_mask & det_mask & notnan_mask;

		ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask);
		ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
	}

	// Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
	if (blk.rgb_lns[0] || blk.alpha_lns[0])
	{
		weight_weight_sum = weight_weight_sum * color_weight;
		// Use each lane's own plane for the squared-weight sum before reducing across RGB
		float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight);

		// Lanes 0-2 hold the weighted color sums; lane 3 is replaced with the RGB sum of the
		// weight-scaled color sums
		vfloat4 rgbq_sum = color_vec_x + color_vec_y;
		rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));

		rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);

		// We can get a failure due to the use of a singular (non-invertible) matrix
		// If it failed, compute rgbo_vectors[] with a different method ...
		if (astc::isnan(dot_s(rgbo_vector, rgbo_vector)))
		{
			vfloat4 v0 = ep.endpt0[0];
			vfloat4 v1 = ep.endpt1[0];

			// Offset is the mean RGB difference between the endpoints, clamped to be
			// non-negative
			float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
			avgdif = astc::max(avgdif, 0.0f);

			// Base color is the endpoint average minus half of the offset
			vfloat4 avg = (v0 + v1) * 0.5f;
			vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;

			rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
		}
	}
}
1662 | |
1663 | #endif |
1664 | |