1/*
2Convection Texture Tools
3Copyright (c) 2018-2019 Eric Lasota
4
5Permission is hereby granted, free of charge, to any person obtaining
6a copy of this software and associated documentation files (the
7"Software"), to deal in the Software without restriction, including
8without limitation the rights to use, copy, modify, merge, publish,
9distribute, sublicense, and/or sell copies of the Software, and to
10permit persons to whom the Software is furnished to do so, subject
11to the following conditions:
12
13The above copyright notice and this permission notice shall be included
14in all copies or substantial portions of the Software.
15
16THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24-------------------------------------------------------------------------------------
25
26Portions based on DirectX Texture Library (DirectXTex)
27
28Copyright (c) Microsoft Corporation. All rights reserved.
29Licensed under the MIT License.
30
31http://go.microsoft.com/fwlink/?LinkId=248926
32*/
33#include "ConvectionKernels_Config.h"
34
35#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
36
37#include "ConvectionKernels_S3TC.h"
38
39#include "ConvectionKernels_AggregatedError.h"
40#include "ConvectionKernels_BCCommon.h"
41#include "ConvectionKernels_EndpointRefiner.h"
42#include "ConvectionKernels_EndpointSelector.h"
43#include "ConvectionKernels_IndexSelector.h"
44#include "ConvectionKernels_UnfinishedEndpoints.h"
45#include "ConvectionKernels_S3TC_SingleColor.h"
46
47void cvtt::Internal::S3TCComputer::Init(MFloat& error)
48{
49 error = ParallelMath::MakeFloat(FLT_MAX);
50}
51
52void cvtt::Internal::S3TCComputer::QuantizeTo6Bits(MUInt15& v)
53{
54 MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(253)) + ParallelMath::MakeUInt16(512), 10));
55 v = (reduced << 2) | ParallelMath::RightShift(reduced, 4);
56}
57
58void cvtt::Internal::S3TCComputer::QuantizeTo5Bits(MUInt15& v)
59{
60 MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(249)) + ParallelMath::MakeUInt16(1024), 11));
61 v = (reduced << 3) | ParallelMath::RightShift(reduced, 2);
62}
63
64void cvtt::Internal::S3TCComputer::QuantizeTo565(MUInt15 endPoint[3])
65{
66 QuantizeTo5Bits(endPoint[0]);
67 QuantizeTo6Bits(endPoint[1]);
68 QuantizeTo5Bits(endPoint[2]);
69}
70
71cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidFactorForSpan(const MSInt16& span)
72{
73 return ParallelMath::Abs(ParallelMath::ToFloat(span)) * 0.03f;
74}
75
76cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d)
77{
78 MFloat absDiff = ParallelMath::Abs(ParallelMath::ToFloat(ParallelMath::LosslessCast<MSInt16>::Cast(a) - ParallelMath::LosslessCast<MSInt16>::Cast(b)));
79 absDiff = absDiff + d;
80 return absDiff * absDiff;
81}
82
83void cvtt::Internal::S3TCComputer::TestSingleColor(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], int range, const float* channelWeights,
84 MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn)
85{
86 float channelWeightsSq[3];
87
88 for (int ch = 0; ch < 3; ch++)
89 channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
90
91 MUInt15 totals[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
92
93 for (int px = 0; px < 16; px++)
94 {
95 for (int ch = 0; ch < 3; ch++)
96 totals[ch] = totals[ch] + pixels[px][ch];
97 }
98
99 MUInt15 average[3];
100 for (int ch = 0; ch < 3; ch++)
101 average[ch] = ParallelMath::RightShift(totals[ch] + ParallelMath::MakeUInt15(8), 4);
102
103 const Tables::S3TCSC::TableEntry* rbTable = NULL;
104 const Tables::S3TCSC::TableEntry* gTable = NULL;
105 if (flags & cvtt::Flags::S3TC_Paranoid)
106 {
107 if (range == 4)
108 {
109 rbTable = Tables::S3TCSC::g_singleColor5_3_p;
110 gTable = Tables::S3TCSC::g_singleColor6_3_p;
111 }
112 else
113 {
114 assert(range == 3);
115 rbTable = Tables::S3TCSC::g_singleColor5_2_p;
116 gTable = Tables::S3TCSC::g_singleColor6_2_p;
117 }
118 }
119 else
120 {
121 if (range == 4)
122 {
123 rbTable = Tables::S3TCSC::g_singleColor5_3;
124 gTable = Tables::S3TCSC::g_singleColor6_3;
125 }
126 else
127 {
128 assert(range == 3);
129 rbTable = Tables::S3TCSC::g_singleColor5_2;
130 gTable = Tables::S3TCSC::g_singleColor6_2;
131 }
132 }
133
134 MUInt15 interpolated[3];
135 MUInt15 eps[2][3];
136 MSInt16 spans[3];
137 for (int i = 0; i < ParallelMath::ParallelSize; i++)
138 {
139 for (int ch = 0; ch < 3; ch++)
140 {
141 uint16_t avg = ParallelMath::Extract(average[ch], i);
142 const Tables::S3TCSC::TableEntry& tableEntry = ((ch == 1) ? gTable[avg] : rbTable[avg]);
143 ParallelMath::PutUInt15(eps[0][ch], i, tableEntry.m_min);
144 ParallelMath::PutUInt15(eps[1][ch], i, tableEntry.m_max);
145 ParallelMath::PutUInt15(interpolated[ch], i, tableEntry.m_actualColor);
146 ParallelMath::PutSInt16(spans[ch], i, tableEntry.m_span);
147 }
148 }
149
150 MFloat error = ParallelMath::MakeFloatZero();
151 if (flags & cvtt::Flags::S3TC_Paranoid)
152 {
153 MFloat spanParanoidFactors[3];
154 for (int ch = 0; ch < 3; ch++)
155 spanParanoidFactors[ch] = ParanoidFactorForSpan(spans[ch]);
156
157 for (int px = 0; px < 16; px++)
158 {
159 for (int ch = 0; ch < 3; ch++)
160 error = error + ParanoidDiff(interpolated[ch], pixels[px][ch], spanParanoidFactors[ch]) * channelWeightsSq[ch];
161 }
162 }
163 else
164 {
165 for (int px = 0; px < 16; px++)
166 {
167 for (int ch = 0; ch < 3; ch++)
168 error = error + ParallelMath::ToFloat(ParallelMath::SqDiffUInt8(interpolated[ch], pixels[px][ch])) * channelWeightsSq[ch];
169 }
170 }
171
172 ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
173 ParallelMath::Int16CompFlag better16 = ParallelMath::FloatFlagToInt16(better);
174
175 if (ParallelMath::AnySet(better16))
176 {
177 bestError = ParallelMath::Min(bestError, error);
178 for (int epi = 0; epi < 2; epi++)
179 for (int ch = 0; ch < 3; ch++)
180 ParallelMath::ConditionalSet(bestEndpoints[epi][ch], better16, eps[epi][ch]);
181
182 MUInt15 vindexes = ParallelMath::MakeUInt15(1);
183 for (int px = 0; px < 16; px++)
184 ParallelMath::ConditionalSet(bestIndexes[px], better16, vindexes);
185
186 ParallelMath::ConditionalSet(bestRange, better16, ParallelMath::MakeUInt15(range));
187 }
188}
189
190void cvtt::Internal::S3TCComputer::TestEndpoints(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], const MUInt15 unquantizedEndPoints[2][3], int range, const float* channelWeights,
191 MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, EndpointRefiner<3> *refiner, const ParallelMath::RoundTowardNearestForScope *rtn)
192{
193 float channelWeightsSq[3];
194
195 for (int ch = 0; ch < 3; ch++)
196 channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
197
198 MUInt15 endPoints[2][3];
199
200 for (int ep = 0; ep < 2; ep++)
201 for (int ch = 0; ch < 3; ch++)
202 endPoints[ep][ch] = unquantizedEndPoints[ep][ch];
203
204 QuantizeTo565(endPoints[0]);
205 QuantizeTo565(endPoints[1]);
206
207 IndexSelector<3> selector;
208 selector.Init<false>(channelWeights, endPoints, range);
209
210 MUInt15 indexes[16];
211
212 MFloat paranoidFactors[3];
213 for (int ch = 0; ch < 3; ch++)
214 paranoidFactors[ch] = ParanoidFactorForSpan(ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[0][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[1][ch]));
215
216 MFloat error = ParallelMath::MakeFloatZero();
217 AggregatedError<3> aggError;
218 for (int px = 0; px < 16; px++)
219 {
220 MUInt15 index = selector.SelectIndexLDR(floatPixels[px], rtn);
221 indexes[px] = index;
222
223 if (refiner)
224 refiner->ContributeUnweightedPW(preWeightedPixels[px], index);
225
226 MUInt15 reconstructed[3];
227 selector.ReconstructLDRPrecise(index, reconstructed);
228
229 if (flags & Flags::S3TC_Paranoid)
230 {
231 for (int ch = 0; ch < 3; ch++)
232 error = error + ParanoidDiff(reconstructed[ch], pixels[px][ch], paranoidFactors[ch]) * channelWeightsSq[ch];
233 }
234 else
235 BCCommon::ComputeErrorLDR<3>(flags, reconstructed, pixels[px], aggError);
236 }
237
238 if (!(flags & Flags::S3TC_Paranoid))
239 error = aggError.Finalize(flags, channelWeightsSq);
240
241 ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
242
243 if (ParallelMath::AnySet(better))
244 {
245 ParallelMath::Int16CompFlag betterInt16 = ParallelMath::FloatFlagToInt16(better);
246
247 ParallelMath::ConditionalSet(bestError, better, error);
248
249 for (int ep = 0; ep < 2; ep++)
250 for (int ch = 0; ch < 3; ch++)
251 ParallelMath::ConditionalSet(bestEndpoints[ep][ch], betterInt16, endPoints[ep][ch]);
252
253 for (int px = 0; px < 16; px++)
254 ParallelMath::ConditionalSet(bestIndexes[px], betterInt16, indexes[px]);
255
256 ParallelMath::ConditionalSet(bestRange, betterInt16, ParallelMath::MakeUInt15(static_cast<uint16_t>(range)));
257 }
258}
259
260void cvtt::Internal::S3TCComputer::TestCounts(uint32_t flags, const int *counts, int nCounts, const MUInt15 &numElements, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], bool alphaTest,
261 const MFloat floatSortedInputs[16][4], const MFloat preWeightedFloatSortedInputs[16][4], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange,
262 const ParallelMath::RoundTowardNearestForScope* rtn)
263{
264 UNREFERENCED_PARAMETER(alphaTest);
265 UNREFERENCED_PARAMETER(flags);
266
267 EndpointRefiner<3> refiner;
268
269 refiner.Init(nCounts, channelWeights);
270
271 bool escape = false;
272 int e = 0;
273 for (int i = 0; i < nCounts; i++)
274 {
275 for (int n = 0; n < counts[i]; n++)
276 {
277 ParallelMath::Int16CompFlag valid = ParallelMath::Less(ParallelMath::MakeUInt15(static_cast<uint16_t>(n)), numElements);
278 if (!ParallelMath::AnySet(valid))
279 {
280 escape = true;
281 break;
282 }
283
284 if (ParallelMath::AllSet(valid))
285 refiner.ContributeUnweightedPW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));
286 else
287 {
288 MFloat weight = ParallelMath::Select(ParallelMath::Int16FlagToFloat(valid), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(0.0f));
289 refiner.ContributePW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), weight);
290 }
291 }
292
293 if (escape)
294 break;
295 }
296
297 MUInt15 endPoints[2][3];
298 refiner.GetRefinedEndpointsLDR(endPoints, rtn);
299
300 TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, nCounts, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, NULL, rtn);
301}
302
303void cvtt::Internal::S3TCComputer::PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride)
304{
305 UNREFERENCED_PARAMETER(flags);
306 ParallelMath::RoundTowardNearestForScope rtn;
307
308 float weights[1] = { 1.0f };
309
310 MUInt15 pixels[16];
311 MFloat floatPixels[16];
312
313 for (int px = 0; px < 16; px++)
314 {
315 ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
316 floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
317 }
318
319 MUInt15 ep[2][1] = { { ParallelMath::MakeUInt15(0) },{ ParallelMath::MakeUInt15(255) } };
320
321 IndexSelector<1> selector;
322 selector.Init<false>(weights, ep, 16);
323
324 MUInt15 indexes[16];
325
326 for (int px = 0; px < 16; px++)
327 indexes[px] = selector.SelectIndexLDR(&floatPixels[px], &rtn);
328
329 for (int block = 0; block < ParallelMath::ParallelSize; block++)
330 {
331 for (int px = 0; px < 16; px += 2)
332 {
333 int index0 = ParallelMath::Extract(indexes[px], block);
334 int index1 = ParallelMath::Extract(indexes[px + 1], block);
335
336 packedBlocks[px / 2] = static_cast<uint8_t>(index0 | (index1 << 4));
337 }
338
339 packedBlocks += packedBlockStride;
340 }
341}
342
343void cvtt::Internal::S3TCComputer::PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds)
344{
345 if (maxTweakRounds < 1)
346 maxTweakRounds = 1;
347
348 if (numRefineRounds < 1)
349 numRefineRounds = 1;
350
351 ParallelMath::RoundTowardNearestForScope rtn;
352
353 float oneWeight[1] = { 1.0f };
354
355 MUInt15 pixels[16];
356 MFloat floatPixels[16];
357
358 MUInt15 highTerminal = isSigned ? ParallelMath::MakeUInt15(254) : ParallelMath::MakeUInt15(255);
359 MUInt15 highTerminalMinusOne = highTerminal - ParallelMath::MakeUInt15(1);
360
361 for (int px = 0; px < 16; px++)
362 {
363 ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
364
365 if (isSigned)
366 pixels[px] = ParallelMath::Min(pixels[px], highTerminal);
367
368 floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
369 }
370
371 MUInt15 sortedPixels[16];
372 for (int px = 0; px < 16; px++)
373 sortedPixels[px] = pixels[px];
374
375 for (int sortEnd = 15; sortEnd > 0; sortEnd--)
376 {
377 for (int sortOffset = 0; sortOffset < sortEnd; sortOffset++)
378 {
379 MUInt15 a = sortedPixels[sortOffset];
380 MUInt15 b = sortedPixels[sortOffset + 1];
381
382 sortedPixels[sortOffset] = ParallelMath::Min(a, b);
383 sortedPixels[sortOffset + 1] = ParallelMath::Max(a, b);
384 }
385 }
386
387 MUInt15 zero = ParallelMath::MakeUInt15(0);
388 MUInt15 one = ParallelMath::MakeUInt15(1);
389
390 MUInt15 bestIsFullRange = zero;
391 MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
392 MUInt15 bestEP[2] = { zero, zero };
393 MUInt15 bestIndexes[16] = {
394 zero, zero, zero, zero,
395 zero, zero, zero, zero,
396 zero, zero, zero, zero,
397 zero, zero, zero, zero
398 };
399
400 // Full-precision
401 {
402 MUInt15 minEP = sortedPixels[0];
403 MUInt15 maxEP = sortedPixels[15];
404
405 MFloat base[1] = { ParallelMath::ToFloat(minEP) };
406 MFloat offset[1] = { ParallelMath::ToFloat(maxEP - minEP) };
407
408 UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
409
410 int numTweakRounds = BCCommon::TweakRoundsForRange(8);
411 if (numTweakRounds > maxTweakRounds)
412 numTweakRounds = maxTweakRounds;
413
414 for (int tweak = 0; tweak < numTweakRounds; tweak++)
415 {
416 MUInt15 ep[2][1];
417
418 ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
419
420 for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
421 {
422 EndpointRefiner<1> refiner;
423 refiner.Init(8, oneWeight);
424
425 if (isSigned)
426 for (int epi = 0; epi < 2; epi++)
427 ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
428
429 IndexSelector<1> indexSelector;
430 indexSelector.Init<false>(oneWeight, ep, 8);
431
432 MUInt15 indexes[16];
433
434 AggregatedError<1> aggError;
435 for (int px = 0; px < 16; px++)
436 {
437 MUInt15 index = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
438
439 MUInt15 reconstructedPixel;
440
441 indexSelector.ReconstructLDRPrecise(index, &reconstructedPixel);
442 BCCommon::ComputeErrorLDR<1>(flags, &reconstructedPixel, &pixels[px], aggError);
443
444 if (refinePass != numRefineRounds - 1)
445 refiner.ContributeUnweightedPW(&floatPixels[px], index);
446
447 indexes[px] = index;
448 }
449 MFloat error = aggError.Finalize(flags | Flags::Uniform, oneWeight);
450
451 ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
452 ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
453
454 if (ParallelMath::AnySet(errorBetter16))
455 {
456 bestError = ParallelMath::Min(error, bestError);
457 ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, one);
458 for (int px = 0; px < 16; px++)
459 ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
460
461 for (int epi = 0; epi < 2; epi++)
462 ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
463 }
464
465 if (refinePass != numRefineRounds - 1)
466 refiner.GetRefinedEndpointsLDR(ep, &rtn);
467 }
468 }
469 }
470
471 // Reduced precision with special endpoints
472 {
473 MUInt15 bestHeuristicMin = sortedPixels[0];
474 MUInt15 bestHeuristicMax = sortedPixels[15];
475
476 ParallelMath::Int16CompFlag canTryClipping;
477
478 // In reduced precision, we want try putting endpoints at the reserved indexes at the ends.
479 // The heuristic we use is to assign indexes to the end as long as they aren't off by more than half of the index range.
480 // This will usually not find anything, but it's cheap to check.
481
482 {
483 MUInt15 largestPossibleRange = bestHeuristicMax - bestHeuristicMin; // Max: 255
484 MUInt15 lowestPossibleClearance = ParallelMath::Min(bestHeuristicMin, static_cast<MUInt15>(highTerminal - bestHeuristicMax));
485
486 MUInt15 lowestPossibleClearanceTimes10 = (lowestPossibleClearance << 2) + (lowestPossibleClearance << 4);
487 canTryClipping = ParallelMath::LessOrEqual(lowestPossibleClearanceTimes10, largestPossibleRange);
488 }
489
490 if (ParallelMath::AnySet(canTryClipping))
491 {
492 MUInt15 lowClearances[16];
493 MUInt15 highClearances[16];
494 MUInt15 bestSkipCount = ParallelMath::MakeUInt15(0);
495
496 lowClearances[0] = highClearances[0] = ParallelMath::MakeUInt15(0);
497
498 for (int px = 1; px < 16; px++)
499 {
500 lowClearances[px] = sortedPixels[px - 1];
501 highClearances[px] = highTerminal - sortedPixels[16 - px];
502 }
503
504 for (uint16_t firstIndex = 0; firstIndex < 16; firstIndex++)
505 {
506 uint16_t numSkippedLow = firstIndex;
507
508 MUInt15 lowClearance = lowClearances[firstIndex];
509
510 for (uint16_t lastIndex = firstIndex; lastIndex < 16; lastIndex++)
511 {
512 uint16_t numSkippedHigh = 15 - lastIndex;
513 uint16_t numSkipped = numSkippedLow + numSkippedHigh;
514
515 MUInt15 numSkippedV = ParallelMath::MakeUInt15(numSkipped);
516
517 ParallelMath::Int16CompFlag areMoreSkipped = ParallelMath::Less(bestSkipCount, numSkippedV);
518
519 if (!ParallelMath::AnySet(areMoreSkipped))
520 continue;
521
522 MUInt15 clearance = ParallelMath::Max(highClearances[numSkippedHigh], lowClearance);
523 MUInt15 clearanceTimes10 = (clearance << 2) + (clearance << 4);
524
525 MUInt15 range = sortedPixels[lastIndex] - sortedPixels[firstIndex];
526
527 ParallelMath::Int16CompFlag isBetter = (areMoreSkipped & ParallelMath::LessOrEqual(clearanceTimes10, range));
528 ParallelMath::ConditionalSet(bestHeuristicMin, isBetter, sortedPixels[firstIndex]);
529 ParallelMath::ConditionalSet(bestHeuristicMax, isBetter, sortedPixels[lastIndex]);
530 }
531 }
532 }
533
534 MUInt15 bestSimpleMin = one;
535 MUInt15 bestSimpleMax = highTerminalMinusOne;
536
537 for (int px = 0; px < 16; px++)
538 {
539 ParallelMath::ConditionalSet(bestSimpleMin, ParallelMath::Less(zero, sortedPixels[15 - px]), sortedPixels[15 - px]);
540 ParallelMath::ConditionalSet(bestSimpleMax, ParallelMath::Less(sortedPixels[px], highTerminal), sortedPixels[px]);
541 }
542
543 MUInt15 minEPs[2] = { bestSimpleMin, bestHeuristicMin };
544 MUInt15 maxEPs[2] = { bestSimpleMax, bestHeuristicMax };
545
546 int minEPRange = 2;
547 if (ParallelMath::AllSet(ParallelMath::Equal(minEPs[0], minEPs[1])))
548 minEPRange = 1;
549
550 int maxEPRange = 2;
551 if (ParallelMath::AllSet(ParallelMath::Equal(maxEPs[0], maxEPs[1])))
552 maxEPRange = 1;
553
554 for (int minEPIndex = 0; minEPIndex < minEPRange; minEPIndex++)
555 {
556 for (int maxEPIndex = 0; maxEPIndex < maxEPRange; maxEPIndex++)
557 {
558 MFloat base[1] = { ParallelMath::ToFloat(minEPs[minEPIndex]) };
559 MFloat offset[1] = { ParallelMath::ToFloat(maxEPs[maxEPIndex] - minEPs[minEPIndex]) };
560
561 UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
562
563 int numTweakRounds = BCCommon::TweakRoundsForRange(6);
564 if (numTweakRounds > maxTweakRounds)
565 numTweakRounds = maxTweakRounds;
566
567 for (int tweak = 0; tweak < numTweakRounds; tweak++)
568 {
569 MUInt15 ep[2][1];
570
571 ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
572
573 for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
574 {
575 EndpointRefiner<1> refiner;
576 refiner.Init(6, oneWeight);
577
578 if (isSigned)
579 for (int epi = 0; epi < 2; epi++)
580 ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
581
582 IndexSelector<1> indexSelector;
583 indexSelector.Init<false>(oneWeight, ep, 6);
584
585 MUInt15 indexes[16];
586 MFloat error = ParallelMath::MakeFloatZero();
587
588 for (int px = 0; px < 16; px++)
589 {
590 MUInt15 selectedIndex = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
591
592 MUInt15 reconstructedPixel;
593
594 indexSelector.ReconstructLDRPrecise(selectedIndex, &reconstructedPixel);
595
596 MFloat zeroError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &zero, &pixels[px], 1, oneWeight);
597 MFloat highTerminalError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &highTerminal, &pixels[px], 1, oneWeight);
598 MFloat selectedIndexError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &reconstructedPixel, &pixels[px], 1, oneWeight);
599
600 MFloat bestPixelError = zeroError;
601 MUInt15 index = ParallelMath::MakeUInt15(6);
602
603 ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(ParallelMath::Less(highTerminalError, bestPixelError)), ParallelMath::MakeUInt15(7));
604 bestPixelError = ParallelMath::Min(bestPixelError, highTerminalError);
605
606 ParallelMath::FloatCompFlag selectedIndexBetter = ParallelMath::Less(selectedIndexError, bestPixelError);
607
608 if (ParallelMath::AllSet(selectedIndexBetter))
609 {
610 if (refinePass != numRefineRounds - 1)
611 refiner.ContributeUnweightedPW(&floatPixels[px], selectedIndex);
612 }
613 else
614 {
615 MFloat refineWeight = ParallelMath::Select(selectedIndexBetter, ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloatZero());
616
617 if (refinePass != numRefineRounds - 1)
618 refiner.ContributePW(&floatPixels[px], selectedIndex, refineWeight);
619 }
620
621 ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(selectedIndexBetter), selectedIndex);
622 bestPixelError = ParallelMath::Min(bestPixelError, selectedIndexError);
623
624 error = error + bestPixelError;
625
626 indexes[px] = index;
627 }
628
629 ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
630 ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
631
632 if (ParallelMath::AnySet(errorBetter16))
633 {
634 bestError = ParallelMath::Min(error, bestError);
635 ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, zero);
636 for (int px = 0; px < 16; px++)
637 ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
638
639 for (int epi = 0; epi < 2; epi++)
640 ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
641 }
642
643 if (refinePass != numRefineRounds - 1)
644 refiner.GetRefinedEndpointsLDR(ep, &rtn);
645 }
646 }
647 }
648 }
649 }
650
651 for (int block = 0; block < ParallelMath::ParallelSize; block++)
652 {
653 int ep0 = ParallelMath::Extract(bestEP[0], block);
654 int ep1 = ParallelMath::Extract(bestEP[1], block);
655 int isFullRange = ParallelMath::Extract(bestIsFullRange, block);
656
657 if (isSigned)
658 {
659 ep0 -= 127;
660 ep1 -= 127;
661
662 assert(ep0 >= -127 && ep0 <= 127);
663 assert(ep1 >= -127 && ep1 <= 127);
664 }
665
666
667 bool swapEndpoints = (isFullRange != 0) != (ep0 > ep1);
668
669 if (swapEndpoints)
670 std::swap(ep0, ep1);
671
672 uint16_t dumpBits = 0;
673 int dumpBitsOffset = 0;
674 int dumpByteOffset = 2;
675 packedBlocks[0] = static_cast<uint8_t>(ep0 & 0xff);
676 packedBlocks[1] = static_cast<uint8_t>(ep1 & 0xff);
677
678 int maxValue = (isFullRange != 0) ? 7 : 5;
679
680 for (int px = 0; px < 16; px++)
681 {
682 int index = ParallelMath::Extract(bestIndexes[px], block);
683
684 if (swapEndpoints && index <= maxValue)
685 index = maxValue - index;
686
687 if (index != 0)
688 {
689 if (index == maxValue)
690 index = 1;
691 else if (index < maxValue)
692 index++;
693 }
694
695 assert(index >= 0 && index < 8);
696
697 dumpBits |= static_cast<uint16_t>(index << dumpBitsOffset);
698 dumpBitsOffset += 3;
699
700 if (dumpBitsOffset >= 8)
701 {
702 assert(dumpByteOffset < 8);
703 packedBlocks[dumpByteOffset] = static_cast<uint8_t>(dumpBits & 0xff);
704 dumpBits >>= 8;
705 dumpBitsOffset -= 8;
706 dumpByteOffset++;
707 }
708 }
709
710 assert(dumpBitsOffset == 0);
711 assert(dumpByteOffset == 8);
712
713 packedBlocks += packedBlockStride;
714 }
715}
716
717void cvtt::Internal::S3TCComputer::PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds)
718{
719 ParallelMath::RoundTowardNearestForScope rtn;
720
721 if (numRefineRounds < 1)
722 numRefineRounds = 1;
723
724 if (maxTweakRounds < 1)
725 maxTweakRounds = 1;
726
727 EndpointSelector<3, 8> endpointSelector;
728
729 MUInt15 pixels[16][4];
730 MFloat floatPixels[16][4];
731
732 MFloat preWeightedPixels[16][4];
733
734 for (int px = 0; px < 16; px++)
735 {
736 for (int ch = 0; ch < 4; ch++)
737 ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
738 }
739
740 for (int px = 0; px < 16; px++)
741 {
742 for (int ch = 0; ch < 4; ch++)
743 floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
744 }
745
746 if (alphaTest)
747 {
748 MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(floor(alphaThreshold * 255.0f + 0.5f)));
749
750 for (int px = 0; px < 16; px++)
751 {
752 ParallelMath::Int16CompFlag belowThreshold = ParallelMath::Less(pixels[px][3], threshold);
753 pixels[px][3] = ParallelMath::Select(belowThreshold, ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(255));
754 }
755 }
756
757 BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
758
759 MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
760
761 for (int px = 0; px < 16; px++)
762 minAlpha = ParallelMath::Min(minAlpha, pixels[px][3]);
763
764 MFloat pixelWeights[16];
765 for (int px = 0; px < 16; px++)
766 {
767 pixelWeights[px] = ParallelMath::MakeFloat(1.0f);
768 if (alphaTest)
769 {
770 ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
771
772 ParallelMath::ConditionalSet(pixelWeights[px], ParallelMath::Int16FlagToFloat(isTransparent), ParallelMath::MakeFloatZero());
773 }
774 }
775
776 for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
777 {
778 for (int px = 0; px < 16; px++)
779 endpointSelector.ContributePass(preWeightedPixels[px], pass, pixelWeights[px]);
780
781 endpointSelector.FinishPass(pass);
782 }
783
784 UnfinishedEndpoints<3> ufep = endpointSelector.GetEndpoints(channelWeights);
785
786 MUInt15 bestEndpoints[2][3];
787 MUInt15 bestIndexes[16];
788 MUInt15 bestRange = ParallelMath::MakeUInt15(0);
789 MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
790
791 for (int px = 0; px < 16; px++)
792 bestIndexes[px] = ParallelMath::MakeUInt15(0);
793
794 for (int ep = 0; ep < 2; ep++)
795 for (int ch = 0; ch < 3; ch++)
796 bestEndpoints[ep][ch] = ParallelMath::MakeUInt15(0);
797
798 if (exhaustive)
799 {
800 MSInt16 sortBins[16];
801
802 {
803 // Compute an 11-bit index, change it to signed, stuff it in the high bits of the sort bins,
804 // and pack the original indexes into the low bits.
805
806 MUInt15 sortEP[2][3];
807 ufep.FinishLDR(0, 11, sortEP[0], sortEP[1]);
808
809 IndexSelector<3> sortSelector;
810 sortSelector.Init<false>(channelWeights, sortEP, 1 << 11);
811
812 for (int16_t px = 0; px < 16; px++)
813 {
814 MSInt16 sortBin = ParallelMath::LosslessCast<MSInt16>::Cast(sortSelector.SelectIndexLDR(floatPixels[px], &rtn) << 4);
815
816 if (alphaTest)
817 {
818 ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
819
820 ParallelMath::ConditionalSet(sortBin, isTransparent, ParallelMath::MakeSInt16(-16)); // 0xfff0
821 }
822
823 sortBin = sortBin + ParallelMath::MakeSInt16(px);
824
825 sortBins[px] = sortBin;
826 }
827 }
828
829 // Sort bins
830 for (int sortEnd = 1; sortEnd < 16; sortEnd++)
831 {
832 for (int sortLoc = sortEnd; sortLoc > 0; sortLoc--)
833 {
834 MSInt16 a = sortBins[sortLoc];
835 MSInt16 b = sortBins[sortLoc - 1];
836
837 sortBins[sortLoc] = ParallelMath::Max(a, b);
838 sortBins[sortLoc - 1] = ParallelMath::Min(a, b);
839 }
840 }
841
842 MUInt15 firstElement = ParallelMath::MakeUInt15(0);
843 for (uint16_t e = 0; e < 16; e++)
844 {
845 ParallelMath::Int16CompFlag isInvalid = ParallelMath::Less(sortBins[e], ParallelMath::MakeSInt16(0));
846 ParallelMath::ConditionalSet(firstElement, isInvalid, ParallelMath::MakeUInt15(e + 1));
847 if (!ParallelMath::AnySet(isInvalid))
848 break;
849 }
850
851 MUInt15 numElements = ParallelMath::MakeUInt15(16) - firstElement;
852
853 MUInt15 sortedInputs[16][4];
854 MFloat floatSortedInputs[16][4];
855 MFloat pwFloatSortedInputs[16][4];
856
857 for (int e = 0; e < 16; e++)
858 {
859 for (int ch = 0; ch < 4; ch++)
860 sortedInputs[e][ch] = ParallelMath::MakeUInt15(0);
861 }
862
863 for (int block = 0; block < ParallelMath::ParallelSize; block++)
864 {
865 for (int e = ParallelMath::Extract(firstElement, block); e < 16; e++)
866 {
867 ParallelMath::ScalarUInt16 sortBin = ParallelMath::Extract(sortBins[e], block);
868 int originalIndex = (sortBin & 15);
869
870 for (int ch = 0; ch < 4; ch++)
871 ParallelMath::PutUInt15(sortedInputs[15 - e][ch], block, ParallelMath::Extract(pixels[originalIndex][ch], block));
872 }
873 }
874
875 for (int e = 0; e < 16; e++)
876 {
877 for (int ch = 0; ch < 4; ch++)
878 {
879 MFloat f = ParallelMath::ToFloat(sortedInputs[e][ch]);
880 floatSortedInputs[e][ch] = f;
881 pwFloatSortedInputs[e][ch] = f * channelWeights[ch];
882 }
883 }
884
885 for (int n0 = 0; n0 <= 15; n0++)
886 {
887 int remainingFor1 = 16 - n0;
888 if (remainingFor1 == 16)
889 remainingFor1 = 15;
890
891 for (int n1 = 0; n1 <= remainingFor1; n1++)
892 {
893 int remainingFor2 = 16 - n1 - n0;
894 if (remainingFor2 == 16)
895 remainingFor2 = 15;
896
897 for (int n2 = 0; n2 <= remainingFor2; n2++)
898 {
899 int n3 = 16 - n2 - n1 - n0;
900
901 if (n3 == 16)
902 continue;
903
904 int counts[4] = { n0, n1, n2, n3 };
905
906 TestCounts(flags, counts, 4, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
907 }
908 }
909 }
910
911 TestSingleColor(flags, pixels, floatPixels, 4, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
912
913 if (alphaTest)
914 {
915 for (int n0 = 0; n0 <= 15; n0++)
916 {
917 int remainingFor1 = 16 - n0;
918 if (remainingFor1 == 16)
919 remainingFor1 = 15;
920
921 for (int n1 = 0; n1 <= remainingFor1; n1++)
922 {
923 int n2 = 16 - n1 - n0;
924
925 if (n2 == 16)
926 continue;
927
928 int counts[3] = { n0, n1, n2 };
929
930 TestCounts(flags, counts, 3, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
931 }
932 }
933
934 TestSingleColor(flags, pixels, floatPixels, 3, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
935 }
936 }
937 else
938 {
939 int minRange = alphaTest ? 3 : 4;
940
941 for (int range = minRange; range <= 4; range++)
942 {
943 int tweakRounds = BCCommon::TweakRoundsForRange(range);
944 if (tweakRounds > maxTweakRounds)
945 tweakRounds = maxTweakRounds;
946
947 for (int tweak = 0; tweak < tweakRounds; tweak++)
948 {
949 MUInt15 endPoints[2][3];
950
951 ufep.FinishLDR(tweak, range, endPoints[0], endPoints[1]);
952
953 for (int refine = 0; refine < numRefineRounds; refine++)
954 {
955 EndpointRefiner<3> refiner;
956 refiner.Init(range, channelWeights);
957
958 TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, range, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &refiner, &rtn);
959
960 if (refine != numRefineRounds - 1)
961 refiner.GetRefinedEndpointsLDR(endPoints, &rtn);
962 }
963 }
964 }
965 }
966
967 for (int block = 0; block < ParallelMath::ParallelSize; block++)
968 {
969 ParallelMath::ScalarUInt16 range = ParallelMath::Extract(bestRange, block);
970 assert(range == 3 || range == 4);
971
972 ParallelMath::ScalarUInt16 compressedEP[2];
973 for (int ep = 0; ep < 2; ep++)
974 {
975 ParallelMath::ScalarUInt16 endPoint[3];
976 for (int ch = 0; ch < 3; ch++)
977 endPoint[ch] = ParallelMath::Extract(bestEndpoints[ep][ch], block);
978
979 int compressed = (endPoint[0] & 0xf8) << 8;
980 compressed |= (endPoint[1] & 0xfc) << 3;
981 compressed |= (endPoint[2] & 0xf8) >> 3;
982
983 compressedEP[ep] = static_cast<ParallelMath::ScalarUInt16>(compressed);
984 }
985
986 int indexOrder[4];
987
988 if (range == 4)
989 {
990 if (compressedEP[0] == compressedEP[1])
991 {
992 indexOrder[0] = 0;
993 indexOrder[1] = 0;
994 indexOrder[2] = 0;
995 indexOrder[3] = 0;
996 }
997 else if (compressedEP[0] < compressedEP[1])
998 {
999 std::swap(compressedEP[0], compressedEP[1]);
1000 indexOrder[0] = 1;
1001 indexOrder[1] = 3;
1002 indexOrder[2] = 2;
1003 indexOrder[3] = 0;
1004 }
1005 else
1006 {
1007 indexOrder[0] = 0;
1008 indexOrder[1] = 2;
1009 indexOrder[2] = 3;
1010 indexOrder[3] = 1;
1011 }
1012 }
1013 else
1014 {
1015 assert(range == 3);
1016
1017 if (compressedEP[0] > compressedEP[1])
1018 {
1019 std::swap(compressedEP[0], compressedEP[1]);
1020 indexOrder[0] = 1;
1021 indexOrder[1] = 2;
1022 indexOrder[2] = 0;
1023 }
1024 else
1025 {
1026 indexOrder[0] = 0;
1027 indexOrder[1] = 2;
1028 indexOrder[2] = 1;
1029 }
1030 indexOrder[3] = 3;
1031 }
1032
1033 packedBlocks[0] = static_cast<uint8_t>(compressedEP[0] & 0xff);
1034 packedBlocks[1] = static_cast<uint8_t>((compressedEP[0] >> 8) & 0xff);
1035 packedBlocks[2] = static_cast<uint8_t>(compressedEP[1] & 0xff);
1036 packedBlocks[3] = static_cast<uint8_t>((compressedEP[1] >> 8) & 0xff);
1037
1038 for (int i = 0; i < 16; i += 4)
1039 {
1040 int packedIndexes = 0;
1041 for (int subi = 0; subi < 4; subi++)
1042 {
1043 ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[i + subi], block);
1044 packedIndexes |= (indexOrder[index] << (subi * 2));
1045 }
1046
1047 packedBlocks[4 + i / 4] = static_cast<uint8_t>(packedIndexes);
1048 }
1049
1050 packedBlocks += packedBlockStride;
1051 }
1052}
1053
1054#endif
1055