1/*
2Convection Texture Tools
3Copyright (c) 2018-2019 Eric Lasota
4
5Permission is hereby granted, free of charge, to any person obtaining
6a copy of this software and associated documentation files (the
7"Software"), to deal in the Software without restriction, including
8without limitation the rights to use, copy, modify, merge, publish,
9distribute, sublicense, and/or sell copies of the Software, and to
10permit persons to whom the Software is furnished to do so, subject
11to the following conditions:
12
13The above copyright notice and this permission notice shall be included
14in all copies or substantial portions of the Software.
15
16THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24-------------------------------------------------------------------------------------
25
26Portions based on DirectX Texture Library (DirectXTex)
27
28Copyright (c) Microsoft Corporation. All rights reserved.
29Licensed under the MIT License.
30
31http://go.microsoft.com/fwlink/?LinkId=248926
32*/
33#include "ConvectionKernels_Config.h"
34
35#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
36
37#include "ConvectionKernels.h"
38#include "ConvectionKernels_ETC.h"
39#include "ConvectionKernels_ETC1.h"
40#include "ConvectionKernels_ETC2.h"
41#include "ConvectionKernels_ETC2_Rounding.h"
42#include "ConvectionKernels_ParallelMath.h"
43#include "ConvectionKernels_FakeBT709_Rounding.h"
44
45#include <cmath>
46
47const int cvtt::Internal::ETCComputer::g_flipTables[2][2][8] =
48{
49 {
50 { 0, 1, 4, 5, 8, 9, 12, 13 },
51 { 2, 3, 6, 7, 10, 11, 14, 15 }
52 },
53 {
54 { 0, 1, 2, 3, 4, 5, 6, 7 },
55 { 8, 9, 10, 11, 12, 13, 14, 15 }
56 },
57};
58
59cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorUniform(const MUInt15 pixelA[3], const MUInt15 pixelB[3])
60{
61 MSInt16 d0 = ParallelMath::LosslessCast<MSInt16>::Cast(pixelA[0]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixelB[0]);
62 MFloat fd0 = ParallelMath::ToFloat(d0);
63 MFloat error = fd0 * fd0;
64 for (int ch = 1; ch < 3; ch++)
65 {
66 MSInt16 d = ParallelMath::LosslessCast<MSInt16>::Cast(pixelA[ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixelB[ch]);
67 MFloat fd = ParallelMath::ToFloat(d);
68 error = error + fd * fd;
69 }
70 return error;
71}
72
73cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorWeighted(const MUInt15 reconstructed[3], const MFloat preWeightedPixel[3], const Options options)
74{
75 MFloat dr = ParallelMath::ToFloat(reconstructed[0]) * options.redWeight - preWeightedPixel[0];
76 MFloat dg = ParallelMath::ToFloat(reconstructed[1]) * options.greenWeight - preWeightedPixel[1];
77 MFloat db = ParallelMath::ToFloat(reconstructed[2]) * options.blueWeight - preWeightedPixel[2];
78
79 return dr * dr + dg * dg + db * db;
80}
81
82cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorFakeBT709(const MUInt15 reconstructed[3], const MFloat preWeightedPixel[3])
83{
84 MFloat yuv[3];
85 ConvertToFakeBT709(yuv, reconstructed);
86
87 MFloat dy = yuv[0] - preWeightedPixel[0];
88 MFloat du = yuv[1] - preWeightedPixel[1];
89 MFloat dv = yuv[2] - preWeightedPixel[2];
90
91 return dy * dy + du * du + dv * dv;
92}
93
94void cvtt::Internal::ETCComputer::TestHalfBlock(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[8][3], const MFloat preWeightedPixels[8][3], const MSInt16 modifiers[4], bool isDifferential, const Options &options)
95{
96 MUInt15 quantized[3];
97 MUInt15 unquantized[3];
98
99 for (int ch = 0; ch < 3; ch++)
100 {
101 quantized[ch] = (ParallelMath::RightShift(quantizedPackedColor, (ch * 5)) & ParallelMath::MakeUInt15(31));
102
103 if (isDifferential)
104 unquantized[ch] = (quantized[ch] << 3) | ParallelMath::RightShift(quantized[ch], 2);
105 else
106 unquantized[ch] = (quantized[ch] << 4) | quantized[ch];
107 }
108
109 MUInt16 selectors = ParallelMath::MakeUInt16(0);
110 MFloat totalError = ParallelMath::MakeFloatZero();
111
112 MUInt15 u15_255 = ParallelMath::MakeUInt15(255);
113 MSInt16 s16_zero = ParallelMath::MakeSInt16(0);
114
115 MUInt15 unquantizedModified[4][3];
116 for (unsigned int s = 0; s < 4; s++)
117 for (int ch = 0; ch < 3; ch++)
118 unquantizedModified[s][ch] = ParallelMath::Min(ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::ToSInt16(unquantized[ch]) + modifiers[s], s16_zero)), u15_255);
119
120 bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
121 bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
122
123 for (int px = 0; px < 8; px++)
124 {
125 MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
126 MUInt16 bestSelector = ParallelMath::MakeUInt16(0);
127
128 for (unsigned int s = 0; s < 4; s++)
129 {
130 MFloat error;
131 if (isFakeBT709)
132 error = ComputeErrorFakeBT709(unquantizedModified[s], preWeightedPixels[px]);
133 else if (isUniform)
134 error = ComputeErrorUniform(pixels[px], unquantizedModified[s]);
135 else
136 error = ComputeErrorWeighted(unquantizedModified[s], preWeightedPixels[px], options);
137
138 ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
139 bestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt16(s), bestSelector);
140 bestError = ParallelMath::Min(error, bestError);
141 }
142
143 totalError = totalError + bestError;
144 selectors = selectors | (bestSelector << (px * 2));
145 }
146
147 outError = totalError;
148 outSelectors = selectors;
149}
150
151void cvtt::Internal::ETCComputer::TestHalfBlockPunchthrough(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[8][3], const MFloat preWeightedPixels[8][3], const ParallelMath::Int16CompFlag isTransparent[8], const MUInt15 modifier, const Options &options)
152{
153 MUInt15 quantized[3];
154 MUInt15 unquantized[3];
155
156 for (int ch = 0; ch < 3; ch++)
157 {
158 quantized[ch] = (ParallelMath::RightShift(quantizedPackedColor, (ch * 5)) & ParallelMath::MakeUInt15(31));
159 unquantized[ch] = (quantized[ch] << 3) | ParallelMath::RightShift(quantized[ch], 2);
160 }
161
162 MUInt16 selectors = ParallelMath::MakeUInt16(0);
163 MFloat totalError = ParallelMath::MakeFloatZero();
164
165 MUInt15 u15_255 = ParallelMath::MakeUInt15(255);
166 MSInt16 s16_zero = ParallelMath::MakeSInt16(0);
167
168 MUInt15 unquantizedModified[3][3];
169 for (int ch = 0; ch < 3; ch++)
170 {
171 unquantizedModified[0][ch] = ParallelMath::Max(unquantized[ch], modifier) - modifier;
172 unquantizedModified[1][ch] = unquantized[ch];
173 unquantizedModified[2][ch] = ParallelMath::Min(unquantized[ch] + modifier, u15_255);
174 }
175
176 bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
177 bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
178
179 for (int px = 0; px < 8; px++)
180 {
181 ParallelMath::FloatCompFlag isTransparentFloat = ParallelMath::Int16FlagToFloat(isTransparent[px]);
182
183 MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
184 MUInt15 bestSelector = ParallelMath::MakeUInt15(0);
185
186 for (unsigned int s = 0; s < 3; s++)
187 {
188 MFloat error;
189 if (isFakeBT709)
190 error = ComputeErrorFakeBT709(unquantizedModified[s], preWeightedPixels[px]);
191 else if (isUniform)
192 error = ComputeErrorUniform(pixels[px], unquantizedModified[s]);
193 else
194 error = ComputeErrorWeighted(unquantizedModified[s], preWeightedPixels[px], options);
195
196 ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
197 bestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(s), bestSelector);
198 bestError = ParallelMath::Min(error, bestError);
199 }
200
201 // Annoying quirk: The ETC encoding machinery assumes that selectors are in the table order in the spec, which isn't
202 // the same as their encoding bits, so the transparent index is actually 1 and the valid indexes are 0, 2, and 3.
203
204 // Remap selector 1 to 2, and 2 to 3
205 bestSelector = ParallelMath::Min(ParallelMath::MakeUInt15(3), bestSelector << 1);
206
207 // Mark zero transparent as
208 ParallelMath::ConditionalSet(bestError, isTransparentFloat, ParallelMath::MakeFloatZero());
209 ParallelMath::ConditionalSet(bestSelector, isTransparent[px], ParallelMath::MakeUInt15(1));
210
211 totalError = totalError + bestError;
212 selectors = selectors | (ParallelMath::LosslessCast<MUInt16>::Cast(bestSelector) << (px * 2));
213 }
214
215 outError = totalError;
216 outSelectors = selectors;
217}
218
219void cvtt::Internal::ETCComputer::FindBestDifferentialCombination(int flip, int d, const ParallelMath::Int16CompFlag canIgnoreSector[2], ParallelMath::Int16CompFlag& bestIsThisMode, MFloat& bestTotalError, MUInt15& bestFlip, MUInt15& bestD, MUInt15 bestColors[2], MUInt16 bestSelectors[2], MUInt15 bestTables[2], DifferentialResolveStorage &drs)
220{
221 // We do this part scalar because most of the cost benefit of parallelization is in error evaluation,
222 // and this code has a LOT of early-outs and disjointed index lookups that vary heavily between blocks
223 // and save a lot of time.
224 for (int block = 0; block < ParallelMath::ParallelSize; block++)
225 {
226 bool canIgnore[2] = { ParallelMath::Extract(canIgnoreSector[0], block), ParallelMath::Extract(canIgnoreSector[1], block) };
227 bool canIgnoreEither = canIgnore[0] || canIgnore[1];
228 float blockBestTotalError = ParallelMath::Extract(bestTotalError, block);
229 float bestDiffErrors[2] = { FLT_MAX, FLT_MAX };
230 uint16_t bestDiffSelectors[2] = { 0, 0 };
231 uint16_t bestDiffColors[2] = { 0, 0 };
232 uint16_t bestDiffTables[2] = { 0, 0 };
233 for (int sector = 0; sector < 2; sector++)
234 {
235 unsigned int sectorNumAttempts = ParallelMath::Extract(drs.diffNumAttempts[sector], block);
236 for (unsigned int i = 0; i < sectorNumAttempts; i++)
237 {
238 float error = ParallelMath::Extract(drs.diffErrors[sector][i], block);
239 if (error < bestDiffErrors[sector])
240 {
241 bestDiffErrors[sector] = error;
242 bestDiffSelectors[sector] = ParallelMath::Extract(drs.diffSelectors[sector][i], block);
243 bestDiffColors[sector] = ParallelMath::Extract(drs.diffColors[sector][i], block);
244 bestDiffTables[sector] = ParallelMath::Extract(drs.diffTables[sector][i], block);
245 }
246 }
247 }
248
249 if (canIgnore[0])
250 bestDiffColors[0] = bestDiffColors[1];
251 else if (canIgnore[1])
252 bestDiffColors[1] = bestDiffColors[0];
253
254 // The best differential possibilities must be better than the best total error
255 if (bestDiffErrors[0] + bestDiffErrors[1] < blockBestTotalError)
256 {
257 // Fast path if the best possible case is legal
258 if (canIgnoreEither || ETCDifferentialIsLegalScalar(bestDiffColors[0], bestDiffColors[1]))
259 {
260 ParallelMath::PutBoolInt16(bestIsThisMode, block, true);
261 ParallelMath::PutFloat(bestTotalError, block, bestDiffErrors[0] + bestDiffErrors[1]);
262 ParallelMath::PutUInt15(bestFlip, block, flip);
263 ParallelMath::PutUInt15(bestD, block, d);
264 for (int sector = 0; sector < 2; sector++)
265 {
266 ParallelMath::PutUInt15(bestColors[sector], block, bestDiffColors[sector]);
267 ParallelMath::PutUInt16(bestSelectors[sector], block, bestDiffSelectors[sector]);
268 ParallelMath::PutUInt15(bestTables[sector], block, bestDiffTables[sector]);
269 }
270 }
271 else
272 {
273 // Slow path: Sort the possible cases by quality, and search valid combinations
274 // TODO: Pre-flatten the error lists so this is nicer to cache
275 unsigned int numSortIndexes[2] = { 0, 0 };
276 for (int sector = 0; sector < 2; sector++)
277 {
278 unsigned int sectorNumAttempts = ParallelMath::Extract(drs.diffNumAttempts[sector], block);
279
280 for (unsigned int i = 0; i < sectorNumAttempts; i++)
281 {
282 if (ParallelMath::Extract(drs.diffErrors[sector][i], block) < blockBestTotalError)
283 drs.attemptSortIndexes[sector][numSortIndexes[sector]++] = i;
284 }
285
286 struct SortPredicate
287 {
288 const MFloat *diffErrors;
289 int block;
290
291 bool operator()(uint16_t a, uint16_t b) const
292 {
293 float errorA = ParallelMath::Extract(diffErrors[a], block);
294 float errorB = ParallelMath::Extract(diffErrors[b], block);
295
296 if (errorA < errorB)
297 return true;
298 if (errorA > errorB)
299 return false;
300
301 return a < b;
302 }
303 };
304
305 SortPredicate sp;
306 sp.diffErrors = drs.diffErrors[sector];
307 sp.block = block;
308
309 std::sort<uint16_t*, const SortPredicate&>(drs.attemptSortIndexes[sector], drs.attemptSortIndexes[sector] + numSortIndexes[sector], sp);
310 }
311
312 int scannedElements = 0;
313 for (unsigned int i = 0; i < numSortIndexes[0]; i++)
314 {
315 unsigned int attemptIndex0 = drs.attemptSortIndexes[0][i];
316 float error0 = ParallelMath::Extract(drs.diffErrors[0][attemptIndex0], block);
317
318 scannedElements++;
319
320 if (error0 >= blockBestTotalError)
321 break;
322
323 float maxError1 = ParallelMath::Extract(bestTotalError, block) - error0;
324 uint16_t diffColor0 = ParallelMath::Extract(drs.diffColors[0][attemptIndex0], block);
325
326 if (maxError1 < bestDiffErrors[1])
327 break;
328
329 for (unsigned int j = 0; j < numSortIndexes[1]; j++)
330 {
331 unsigned int attemptIndex1 = drs.attemptSortIndexes[1][j];
332 float error1 = ParallelMath::Extract(drs.diffErrors[1][attemptIndex1], block);
333
334 scannedElements++;
335
336 if (error1 >= maxError1)
337 break;
338
339 uint16_t diffColor1 = ParallelMath::Extract(drs.diffColors[1][attemptIndex1], block);
340
341 if (ETCDifferentialIsLegalScalar(diffColor0, diffColor1))
342 {
343 blockBestTotalError = error0 + error1;
344
345 ParallelMath::PutBoolInt16(bestIsThisMode, block, true);
346 ParallelMath::PutFloat(bestTotalError, block, blockBestTotalError);
347 ParallelMath::PutUInt15(bestFlip, block, flip);
348 ParallelMath::PutUInt15(bestD, block, d);
349 ParallelMath::PutUInt15(bestColors[0], block, diffColor0);
350 ParallelMath::PutUInt15(bestColors[1], block, diffColor1);
351 ParallelMath::PutUInt16(bestSelectors[0], block, ParallelMath::Extract(drs.diffSelectors[0][attemptIndex0], block));
352 ParallelMath::PutUInt16(bestSelectors[1], block, ParallelMath::Extract(drs.diffSelectors[1][attemptIndex1], block));
353 ParallelMath::PutUInt15(bestTables[0], block, ParallelMath::Extract(drs.diffTables[0][attemptIndex0], block));
354 ParallelMath::PutUInt15(bestTables[1], block, ParallelMath::Extract(drs.diffTables[1][attemptIndex1], block));
355 break;
356 }
357 }
358 }
359 }
360 }
361 }
362}
363
364cvtt::ParallelMath::Int16CompFlag cvtt::Internal::ETCComputer::ETCDifferentialIsLegalForChannel(const MUInt15 &a, const MUInt15 &b)
365{
366 MSInt16 diff = ParallelMath::LosslessCast<MSInt16>::Cast(b) - ParallelMath::LosslessCast<MSInt16>::Cast(a);
367
368 return ParallelMath::Less(ParallelMath::MakeSInt16(-5), diff) & ParallelMath::Less(diff, ParallelMath::MakeSInt16(4));
369}
370
371cvtt::ParallelMath::Int16CompFlag cvtt::Internal::ETCComputer::ETCDifferentialIsLegal(const MUInt15 &a, const MUInt15 &b)
372{
373 MUInt15 mask = ParallelMath::MakeUInt15(31);
374
375 return ETCDifferentialIsLegalForChannel(ParallelMath::RightShift(a, 10), ParallelMath::RightShift(b, 10))
376 & ETCDifferentialIsLegalForChannel(ParallelMath::RightShift(a, 5) & mask, ParallelMath::RightShift(b, 5) & mask)
377 & ETCDifferentialIsLegalForChannel(a & mask, b & mask);
378}
379
380bool cvtt::Internal::ETCComputer::ETCDifferentialIsLegalForChannelScalar(const uint16_t &a, const uint16_t &b)
381{
382 int16_t diff = static_cast<int16_t>(b) - static_cast<int16_t>(a);
383
384 return (-4 <= diff) && (diff <= 3);
385}
386
387bool cvtt::Internal::ETCComputer::ETCDifferentialIsLegalScalar(const uint16_t &a, const uint16_t &b)
388{
389 MUInt15 mask = ParallelMath::MakeUInt15(31);
390
391 return ETCDifferentialIsLegalForChannelScalar((a >> 10), (b >> 10))
392 & ETCDifferentialIsLegalForChannelScalar((a >> 5) & 31, (b >> 5) & 31)
393 & ETCDifferentialIsLegalForChannelScalar(a & 31, b & 31);
394}
395
396void cvtt::Internal::ETCComputer::EncodeTMode(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag isIsolated[16], const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const Options &options)
397{
398 bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
399 bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
400
401 ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
402
403 MUInt15 isolatedTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
404 MUInt15 lineTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
405
406 MUInt15 numPixelsIsolated = ParallelMath::MakeUInt15(0);
407
408 // To speed this up, we compute line total as the sum, then subtract out isolated
409 for (unsigned int px = 0; px < 16; px++)
410 {
411 for (int ch = 0; ch < 3; ch++)
412 {
413 isolatedTotal[ch] = isolatedTotal[ch] + ParallelMath::SelectOrZero(isIsolated[px], pixels[px][ch]);
414 lineTotal[ch] = lineTotal[ch] + pixels[px][ch];
415 }
416 numPixelsIsolated = numPixelsIsolated + ParallelMath::SelectOrZero(isIsolated[px], ParallelMath::MakeUInt15(1));
417 }
418
419 for (int ch = 0; ch < 3; ch++)
420 lineTotal[ch] = lineTotal[ch] - isolatedTotal[ch];
421
422 MUInt15 numPixelsLine = ParallelMath::MakeUInt15(16) - numPixelsIsolated;
423
424 MUInt15 isolatedAverageQuantized[3];
425 MUInt15 isolatedAverageTargets[3];
426 {
427 int divisors[ParallelMath::ParallelSize];
428 for (int block = 0; block < ParallelMath::ParallelSize; block++)
429 divisors[block] = ParallelMath::Extract(numPixelsIsolated, block) * 34;
430
431 MUInt15 addend = (numPixelsIsolated << 4) | numPixelsIsolated;
432 for (int ch = 0; ch < 3; ch++)
433 {
434 // isolatedAverageQuantized[ch] = (isolatedTotal[ch] * 2 + numPixelsIsolated * 17) / (numPixelsIsolated * 34);
435
436 MUInt15 numerator = isolatedTotal[ch] + isolatedTotal[ch];
437 if (!isFakeBT709)
438 numerator = numerator + addend;
439
440 for (int block = 0; block < ParallelMath::ParallelSize; block++)
441 {
442 int divisor = divisors[block];
443 if (divisor == 0)
444 ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, 0);
445 else
446 ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, ParallelMath::Extract(numerator, block) / divisor);
447 }
448
449 isolatedAverageTargets[ch] = numerator;
450 }
451 }
452
453 if (isFakeBT709)
454 ResolveTHFakeBT709Rounding(isolatedAverageQuantized, isolatedAverageTargets, numPixelsIsolated);
455
456 MUInt15 isolatedColor[3];
457 for (int ch = 0; ch < 3; ch++)
458 isolatedColor[ch] = (isolatedAverageQuantized[ch]) | (isolatedAverageQuantized[ch] << 4);
459
460 MFloat isolatedError[16];
461 for (int px = 0; px < 16; px++)
462 {
463 if (isFakeBT709)
464 isolatedError[px] = ComputeErrorFakeBT709(isolatedColor, preWeightedPixels[px]);
465 else if (isUniform)
466 isolatedError[px] = ComputeErrorUniform(pixels[px], isolatedColor);
467 else
468 isolatedError[px] = ComputeErrorWeighted(isolatedColor, preWeightedPixels[px], options);
469 }
470
471 MSInt32 bestSelectors = ParallelMath::MakeSInt32(0);
472 MUInt15 bestTable = ParallelMath::MakeUInt15(0);
473 MUInt15 bestLineColor = ParallelMath::MakeUInt15(0);
474
475 MSInt16 maxLine = ParallelMath::LosslessCast<MSInt16>::Cast(numPixelsLine);
476 MSInt16 minLine = ParallelMath::MakeSInt16(0) - maxLine;
477
478 int16_t clusterMaxLine = 0;
479 for (int block = 0; block < ParallelMath::ParallelSize; block++)
480 {
481 int16_t blockMaxLine = ParallelMath::Extract(maxLine, block);
482 if (blockMaxLine > clusterMaxLine)
483 clusterMaxLine = blockMaxLine;
484 }
485
486 int16_t clusterMinLine = -clusterMaxLine;
487
488 int lineDivisors[ParallelMath::ParallelSize];
489 for (int block = 0; block < ParallelMath::ParallelSize; block++)
490 lineDivisors[block] = ParallelMath::Extract(numPixelsLine, block) * 34;
491
492 MUInt15 lineAddend = (numPixelsLine << 4) | numPixelsLine;
493
494 for (int table = 0; table < 8; table++)
495 {
496 int numUniqueColors[ParallelMath::ParallelSize];
497 MUInt15 uniqueQuantizedColors[31];
498
499 for (int block = 0; block < ParallelMath::ParallelSize; block++)
500 numUniqueColors[block] = 0;
501
502 MUInt15 modifier = ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]);
503 MUInt15 modifierOffset = (modifier + modifier);
504
505 for (int16_t offsetPremultiplier = clusterMinLine; offsetPremultiplier <= clusterMaxLine; offsetPremultiplier++)
506 {
507 MSInt16 clampedOffsetPremultiplier = ParallelMath::Max(minLine, ParallelMath::Min(maxLine, ParallelMath::MakeSInt16(offsetPremultiplier)));
508 MSInt16 modifierAddend = ParallelMath::CompactMultiply(clampedOffsetPremultiplier, modifierOffset);
509
510 MUInt15 quantized[3];
511 if (isFakeBT709)
512 {
513 MUInt15 targets[3];
514 for (int ch = 0; ch < 3; ch++)
515 {
516 //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));
517 MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch]) + modifierAddend));
518 MUInt15 divided = ParallelMath::MakeUInt15(0);
519 for (int block = 0; block < ParallelMath::ParallelSize; block++)
520 {
521 int divisor = lineDivisors[block];
522 if (divisor == 0)
523 ParallelMath::PutUInt15(divided, block, 0);
524 else
525 ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
526 }
527 quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
528 targets[ch] = numerator;
529 }
530
531 ResolveTHFakeBT709Rounding(quantized, targets, numPixelsLine);
532 }
533 else
534 {
535 for (int ch = 0; ch < 3; ch++)
536 {
537 //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + numDAIILine * 17 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));
538 MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch] + lineAddend) + modifierAddend));
539 MUInt15 divided = ParallelMath::MakeUInt15(0);
540 for (int block = 0; block < ParallelMath::ParallelSize; block++)
541 {
542 int divisor = lineDivisors[block];
543 if (divisor == 0)
544 ParallelMath::PutUInt15(divided, block, 0);
545 else
546 ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
547 }
548 quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
549 }
550 }
551
552 MUInt15 packedColor = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10);
553
554 for (int block = 0; block < ParallelMath::ParallelSize; block++)
555 {
556 uint16_t blockPackedColor = ParallelMath::Extract(packedColor, block);
557 if (numUniqueColors[block] == 0 || blockPackedColor != ParallelMath::Extract(uniqueQuantizedColors[numUniqueColors[block] - 1], block))
558 ParallelMath::PutUInt15(uniqueQuantizedColors[numUniqueColors[block]++], block, blockPackedColor);
559 }
560 }
561
562 // Stripe unfilled unique colors
563 int maxUniqueColors = 0;
564 for (int block = 0; block < ParallelMath::ParallelSize; block++)
565 {
566 if (numUniqueColors[block] > maxUniqueColors)
567 maxUniqueColors = numUniqueColors[block];
568 }
569
570 for (int block = 0; block < ParallelMath::ParallelSize; block++)
571 {
572 uint16_t fillColor = ParallelMath::Extract(uniqueQuantizedColors[0], block);
573
574 int numUnique = numUniqueColors[block];
575 for (int fill = numUnique + 1; fill < maxUniqueColors; fill++)
576 ParallelMath::PutUInt15(uniqueQuantizedColors[fill], block, fillColor);
577 }
578
579 for (int ci = 0; ci < maxUniqueColors; ci++)
580 {
581 MUInt15 lineColors[3][3];
582 for (int ch = 0; ch < 3; ch++)
583 {
584 MUInt15 quantizedChannel = (ParallelMath::RightShift(uniqueQuantizedColors[ci], (ch * 5)) & ParallelMath::MakeUInt15(15));
585
586 MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;
587 lineColors[0][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantizedColor + modifier);
588 lineColors[1][ch] = unquantizedColor;
589 lineColors[2][ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
590 }
591
592 MSInt32 selectors = ParallelMath::MakeSInt32(0);
593 MFloat error = ParallelMath::MakeFloatZero();
594 for (int px = 0; px < 16; px++)
595 {
596 MFloat pixelError = isolatedError[px];
597
598 MUInt15 pixelBestSelector = ParallelMath::MakeUInt15(0);
599 for (int i = 0; i < 3; i++)
600 {
601 MFloat error = isUniform ? ComputeErrorUniform(lineColors[i], pixels[px]) : ComputeErrorWeighted(lineColors[i], preWeightedPixels[px], options);
602 ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, pixelError);
603 pixelError = ParallelMath::Min(error, pixelError);
604 pixelBestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(i + 1), pixelBestSelector);
605 }
606
607 error = error + pixelError;
608 selectors = selectors | (ParallelMath::ToInt32(pixelBestSelector) << (px * 2));
609 }
610
611 ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
612 bestError = ParallelMath::Min(error, bestError);
613
614 if (ParallelMath::AnySet(errorBetter))
615 {
616 ParallelMath::ConditionalSet(bestLineColor, errorBetter, uniqueQuantizedColors[ci]);
617 ParallelMath::ConditionalSet(bestSelectors, errorBetter, selectors);
618 ParallelMath::ConditionalSet(bestTable, errorBetter, ParallelMath::MakeUInt15(table));
619 bestIsThisMode = bestIsThisMode | errorBetter;
620 }
621 }
622 }
623
624 for (int block = 0; block < ParallelMath::ParallelSize; block++)
625 {
626 if (ParallelMath::Extract(bestIsThisMode, block))
627 {
628 uint32_t lowBits = 0;
629 uint32_t highBits = 0;
630
631 uint16_t blockBestLineColor = ParallelMath::Extract(bestLineColor, block);
632 ParallelMath::ScalarUInt16 blockIsolatedAverageQuantized[3];
633
634 for (int ch = 0; ch < 3; ch++)
635 blockIsolatedAverageQuantized[ch] = ParallelMath::Extract(isolatedAverageQuantized[ch], block);
636
637 uint16_t blockBestTable = ParallelMath::Extract(bestTable, block);
638 int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);
639
640 ParallelMath::ScalarUInt16 lineColor[3];
641 for (int ch = 0; ch < 3; ch++)
642 lineColor[ch] = (blockBestLineColor >> (ch * 5)) & 15;
643
644 EmitTModeBlock(outputBuffer + block * 8, lineColor, blockIsolatedAverageQuantized, blockBestSelectors, blockBestTable, true);
645 }
646 }
647}
648
649void cvtt::Internal::ETCComputer::EncodeHMode(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag groupings[16], const MUInt15 pixels[16][3], HModeEval &he, const MFloat preWeightedPixels[16][3], const Options &options)
650{
651 bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
652 bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
653
654 MUInt15 zero15 = ParallelMath::MakeUInt15(0);
655
656 MUInt15 counts[2] = { zero15, zero15 };
657
658 ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
659
660 MUInt15 totals[2][3] =
661 {
662 { zero15, zero15, zero15 },
663 { zero15, zero15, zero15 }
664 };
665
666 for (unsigned int px = 0; px < 16; px++)
667 {
668 for (int ch = 0; ch < 3; ch++)
669 {
670 totals[0][ch] = totals[0][ch] + pixels[px][ch];
671 totals[1][ch] = totals[1][ch] + ParallelMath::SelectOrZero(groupings[px], pixels[px][ch]);
672 }
673 counts[1] = counts[1] + ParallelMath::SelectOrZero(groupings[px], ParallelMath::MakeUInt15(1));
674 }
675
676 for (int ch = 0; ch < 3; ch++)
677 totals[0][ch] = totals[0][ch] - totals[1][ch];
678 counts[0] = ParallelMath::MakeUInt15(16) - counts[1];
679
680 MUInt16 bestSectorBits = ParallelMath::MakeUInt16(0);
681 MUInt16 bestSignBits = ParallelMath::MakeUInt16(0);
682 MUInt15 bestColors[2] = { zero15, zero15 };
683 MUInt15 bestTable = ParallelMath::MakeUInt15(0);
684
685 for (int table = 0; table < 8; table++)
686 {
687 MUInt15 numUniqueColors = zero15;
688
689 int modifier = cvtt::Tables::ETC1::g_thModifierTable[table];
690
691 for (int sector = 0; sector < 2; sector++)
692 {
693 for (int block = 0; block < ParallelMath::ParallelSize; block++)
694 {
695 int blockNumUniqueColors = 0;
696 uint16_t blockUniqueQuantizedColors[31];
697
698 int maxOffsetMultiplier = ParallelMath::Extract(counts[sector], block);
699 int minOffsetMultiplier = -maxOffsetMultiplier;
700
701 int modifierOffset = modifier * 2;
702
703 int blockSectorCounts = ParallelMath::Extract(counts[sector], block);
704 int blockSectorTotals[3];
705 for (int ch = 0; ch < 3; ch++)
706 blockSectorTotals[ch] = ParallelMath::Extract(totals[sector][ch], block);
707
708 for (int offsetPremultiplier = minOffsetMultiplier; offsetPremultiplier <= maxOffsetMultiplier; offsetPremultiplier++)
709 {
710 // TODO: This isn't ideal for FakeBT709
711 int16_t quantized[3];
712 for (int ch = 0; ch < 3; ch++)
713 {
714 if (blockSectorCounts == 0)
715 quantized[ch] = 0;
716 else
717 quantized[ch] = std::min<int16_t>(15, std::max<int16_t>(0, (blockSectorTotals[ch] * 2 + blockSectorCounts * 17 + modifierOffset * offsetPremultiplier)) / (blockSectorCounts * 34));
718 }
719
720 uint16_t packedColor = (quantized[0] << 10) | (quantized[1] << 5) | quantized[2];
721 if (blockNumUniqueColors == 0 || packedColor != blockUniqueQuantizedColors[blockNumUniqueColors - 1])
722 {
723 assert(blockNumUniqueColors < 32);
724 blockUniqueQuantizedColors[blockNumUniqueColors++] = packedColor;
725 }
726 }
727
728 ParallelMath::PutUInt15(he.numUniqueColors[sector], block, blockNumUniqueColors);
729
730 int baseIndex = 0;
731 if (sector == 1)
732 baseIndex = ParallelMath::Extract(he.numUniqueColors[0], block);
733
734 for (int i = 0; i < blockNumUniqueColors; i++)
735 ParallelMath::PutUInt15(he.uniqueQuantizedColors[baseIndex + i], block, blockUniqueQuantizedColors[i]);
736 }
737 }
738
739 MUInt15 totalColors = he.numUniqueColors[0] + he.numUniqueColors[1];
740 int maxErrorColors = 0;
741 for (int block = 0; block < ParallelMath::ParallelSize; block++)
742 maxErrorColors = std::max<int>(maxErrorColors, ParallelMath::Extract(totalColors, block));
743
744 for (int block = 0; block < ParallelMath::ParallelSize; block++)
745 {
746 int lastColor = ParallelMath::Extract(totalColors, block);
747 uint16_t stripeColor = ParallelMath::Extract(he.uniqueQuantizedColors[0], block);
748 for (int i = lastColor; i < maxErrorColors; i++)
749 ParallelMath::PutUInt15(he.uniqueQuantizedColors[i], block, stripeColor);
750 }
751
752 for (int ci = 0; ci < maxErrorColors; ci++)
753 {
754 MUInt15 fifteen = ParallelMath::MakeUInt15(15);
755 MUInt15 twoFiftyFive = ParallelMath::MakeUInt15(255);
756 MSInt16 zeroS16 = ParallelMath::MakeSInt16(0);
757
758 MUInt15 colors[2][3];
759 for (int ch = 0; ch < 3; ch++)
760 {
761 MUInt15 quantizedChannel = ParallelMath::RightShift(he.uniqueQuantizedColors[ci], ((2 - ch) * 5)) & fifteen;
762
763 MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;
764 colors[0][ch] = ParallelMath::Min(twoFiftyFive, unquantizedColor + modifier);
765 colors[1][ch] = ParallelMath::ToUInt15(ParallelMath::Max(zeroS16, ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::MakeSInt16(modifier)));
766 }
767
768 MUInt16 signBits = ParallelMath::MakeUInt16(0);
769 for (int px = 0; px < 16; px++)
770 {
771 MFloat errors[2];
772 for (int i = 0; i < 2; i++)
773 {
774 if (isFakeBT709)
775 errors[i] = ComputeErrorFakeBT709(colors[i], preWeightedPixels[px]);
776 else if (isUniform)
777 errors[i] = ComputeErrorUniform(colors[i], pixels[px]);
778 else
779 errors[i] = ComputeErrorWeighted(colors[i], preWeightedPixels[px], options);
780 }
781
782 ParallelMath::Int16CompFlag errorOneLess = ParallelMath::FloatFlagToInt16(ParallelMath::Less(errors[1], errors[0]));
783 he.errors[ci][px] = ParallelMath::Min(errors[0], errors[1]);
784 signBits = signBits | ParallelMath::SelectOrZero(errorOneLess, ParallelMath::MakeUInt16(1 << px));
785 }
786 he.signBits[ci] = signBits;
787 }
788
789 int maxUniqueColorCombos = 0;
790 for (int block = 0; block < ParallelMath::ParallelSize; block++)
791 {
792 int numUniqueColorCombos = ParallelMath::Extract(he.numUniqueColors[0], block) * ParallelMath::Extract(he.numUniqueColors[1], block);
793 if (numUniqueColorCombos > maxUniqueColorCombos)
794 maxUniqueColorCombos = numUniqueColorCombos;
795 }
796
797 MUInt15 indexes[2] = { zero15, zero15 };
798 MUInt15 maxIndex[2] = { he.numUniqueColors[0] - ParallelMath::MakeUInt15(1), he.numUniqueColors[1] - ParallelMath::MakeUInt15(1) };
799
800 int block1Starts[ParallelMath::ParallelSize];
801 for (int block = 0; block < ParallelMath::ParallelSize; block++)
802 block1Starts[block] = ParallelMath::Extract(he.numUniqueColors[0], block);
803
804 for (int combo = 0; combo < maxUniqueColorCombos; combo++)
805 {
806 MUInt15 index0 = indexes[0] + ParallelMath::MakeUInt15(1);
807 ParallelMath::Int16CompFlag index0Overflow = ParallelMath::Less(maxIndex[0], index0);
808 ParallelMath::ConditionalSet(index0, index0Overflow, ParallelMath::MakeUInt15(0));
809
810 MUInt15 index1 = ParallelMath::Min(maxIndex[1], indexes[1] + ParallelMath::SelectOrZero(index0Overflow, ParallelMath::MakeUInt15(1)));
811 indexes[0] = index0;
812 indexes[1] = index1;
813
814 int ci0[ParallelMath::ParallelSize];
815 int ci1[ParallelMath::ParallelSize];
816 MUInt15 color0;
817 MUInt15 color1;
818
819 for (int block = 0; block < ParallelMath::ParallelSize; block++)
820 {
821 ci0[block] = ParallelMath::Extract(index0, block);
822 ci1[block] = ParallelMath::Extract(index1, block) + block1Starts[block];
823 ParallelMath::PutUInt15(color0, block, ParallelMath::Extract(he.uniqueQuantizedColors[ci0[block]], block));
824 ParallelMath::PutUInt15(color1, block, ParallelMath::Extract(he.uniqueQuantizedColors[ci1[block]], block));
825 }
826
827 MFloat totalError = ParallelMath::MakeFloatZero();
828 MUInt16 sectorBits = ParallelMath::MakeUInt16(0);
829 MUInt16 signBits = ParallelMath::MakeUInt16(0);
830 for (int px = 0; px < 16; px++)
831 {
832 MFloat errorCI0;
833 MFloat errorCI1;
834 MUInt16 signBits0;
835 MUInt16 signBits1;
836
837 for (int block = 0; block < ParallelMath::ParallelSize; block++)
838 {
839 ParallelMath::PutFloat(errorCI0, block, ParallelMath::Extract(he.errors[ci0[block]][px], block));
840 ParallelMath::PutFloat(errorCI1, block, ParallelMath::Extract(he.errors[ci1[block]][px], block));
841 ParallelMath::PutUInt16(signBits0, block, ParallelMath::Extract(he.signBits[ci0[block]], block));
842 ParallelMath::PutUInt16(signBits1, block, ParallelMath::Extract(he.signBits[ci1[block]], block));
843 }
844
845 totalError = totalError + ParallelMath::Min(errorCI0, errorCI1);
846
847 MUInt16 bitPosition = ParallelMath::MakeUInt16(1 << px);
848
849 ParallelMath::Int16CompFlag error1Better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(errorCI1, errorCI0));
850
851 sectorBits = sectorBits | ParallelMath::SelectOrZero(error1Better, bitPosition);
852 signBits = signBits | (bitPosition & ParallelMath::Select(error1Better, signBits1, signBits0));
853 }
854
855 ParallelMath::FloatCompFlag totalErrorBetter = ParallelMath::Less(totalError, bestError);
856 ParallelMath::Int16CompFlag totalErrorBetter16 = ParallelMath::FloatFlagToInt16(totalErrorBetter);
857 if (ParallelMath::AnySet(totalErrorBetter16))
858 {
859 bestIsThisMode = bestIsThisMode | totalErrorBetter16;
860 ParallelMath::ConditionalSet(bestTable, totalErrorBetter16, ParallelMath::MakeUInt15(table));
861 ParallelMath::ConditionalSet(bestColors[0], totalErrorBetter16, color0);
862 ParallelMath::ConditionalSet(bestColors[1], totalErrorBetter16, color1);
863 ParallelMath::ConditionalSet(bestSectorBits, totalErrorBetter16, sectorBits);
864 ParallelMath::ConditionalSet(bestSignBits, totalErrorBetter16, signBits);
865 bestError = ParallelMath::Min(totalError, bestError);
866 }
867 }
868 }
869
870 if (ParallelMath::AnySet(bestIsThisMode))
871 {
872 for (int block = 0; block < ParallelMath::ParallelSize; block++)
873 {
874 if (!ParallelMath::Extract(bestIsThisMode, block))
875 continue;
876
877 ParallelMath::ScalarUInt16 blockBestColors[2] = { ParallelMath::Extract(bestColors[0], block), ParallelMath::Extract(bestColors[1], block) };
878 ParallelMath::ScalarUInt16 blockBestSectorBits = ParallelMath::Extract(bestSectorBits, block);
879 ParallelMath::ScalarUInt16 blockBestSignBits = ParallelMath::Extract(bestSignBits, block);
880 ParallelMath::ScalarUInt16 blockBestTable = ParallelMath::Extract(bestTable, block);
881
882 EmitHModeBlock(outputBuffer + block * 8, blockBestColors, blockBestSectorBits, blockBestSignBits, blockBestTable, true);
883 }
884 }
885}
886
887void cvtt::Internal::ETCComputer::EncodeVirtualTModePunchthrough(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag isIsolatedBase[16], const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const ParallelMath::Int16CompFlag isTransparent[16], const ParallelMath::Int16CompFlag& anyTransparent, const ParallelMath::Int16CompFlag& allTransparent, const Options &options)
888{
889 // We treat T and H mode as the same mode ("Virtual T mode") with punchthrough, because of how the colors work:
890 //
891 // T mode: C1, C2+M, Transparent, C2-M
892 // H mode: C1+M, C1-M, Transparent, C2-M
893 //
894 // So in either case, we have 2 colors +/- a modifier, and a third unique color, which is basically T mode except without the middle color.
895 // The only thing that matters is whether it's better to store the isolated color as T mode color 1, or store it offset in H mode color 2.
896 //
897 // Sometimes it won't even be possible to store it in H mode color 2 because the table low bit derives from a numeric comparison of the colors,
898 // but unlike opaque blocks, we can't flip them.
899 bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
900 bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
901
902 ParallelMath::FloatCompFlag isTransparentF[16];
903 for (int px = 0; px < 16; px++)
904 isTransparentF[px] = ParallelMath::Int16FlagToFloat(isTransparent[px]);
905
906 ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
907 ParallelMath::Int16CompFlag bestIsHMode = ParallelMath::MakeBoolInt16(false);
908
909 MUInt15 isolatedTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
910 MUInt15 lineTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
911
912 MUInt15 numPixelsIsolated = ParallelMath::MakeUInt15(0);
913 MUInt15 numPixelsLine = ParallelMath::MakeUInt15(0);
914
915 ParallelMath::Int16CompFlag isIsolated[16];
916 ParallelMath::Int16CompFlag isLine[16];
917
918 for (unsigned int px = 0; px < 16; px++)
919 {
920 ParallelMath::Int16CompFlag isOpaque = ParallelMath::Not(isTransparent[px]);
921 isIsolated[px] = isIsolatedBase[px] & isOpaque;
922 isLine[px] = ParallelMath::Not(isIsolatedBase[px]) & isOpaque;
923 }
924
925 for (unsigned int px = 0; px < 16; px++)
926 {
927 for (int ch = 0; ch < 3; ch++)
928 {
929 isolatedTotal[ch] = isolatedTotal[ch] + ParallelMath::SelectOrZero(isIsolated[px], pixels[px][ch]);
930 lineTotal[ch] = lineTotal[ch] + ParallelMath::SelectOrZero(isLine[px], pixels[px][ch]);
931 }
932 numPixelsIsolated = numPixelsIsolated + ParallelMath::SelectOrZero(isIsolated[px], ParallelMath::MakeUInt15(1));
933 numPixelsLine = numPixelsLine + ParallelMath::SelectOrZero(isLine[px], ParallelMath::MakeUInt15(1));
934 }
935
936 MUInt15 isolatedAverageQuantized[3];
937 MUInt15 hModeIsolatedQuantized[8][3];
938 MUInt15 isolatedAverageTargets[3];
939 {
940 int divisors[ParallelMath::ParallelSize];
941 for (int block = 0; block < ParallelMath::ParallelSize; block++)
942 divisors[block] = ParallelMath::Extract(numPixelsIsolated, block) * 34;
943
944 MUInt15 addend = (numPixelsIsolated << 4) | numPixelsIsolated;
945 for (int ch = 0; ch < 3; ch++)
946 {
947 // isolatedAverageQuantized[ch] = (isolatedTotal[ch] * 2 + numPixelsIsolated * 17) / (numPixelsIsolated * 34);
948
949 MUInt15 numerator = isolatedTotal[ch] + isolatedTotal[ch];
950 if (!isFakeBT709)
951 numerator = numerator + addend;
952
953 MUInt15 hModeIsolatedNumerators[8];
954 for (int table = 0; table < 8; table++)
955 {
956 // FIXME: Handle fake BT.709 correctly
957 MUInt15 offsetTotal = isolatedTotal[ch] + ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]), numPixelsIsolated));
958
959 hModeIsolatedNumerators[table] = (offsetTotal + offsetTotal) + addend;
960 }
961
962 for (int block = 0; block < ParallelMath::ParallelSize; block++)
963 {
964 int divisor = divisors[block];
965 if (divisor == 0)
966 {
967 ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, 0);
968 for (int table = 0; table < 8; table++)
969 ParallelMath::PutUInt15(hModeIsolatedQuantized[table][ch], block, 0);
970 }
971 else
972 {
973 ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, ParallelMath::Extract(numerator, block) / divisor);
974 for (int table = 0; table < 8; table++)
975 ParallelMath::PutUInt15(hModeIsolatedQuantized[table][ch], block, ParallelMath::Extract(hModeIsolatedNumerators[table], block) / divisor);
976 }
977 }
978
979 isolatedAverageTargets[ch] = numerator;
980 }
981 }
982
983 if (isFakeBT709)
984 ResolveTHFakeBT709Rounding(isolatedAverageQuantized, isolatedAverageTargets, numPixelsIsolated);
985
986 for (int table = 0; table < 8; table++)
987 for (int ch = 0; ch < 3; ch++)
988 hModeIsolatedQuantized[table][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), hModeIsolatedQuantized[table][ch]);
989
990 MUInt15 isolatedColor[3];
991 for (int ch = 0; ch < 3; ch++)
992 isolatedColor[ch] = (isolatedAverageQuantized[ch]) | (isolatedAverageQuantized[ch] << 4);
993
994 MFloat isolatedError[16];
995 for (int px = 0; px < 16; px++)
996 {
997 if (isFakeBT709)
998 isolatedError[px] = ComputeErrorFakeBT709(isolatedColor, preWeightedPixels[px]);
999 else if (isUniform)
1000 isolatedError[px] = ComputeErrorUniform(pixels[px], isolatedColor);
1001 else
1002 isolatedError[px] = ComputeErrorWeighted(isolatedColor, preWeightedPixels[px], options);
1003
1004 ParallelMath::ConditionalSet(isolatedError[px], isTransparentF[px], ParallelMath::MakeFloatZero());
1005 }
1006
1007 MSInt32 bestSelectors = ParallelMath::MakeSInt32(0);
1008 MUInt15 bestTable = ParallelMath::MakeUInt15(0);
1009 MUInt15 bestLineColor = ParallelMath::MakeUInt15(0);
1010 MUInt15 bestIsolatedColor = ParallelMath::MakeUInt15(0);
1011 MUInt15 bestHModeColor2 = ParallelMath::MakeUInt15(0);
1012 ParallelMath::Int16CompFlag bestUseHMode = ParallelMath::MakeBoolInt16(false);
1013
1014 MSInt16 maxLine = ParallelMath::LosslessCast<MSInt16>::Cast(numPixelsLine);
1015 MSInt16 minLine = ParallelMath::MakeSInt16(0) - maxLine;
1016
1017 int16_t clusterMaxLine = 0;
1018 for (int block = 0; block < ParallelMath::ParallelSize; block++)
1019 {
1020 int16_t blockMaxLine = ParallelMath::Extract(maxLine, block);
1021 if (blockMaxLine > clusterMaxLine)
1022 clusterMaxLine = blockMaxLine;
1023 }
1024
1025 int16_t clusterMinLine = -clusterMaxLine;
1026
1027 int lineDivisors[ParallelMath::ParallelSize];
1028 for (int block = 0; block < ParallelMath::ParallelSize; block++)
1029 lineDivisors[block] = ParallelMath::Extract(numPixelsLine, block) * 34;
1030
1031 MUInt15 lineAddend = (numPixelsLine << 4) | numPixelsLine;
1032
1033 for (int table = 0; table < 8; table++)
1034 {
1035 int numUniqueColors[ParallelMath::ParallelSize];
1036 MUInt15 uniqueQuantizedColors[31];
1037
1038 for (int block = 0; block < ParallelMath::ParallelSize; block++)
1039 numUniqueColors[block] = 0;
1040
1041 MUInt15 modifier = ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]);
1042 MUInt15 modifierOffset = (modifier + modifier);
1043
1044 for (int16_t offsetPremultiplier = clusterMinLine; offsetPremultiplier <= clusterMaxLine; offsetPremultiplier += 2)
1045 {
1046 MSInt16 clampedOffsetPremultiplier = ParallelMath::Max(minLine, ParallelMath::Min(maxLine, ParallelMath::MakeSInt16(offsetPremultiplier)));
1047 MSInt16 modifierAddend = ParallelMath::CompactMultiply(clampedOffsetPremultiplier, modifierOffset);
1048
1049 MUInt15 quantized[3];
1050 if (isFakeBT709)
1051 {
1052 MUInt15 targets[3];
1053 for (int ch = 0; ch < 3; ch++)
1054 {
1055 //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));
1056 MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch]) + modifierAddend));
1057 MUInt15 divided = ParallelMath::MakeUInt15(0);
1058 for (int block = 0; block < ParallelMath::ParallelSize; block++)
1059 {
1060 int divisor = lineDivisors[block];
1061 if (divisor == 0)
1062 ParallelMath::PutUInt15(divided, block, 0);
1063 else
1064 ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
1065 }
1066 quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
1067 targets[ch] = numerator;
1068 }
1069
1070 ResolveTHFakeBT709Rounding(quantized, targets, numPixelsLine);
1071 }
1072 else
1073 {
1074 for (int ch = 0; ch < 3; ch++)
1075 {
1076 //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + numDAIILine * 17 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));
1077 MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch] + lineAddend) + modifierAddend));
1078 MUInt15 divided = ParallelMath::MakeUInt15(0);
1079 for (int block = 0; block < ParallelMath::ParallelSize; block++)
1080 {
1081 int divisor = lineDivisors[block];
1082 if (divisor == 0)
1083 ParallelMath::PutUInt15(divided, block, 0);
1084 else
1085 ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
1086 }
1087 quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
1088 }
1089 }
1090
1091 MUInt15 packedColor = (quantized[0] << 10) | (quantized[1] << 5) | quantized[2];
1092
1093 for (int block = 0; block < ParallelMath::ParallelSize; block++)
1094 {
1095 uint16_t blockPackedColor = ParallelMath::Extract(packedColor, block);
1096 if (numUniqueColors[block] == 0 || blockPackedColor != ParallelMath::Extract(uniqueQuantizedColors[numUniqueColors[block] - 1], block))
1097 ParallelMath::PutUInt15(uniqueQuantizedColors[numUniqueColors[block]++], block, blockPackedColor);
1098 }
1099 }
1100
1101 // Stripe unfilled unique colors
1102 int maxUniqueColors = 0;
1103 for (int block = 0; block < ParallelMath::ParallelSize; block++)
1104 {
1105 if (numUniqueColors[block] > maxUniqueColors)
1106 maxUniqueColors = numUniqueColors[block];
1107 }
1108
1109 for (int block = 0; block < ParallelMath::ParallelSize; block++)
1110 {
1111 uint16_t fillColor = ParallelMath::Extract(uniqueQuantizedColors[0], block);
1112
1113 int numUnique = numUniqueColors[block];
1114 for (int fill = numUnique + 1; fill < maxUniqueColors; fill++)
1115 ParallelMath::PutUInt15(uniqueQuantizedColors[fill], block, fillColor);
1116 }
1117
1118 MFloat hModeErrors[16];
1119 MUInt15 hModeUnquantizedColor[3];
1120 for (int ch = 0; ch < 3; ch++)
1121 {
1122 MUInt15 quantizedChannel = hModeIsolatedQuantized[table][ch];
1123
1124 MUInt15 unquantizedCh = (quantizedChannel << 4) | quantizedChannel;
1125 hModeUnquantizedColor[ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedCh) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
1126 }
1127
1128 for (int px = 0; px < 16; px++)
1129 {
1130 hModeErrors[px] = isUniform ? ComputeErrorUniform(hModeUnquantizedColor, pixels[px]) : ComputeErrorWeighted(hModeUnquantizedColor, preWeightedPixels[px], options);
1131 ParallelMath::ConditionalSet(hModeErrors[px], isTransparentF[px], ParallelMath::MakeFloatZero());
1132 }
1133
1134 MUInt15 packedHModeColor2 = (hModeIsolatedQuantized[table][0] << 10) | (hModeIsolatedQuantized[table][1] << 5) | hModeIsolatedQuantized[table][2];
1135 ParallelMath::Int16CompFlag tableLowBitIsZero = ((table & 1) == 0) ? ParallelMath::MakeBoolInt16(true) : ParallelMath::MakeBoolInt16(false);
1136
1137 for (int ci = 0; ci < maxUniqueColors; ci++)
1138 {
1139 MUInt15 lineColors[2][3];
1140 for (int ch = 0; ch < 3; ch++)
1141 {
1142 MUInt15 quantizedChannel = (ParallelMath::RightShift(uniqueQuantizedColors[ci], 10 - (ch * 5)) & ParallelMath::MakeUInt15(15));
1143
1144 MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;
1145 lineColors[0][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantizedColor + modifier);
1146 lineColors[1][ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
1147 }
1148
1149 MUInt15 bestLineSelector[16];
1150 MFloat bestLineError[16];
1151 for (int px = 0; px < 16; px++)
1152 {
1153 MFloat lineErrors[2];
1154 for (int i = 0; i < 2; i++)
1155 lineErrors[i] = isUniform ? ComputeErrorUniform(lineColors[i], pixels[px]) : ComputeErrorWeighted(lineColors[i], preWeightedPixels[px], options);
1156
1157 ParallelMath::Int16CompFlag firstIsBetter = ParallelMath::FloatFlagToInt16(ParallelMath::LessOrEqual(lineErrors[0], lineErrors[1]));
1158 bestLineSelector[px] = ParallelMath::Select(firstIsBetter, ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(3));
1159 bestLineError[px] = ParallelMath::Min(lineErrors[0], lineErrors[1]);
1160
1161 ParallelMath::ConditionalSet(bestLineError[px], isTransparentF[px], ParallelMath::MakeFloatZero());
1162 }
1163
1164 // One case considered here was if it was possible to force H mode to be valid when the line color is unused.
1165 // That case isn't actually useful because it's equivalent to the isolated color being unused at maximum offset,
1166 // which is always checked after a swap.
1167 MFloat tModeError = ParallelMath::MakeFloatZero();
1168 MFloat hModeError = ParallelMath::MakeFloatZero();
1169 for (int px = 0; px < 16; px++)
1170 {
1171 tModeError = tModeError + ParallelMath::Min(bestLineError[px], isolatedError[px]);
1172 hModeError = hModeError + ParallelMath::Min(bestLineError[px], hModeErrors[px]);
1173 }
1174
1175 ParallelMath::FloatCompFlag hLessError = ParallelMath::Less(hModeError, tModeError);
1176
1177 MUInt15 packedHModeColor1 = uniqueQuantizedColors[ci];
1178
1179 ParallelMath::Int16CompFlag hModeTableLowBitMustBeZero = ParallelMath::Less(packedHModeColor1, packedHModeColor2);
1180
1181 ParallelMath::Int16CompFlag hModeIsLegal = ParallelMath::Equal(hModeTableLowBitMustBeZero, tableLowBitIsZero);
1182 ParallelMath::Int16CompFlag useHMode = ParallelMath::FloatFlagToInt16(hLessError) & hModeIsLegal;
1183
1184 MFloat roundBestError = tModeError;
1185 ParallelMath::ConditionalSet(roundBestError, ParallelMath::Int16FlagToFloat(useHMode), hModeError);
1186
1187 ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(roundBestError, bestError));
1188 ParallelMath::FloatCompFlag useHModeF = ParallelMath::Int16FlagToFloat(useHMode);
1189
1190 if (ParallelMath::AnySet(errorBetter))
1191 {
1192 MSInt32 selectors = ParallelMath::MakeSInt32(0);
1193 for (int px = 0; px < 16; px++)
1194 {
1195 MUInt15 selector = bestLineSelector[px];
1196
1197 MFloat isolatedPixelError = ParallelMath::Select(useHModeF, hModeErrors[px], isolatedError[px]);
1198 ParallelMath::Int16CompFlag isolatedBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(isolatedPixelError, bestLineError[px]));
1199
1200 ParallelMath::ConditionalSet(selector, isolatedBetter, ParallelMath::MakeUInt15(0));
1201 ParallelMath::ConditionalSet(selector, isTransparent[px], ParallelMath::MakeUInt15(2));
1202 selectors = selectors | (ParallelMath::ToInt32(selector) << (px * 2));
1203 }
1204
1205 bestError = ParallelMath::Min(bestError, roundBestError);
1206 ParallelMath::ConditionalSet(bestLineColor, errorBetter, uniqueQuantizedColors[ci]);
1207 ParallelMath::ConditionalSet(bestSelectors, errorBetter, selectors);
1208 ParallelMath::ConditionalSet(bestTable, errorBetter, ParallelMath::MakeUInt15(table));
1209 ParallelMath::ConditionalSet(bestIsHMode, errorBetter, useHMode);
1210 ParallelMath::ConditionalSet(bestHModeColor2, errorBetter, packedHModeColor2);
1211
1212 bestIsThisMode = bestIsThisMode | errorBetter;
1213 }
1214 }
1215 }
1216
1217 for (int block = 0; block < ParallelMath::ParallelSize; block++)
1218 {
1219 if (ParallelMath::Extract(bestIsThisMode, block))
1220 {
1221 uint32_t lowBits = 0;
1222 uint32_t highBits = 0;
1223
1224 uint16_t blockBestLineColor = ParallelMath::Extract(bestLineColor, block);
1225 ParallelMath::ScalarUInt16 blockIsolatedAverageQuantized[3];
1226
1227 for (int ch = 0; ch < 3; ch++)
1228 blockIsolatedAverageQuantized[ch] = ParallelMath::Extract(isolatedAverageQuantized[ch], block);
1229
1230 uint16_t blockBestTable = ParallelMath::Extract(bestTable, block);
1231 int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);
1232
1233 ParallelMath::ScalarUInt16 lineColor[3];
1234 for (int ch = 0; ch < 3; ch++)
1235 lineColor[ch] = (blockBestLineColor >> (10 - (ch * 5))) & 15;
1236
1237 if (ParallelMath::Extract(bestIsHMode, block))
1238 {
1239 // T mode: C1, C2+M, Transparent, C2-M
1240 // H mode: C1+M, C1-M, Transparent, C2-M
1241 static const ParallelMath::ScalarUInt16 selectorRemapSector[4] = { 1, 0, 1, 0 };
1242 static const ParallelMath::ScalarUInt16 selectorRemapSign[4] = { 1, 0, 0, 1 };
1243
1244 // Remap selectors
1245 ParallelMath::ScalarUInt16 signBits = 0;
1246 ParallelMath::ScalarUInt16 sectorBits = 0;
1247 int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);
1248 for (int px = 0; px < 16; px++)
1249 {
1250 int32_t selector = (blockBestSelectors >> (px * 2)) & 3;
1251 sectorBits |= (selectorRemapSector[selector] << px);
1252 signBits |= (selectorRemapSign[selector] << px);
1253 }
1254
1255 ParallelMath::ScalarUInt16 blockColors[2] = { blockBestLineColor, ParallelMath::Extract(bestHModeColor2, block) };
1256
1257 EmitHModeBlock(outputBuffer + block * 8, blockColors, sectorBits, signBits, blockBestTable, false);
1258 }
1259 else
1260 EmitTModeBlock(outputBuffer + block * 8, lineColor, blockIsolatedAverageQuantized, blockBestSelectors, blockBestTable, false);
1261 }
1262 }
1263}
1264
1265
1266cvtt::ParallelMath::UInt15 cvtt::Internal::ETCComputer::DecodePlanarCoeff(const MUInt15 &coeff, int ch)
1267{
1268 if (ch == 1)
1269 return (coeff << 1) | (ParallelMath::RightShift(coeff, 6));
1270 else
1271 return (coeff << 2) | (ParallelMath::RightShift(coeff, 4));
1272}
1273
1274void cvtt::Internal::ETCComputer::EncodePlanar(uint8_t *outputBuffer, MFloat &bestError, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const Options &options)
1275{
1276 // NOTE: If it's desired to do this in another color space, the best way to do it would probably be
1277 // to do everything in that color space and then transform it back to RGB.
1278
1279 // We compute H = (H-O)/4 and V= (V-O)/4 to simplify the math
1280
1281 // error = (x*H + y*V + O - C)^2
1282 MFloat h[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
1283 MFloat v[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
1284 MFloat o[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
1285
1286 bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
1287 bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
1288
1289 MFloat totalError = ParallelMath::MakeFloatZero();
1290 MUInt15 bestCoeffs[3][3]; // [Channel][Coeff]
1291 for (int ch = 0; ch < 3; ch++)
1292 {
1293 float fhh = 0.f;
1294 float fho = 0.f;
1295 float fhv = 0.f;
1296 float foo = 0.f;
1297 float fov = 0.f;
1298 float fvv = 0.f;
1299 MFloat fc = ParallelMath::MakeFloatZero();
1300 MFloat fh = ParallelMath::MakeFloatZero();
1301 MFloat fv = ParallelMath::MakeFloatZero();
1302 MFloat fo = ParallelMath::MakeFloatZero();
1303
1304 float &foh = fho;
1305 float &fvh = fhv;
1306 float &fvo = fov;
1307
1308 for (int px = 0; px < 16; px++)
1309 {
1310 float x = static_cast<float>(px % 4);
1311 float y = static_cast<float>(px / 4);
1312 MFloat c = isFakeBT709 ? preWeightedPixels[px][ch] : ParallelMath::ToFloat(pixels[px][ch]);
1313
1314 // (x*H + y*V + O - C)^2
1315 fhh += x * x;
1316 fhv += x * y;
1317 fho += x;
1318 fh = fh - c * x;
1319
1320 fvh += y * x;
1321 fvv += y * y;
1322 fvo += y;
1323 fv = fv - c * y;
1324
1325 foh += x;
1326 fov += y;
1327 foo += 1;
1328 fo = fo - c;
1329
1330 fh = fh - c * x;
1331 fv = fv - c * y;
1332 fo = fo - c;
1333 fc = fc + c * c;
1334 }
1335
1336 //float totalError = fhh * h * h + fho * h*o + fhv * h*v + foo * o * o + fov * o*v + fvv * v * v + fh * h + fv * v + fo * o + fc;
1337
1338 // error = fhh*h^2 + fho*h*o + fhv*h*v + foo*o^2 + fov*o*v + fvv*v^2 + fh*h + fv*v + fo*o + fc
1339 // derror/dh = 2*fhh*h + fho*o + fhv*v + fh
1340 // derror/dv = fhv*h + fov*o + 2*fvv*v + fv
1341 // derror/do = fho*h + 2*foo*o + fov*v + fo
1342
1343 // Solve system of equations
1344 // h o v 1 = 0
1345 // -------
1346 // d e f g R0
1347 // i j k l R1
1348 // m n p q R2
1349
1350 float d = 2.0f * fhh;
1351 float e = fho;
1352 float f = fhv;
1353 MFloat gD = fh;
1354
1355 float i = fhv;
1356 float j = fov;
1357 float k = 2.0f * fvv;
1358 MFloat lD = fv;
1359
1360 float m = fho;
1361 float n = 2.0f * foo;
1362 float p = fov;
1363 MFloat qD = fo;
1364
1365 {
1366 // Factor out first column from R1 and R2
1367 float r0to1 = -i / d;
1368 float r0to2 = -m / d;
1369
1370 // 0 j1 k1 l1D
1371 float j1 = j + r0to1 * e;
1372 float k1 = k + r0to1 * f;
1373 MFloat l1D = lD + gD * r0to1;
1374
1375 // 0 n1 p1 q1D
1376 float n1 = n + r0to2 * e;
1377 float p1 = p + r0to2 * f;
1378 MFloat q1D = qD + gD * r0to2;
1379
1380 // Factor out third column from R2
1381 float r1to2 = -p1 / k1;
1382
1383 // 0 n2 0 q2D
1384 float n2 = n1 + r1to2 * j1;
1385 MFloat q2D = q1D + l1D * r1to2;
1386
1387 o[ch] = -q2D / n2;
1388
1389 // Factor out second column from R1
1390 // 0 n2 0 q2D
1391
1392 float r2to1 = -j1 / n2;
1393
1394 // 0 0 k1 l2D
1395 // 0 n2 0 q2D
1396 MFloat l2D = l1D + q2D * r2to1;
1397
1398 float elim2 = -f / k1;
1399 float elim1 = -e / n2;
1400
1401 // d 0 0 g2D
1402 MFloat g2D = gD + l2D * elim2 + q2D * elim1;
1403
1404 // n2*o + q2 = 0
1405 // o = -q2 / n2
1406 h[ch] = -g2D / d;
1407 v[ch] = -l2D / k1;
1408 }
1409
1410 // Undo the local transformation
1411 h[ch] = h[ch] * 4.0f + o[ch];
1412 v[ch] = v[ch] * 4.0f + o[ch];
1413 }
1414
1415 if (isFakeBT709)
1416 {
1417 MFloat oRGB[3];
1418 MFloat hRGB[3];
1419 MFloat vRGB[3];
1420
1421 ConvertFromFakeBT709(oRGB, o);
1422 ConvertFromFakeBT709(hRGB, h);
1423 ConvertFromFakeBT709(vRGB, v);
1424
1425 // Twiddling in fake BT.607 is a mess, just round off for now (the precision is pretty good anyway)
1426 {
1427 ParallelMath::RoundTowardNearestForScope rtn;
1428
1429 for (int ch = 0; ch < 3; ch++)
1430 {
1431 MFloat fcoeffs[3] = { oRGB[ch], hRGB[ch], vRGB[ch] };
1432
1433 for (int c = 0; c < 3; c++)
1434 {
1435 MFloat coeff = ParallelMath::Max(ParallelMath::MakeFloatZero(), fcoeffs[c]);
1436 if (ch == 1)
1437 coeff = ParallelMath::Min(ParallelMath::MakeFloat(127.0f), coeff * (127.0f / 255.0f));
1438 else
1439 coeff = ParallelMath::Min(ParallelMath::MakeFloat(63.0f), coeff * (63.0f / 255.0f));
1440 fcoeffs[c] = coeff;
1441 }
1442
1443 for (int c = 0; c < 3; c++)
1444 bestCoeffs[ch][c] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &rtn);
1445 }
1446 }
1447
1448 MUInt15 reconstructed[16][3];
1449 for (int ch = 0; ch < 3; ch++)
1450 {
1451 MUInt15 dO = DecodePlanarCoeff(bestCoeffs[ch][0], ch);
1452 MUInt15 dH = DecodePlanarCoeff(bestCoeffs[ch][1], ch);
1453 MUInt15 dV = DecodePlanarCoeff(bestCoeffs[ch][2], ch);
1454
1455 MSInt16 hMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dH) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
1456 MSInt16 vMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dV) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
1457
1458 MFloat error = ParallelMath::MakeFloatZero();
1459
1460 MSInt16 addend = ParallelMath::LosslessCast<MSInt16>::Cast(dO << 2) + 2;
1461
1462 for (int px = 0; px < 16; px++)
1463 {
1464 MUInt15 pxv = ParallelMath::MakeUInt15(px);
1465 MSInt16 x = ParallelMath::LosslessCast<MSInt16>::Cast(pxv & ParallelMath::MakeUInt15(3));
1466 MSInt16 y = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RightShift(pxv, 2));
1467
1468 MSInt16 interpolated = ParallelMath::RightShift(ParallelMath::CompactMultiply(x, hMinusO) + ParallelMath::CompactMultiply(y, vMinusO) + addend, 2);
1469 MUInt15 clampedLow = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), interpolated));
1470 reconstructed[px][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), clampedLow);
1471 }
1472 }
1473
1474 totalError = ParallelMath::MakeFloatZero();
1475 for (int px = 0; px < 16; px++)
1476 totalError = totalError + ComputeErrorFakeBT709(reconstructed[px], preWeightedPixels[px]);
1477 }
1478 else
1479 {
1480 for (int ch = 0; ch < 3; ch++)
1481 {
1482 MFloat fcoeffs[3] = { o[ch], h[ch], v[ch] };
1483 MUInt15 coeffRanges[3][2];
1484
1485 for (int c = 0; c < 3; c++)
1486 {
1487 MFloat coeff = ParallelMath::Max(ParallelMath::MakeFloatZero(), fcoeffs[c]);
1488 if (ch == 1)
1489 coeff = ParallelMath::Min(ParallelMath::MakeFloat(127.0f), coeff * (127.0f / 255.0f));
1490 else
1491 coeff = ParallelMath::Min(ParallelMath::MakeFloat(63.0f), coeff * (63.0f / 255.0f));
1492 fcoeffs[c] = coeff;
1493 }
1494
1495 {
1496 ParallelMath::RoundDownForScope rd;
1497 for (int c = 0; c < 3; c++)
1498 coeffRanges[c][0] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &rd);
1499 }
1500
1501 {
1502 ParallelMath::RoundUpForScope ru;
1503 for (int c = 0; c < 3; c++)
1504 coeffRanges[c][1] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &ru);
1505 }
1506
1507 MFloat bestChannelError = ParallelMath::MakeFloat(FLT_MAX);
1508 for (int io = 0; io < 2; io++)
1509 {
1510 MUInt15 dO = DecodePlanarCoeff(coeffRanges[0][io], ch);
1511
1512 for (int ih = 0; ih < 2; ih++)
1513 {
1514 MUInt15 dH = DecodePlanarCoeff(coeffRanges[1][ih], ch);
1515 MSInt16 hMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dH) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
1516
1517 for (int iv = 0; iv < 2; iv++)
1518 {
1519 MUInt15 dV = DecodePlanarCoeff(coeffRanges[2][iv], ch);
1520 MSInt16 vMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dV) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
1521
1522 MFloat error = ParallelMath::MakeFloatZero();
1523
1524 MSInt16 addend = ParallelMath::LosslessCast<MSInt16>::Cast(dO << 2) + 2;
1525
1526 for (int px = 0; px < 16; px++)
1527 {
1528 MUInt15 pxv = ParallelMath::MakeUInt15(px);
1529 MSInt16 x = ParallelMath::LosslessCast<MSInt16>::Cast(pxv & ParallelMath::MakeUInt15(3));
1530 MSInt16 y = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RightShift(pxv, 2));
1531
1532 MSInt16 interpolated = ParallelMath::RightShift(ParallelMath::CompactMultiply(x, hMinusO) + ParallelMath::CompactMultiply(y, vMinusO) + addend, 2);
1533 MUInt15 clampedLow = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), interpolated));
1534 MUInt15 dec = ParallelMath::Min(ParallelMath::MakeUInt15(255), clampedLow);
1535
1536 MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(dec);
1537
1538 MFloat deltaF = ParallelMath::ToFloat(delta);
1539 error = error + deltaF * deltaF;
1540 }
1541
1542 ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestChannelError));
1543 if (ParallelMath::AnySet(errorBetter))
1544 {
1545 bestChannelError = ParallelMath::Min(error, bestChannelError);
1546 ParallelMath::ConditionalSet(bestCoeffs[ch][0], errorBetter, coeffRanges[0][io]);
1547 ParallelMath::ConditionalSet(bestCoeffs[ch][1], errorBetter, coeffRanges[1][ih]);
1548 ParallelMath::ConditionalSet(bestCoeffs[ch][2], errorBetter, coeffRanges[2][iv]);
1549 }
1550 }
1551 }
1552 }
1553
1554 if (!isUniform)
1555 {
1556 switch (ch)
1557 {
1558 case 0:
1559 bestChannelError = bestChannelError * (options.redWeight * options.redWeight);
1560 break;
1561 case 1:
1562 bestChannelError = bestChannelError * (options.greenWeight * options.greenWeight);
1563 break;
1564 case 2:
1565 bestChannelError = bestChannelError * (options.blueWeight * options.blueWeight);
1566 break;
1567 default:
1568 break;
1569 }
1570 }
1571
1572 totalError = totalError + bestChannelError;
1573 }
1574 }
1575
1576 ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(totalError, bestError));
1577 if (ParallelMath::AnySet(errorBetter))
1578 {
1579 bestError = ParallelMath::Min(bestError, totalError);
1580
1581 for (int block = 0; block < ParallelMath::ParallelSize; block++)
1582 {
1583 if (!ParallelMath::Extract(errorBetter, block))
1584 continue;
1585
1586 int ro = ParallelMath::Extract(bestCoeffs[0][0], block);
1587 int rh = ParallelMath::Extract(bestCoeffs[0][1], block);
1588 int rv = ParallelMath::Extract(bestCoeffs[0][2], block);
1589
1590 int go = ParallelMath::Extract(bestCoeffs[1][0], block);
1591 int gh = ParallelMath::Extract(bestCoeffs[1][1], block);
1592 int gv = ParallelMath::Extract(bestCoeffs[1][2], block);
1593
1594 int bo = ParallelMath::Extract(bestCoeffs[2][0], block);
1595 int bh = ParallelMath::Extract(bestCoeffs[2][1], block);
1596 int bv = ParallelMath::Extract(bestCoeffs[2][2], block);
1597
1598 int go1 = go >> 6;
1599 int go2 = go & 63;
1600
1601 int bo1 = bo >> 5;
1602 int bo2 = (bo >> 3) & 3;
1603 int bo3 = bo & 7;
1604
1605 int rh1 = (rh >> 1);
1606 int rh2 = rh & 1;
1607
1608 int fakeR = ro >> 2;
1609 int fakeDR = go1 | ((ro & 3) << 1);
1610
1611 int fakeG = (go2 >> 2);
1612 int fakeDG = ((go2 & 3) << 1) | bo1;
1613
1614 int fakeB = bo2;
1615 int fakeDB = bo3 >> 1;
1616
1617 uint32_t highBits = 0;
1618 uint32_t lowBits = 0;
1619
1620 // Avoid overflowing R
1621 if ((fakeDR & 4) != 0 && fakeR + fakeDR < 8)
1622 highBits |= 1 << (63 - 32);
1623
1624 // Avoid overflowing G
1625 if ((fakeDG & 4) != 0 && fakeG + fakeDG < 8)
1626 highBits |= 1 << (55 - 32);
1627
1628 // Overflow B
1629 if (fakeB + fakeDB < 4)
1630 {
1631 // Overflow low
1632 highBits |= 1 << (42 - 32);
1633 }
1634 else
1635 {
1636 // Overflow high
1637 highBits |= 7 << (45 - 32);
1638 }
1639
1640 highBits |= ro << (57 - 32);
1641 highBits |= go1 << (56 - 32);
1642 highBits |= go2 << (49 - 32);
1643 highBits |= bo1 << (48 - 32);
1644 highBits |= bo2 << (43 - 32);
1645 highBits |= bo3 << (39 - 32);
1646 highBits |= rh1 << (34 - 32);
1647 highBits |= 1 << (33 - 32);
1648 highBits |= rh2 << (32 - 32);
1649
1650 lowBits |= gh << 25;
1651 lowBits |= bh << 19;
1652 lowBits |= rv << 13;
1653 lowBits |= gv << 6;
1654 lowBits |= bv << 0;
1655
1656 for (int i = 0; i < 4; i++)
1657 outputBuffer[block * 8 + i] = (highBits >> (24 - i * 8)) & 0xff;
1658 for (int i = 0; i < 4; i++)
1659 outputBuffer[block * 8 + i + 4] = (lowBits >> (24 - i * 8)) & 0xff;
1660 }
1661 }
1662}
1663
1664void cvtt::Internal::ETCComputer::CompressETC2Block(uint8_t *outputBuffer, const PixelBlockU8 *pixelBlocks, ETC2CompressionData *compressionData, const Options &options, bool punchthroughAlpha)
1665{
1666 ParallelMath::Int16CompFlag pixelIsTransparent[16];
1667 ParallelMath::Int16CompFlag anyTransparent = ParallelMath::MakeBoolInt16(false);
1668 ParallelMath::Int16CompFlag allTransparent = ParallelMath::MakeBoolInt16(true);
1669
1670 if (punchthroughAlpha)
1671 {
1672 const float fThreshold = std::max<float>(std::min<float>(1.0f, options.threshold), 0.0f) * 255.0f;
1673
1674 // +1.0f is intentional, we want to take the next valid integer (even if it's 256) since everything else lower is transparent
1675 MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(std::floor(fThreshold + 1.0f)));
1676
1677 for (int px = 0; px < 16; px++)
1678 {
1679 MUInt15 alpha;
1680 for (int block = 0; block < ParallelMath::ParallelSize; block++)
1681 ParallelMath::PutUInt15(alpha, block, pixelBlocks[block].m_pixels[px][3]);
1682
1683 ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(alpha, threshold);
1684 anyTransparent = (anyTransparent | isTransparent);
1685 allTransparent = (allTransparent & isTransparent);
1686 pixelIsTransparent[px] = isTransparent;
1687 }
1688 }
1689 else
1690 {
1691 for (int px = 0; px < 16; px++)
1692 pixelIsTransparent[px] = ParallelMath::MakeBoolInt16(false);
1693
1694 allTransparent = anyTransparent = ParallelMath::MakeBoolInt16(false);
1695 }
1696
1697 MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
1698
1699 ETC2CompressionDataInternal* internalData = static_cast<ETC2CompressionDataInternal*>(compressionData);
1700
1701 MUInt15 pixels[16][3];
1702 MFloat preWeightedPixels[16][3];
1703 ExtractBlocks(pixels, preWeightedPixels, pixelBlocks, options);
1704
1705 if (ParallelMath::AnySet(anyTransparent))
1706 {
1707 for (int px = 0; px < 16; px++)
1708 {
1709 ParallelMath::Int16CompFlag flag = pixelIsTransparent[px];
1710 ParallelMath::FloatCompFlag fflag = ParallelMath::Int16FlagToFloat(flag);
1711
1712 for (int ch = 0; ch < 3; ch++)
1713 {
1714 ParallelMath::ConditionalSet(pixels[px][ch], flag, ParallelMath::MakeUInt15(0));
1715 ParallelMath::ConditionalSet(preWeightedPixels[px][ch], fflag, ParallelMath::MakeFloat(0.0f));
1716 }
1717 }
1718 }
1719
1720 if (!ParallelMath::AllSet(allTransparent))
1721 EncodePlanar(outputBuffer, bestError, pixels, preWeightedPixels, options);
1722
1723 MFloat chromaDelta[16][2];
1724
1725 MUInt15 numOpaque = ParallelMath::MakeUInt15(16);
1726 for (int px = 0; px < 16; px++)
1727 numOpaque = numOpaque - ParallelMath::SelectOrZero(pixelIsTransparent[px], ParallelMath::MakeUInt15(1));
1728
1729 if (options.flags & cvtt::Flags::Uniform)
1730 {
1731 MSInt16 chromaCoordinates3[16][2];
1732 for (int px = 0; px < 16; px++)
1733 {
1734 chromaCoordinates3[px][0] = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][0]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][2]);
1735 chromaCoordinates3[px][1] = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][0]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][1] << 1) + ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][2]);
1736 }
1737
1738 MSInt16 chromaCoordinateCentroid[2] = { ParallelMath::MakeSInt16(0), ParallelMath::MakeSInt16(0) };
1739 for (int px = 0; px < 16; px++)
1740 {
1741 for (int ch = 0; ch < 2; ch++)
1742 chromaCoordinateCentroid[ch] = chromaCoordinateCentroid[ch] + chromaCoordinates3[px][ch];
1743 }
1744
1745 if (punchthroughAlpha)
1746 {
1747 for (int px = 0; px < 16; px++)
1748 {
1749 for (int ch = 0; ch < 2; ch++)
1750 {
1751 MUInt15 chromaCoordinateMultiplied = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(chromaCoordinates3[px][ch], numOpaque));
1752 MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(chromaCoordinateMultiplied) - chromaCoordinateCentroid[ch];
1753 chromaDelta[px][ch] = ParallelMath::ToFloat(delta);
1754 }
1755 }
1756 }
1757 else
1758 {
1759 for (int px = 0; px < 16; px++)
1760 {
1761 for (int ch = 0; ch < 2; ch++)
1762 chromaDelta[px][ch] = ParallelMath::ToFloat((chromaCoordinates3[px][ch] << 4) - chromaCoordinateCentroid[ch]);
1763 }
1764 }
1765
1766 const MFloat rcpSqrt3 = ParallelMath::MakeFloat(0.57735026918962576450914878050196f);
1767
1768 for (int px = 0; px < 16; px++)
1769 chromaDelta[px][1] = chromaDelta[px][1] * rcpSqrt3;
1770 }
1771 else
1772 {
1773 const float chromaAxis0[3] = { internalData->m_chromaSideAxis0[0], internalData->m_chromaSideAxis0[1], internalData->m_chromaSideAxis0[2] };
1774 const float chromaAxis1[3] = { internalData->m_chromaSideAxis1[0], internalData->m_chromaSideAxis1[1], internalData->m_chromaSideAxis1[2] };
1775
1776 MFloat chromaCoordinates3[16][2];
1777 for (int px = 0; px < 16; px++)
1778 {
1779 const MFloat &px0 = preWeightedPixels[px][0];
1780 const MFloat &px1 = preWeightedPixels[px][1];
1781 const MFloat &px2 = preWeightedPixels[px][2];
1782
1783 chromaCoordinates3[px][0] = px0 * chromaAxis0[0] + px1 * chromaAxis0[1] + px2 * chromaAxis0[2];
1784 chromaCoordinates3[px][1] = px0 * chromaAxis1[0] + px1 * chromaAxis1[1] + px2 * chromaAxis1[2];
1785 }
1786
1787 MFloat chromaCoordinateCentroid[2] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
1788 for (int px = 0; px < 16; px++)
1789 {
1790 for (int ch = 0; ch < 2; ch++)
1791 chromaCoordinateCentroid[ch] = chromaCoordinateCentroid[ch] + chromaCoordinates3[px][ch];
1792 }
1793
1794 if (punchthroughAlpha)
1795 {
1796 const MFloat numOpaqueF = ParallelMath::ToFloat(numOpaque);
1797 for (int px = 0; px < 16; px++)
1798 {
1799 for (int ch = 0; ch < 2; ch++)
1800 {
1801 MFloat chromaCoordinateMultiplied = chromaCoordinates3[px][ch] * numOpaqueF;
1802 MFloat delta = chromaCoordinateMultiplied - chromaCoordinateCentroid[ch];
1803 chromaDelta[px][ch] = delta;
1804 }
1805 }
1806 }
1807 else
1808 {
1809 for (int px = 0; px < 16; px++)
1810 {
1811 for (int ch = 0; ch < 2; ch++)
1812 chromaDelta[px][ch] = chromaCoordinates3[px][ch] * 16.0f - chromaCoordinateCentroid[ch];
1813 }
1814 }
1815 }
1816
1817
1818 MFloat covXX = ParallelMath::MakeFloatZero();
1819 MFloat covYY = ParallelMath::MakeFloatZero();
1820 MFloat covXY = ParallelMath::MakeFloatZero();
1821
1822 for (int px = 0; px < 16; px++)
1823 {
1824 MFloat nx = chromaDelta[px][0];
1825 MFloat ny = chromaDelta[px][1];
1826
1827 covXX = covXX + nx * nx;
1828 covYY = covYY + ny * ny;
1829 covXY = covXY + nx * ny;
1830 }
1831
1832 MFloat halfTrace = (covXX + covYY) * 0.5f;
1833 MFloat det = covXX * covYY - covXY * covXY;
1834
1835 MFloat mm = ParallelMath::Sqrt(ParallelMath::Max(ParallelMath::MakeFloatZero(), halfTrace * halfTrace - det));
1836
1837 MFloat ev = halfTrace + mm;
1838
1839 MFloat dx = (covYY - ev + covXY);
1840 MFloat dy = -(covXX - ev + covXY);
1841
1842 // If evenly distributed, pick an arbitrary plane
1843 ParallelMath::FloatCompFlag allZero = ParallelMath::Equal(dx, ParallelMath::MakeFloatZero()) & ParallelMath::Equal(dy, ParallelMath::MakeFloatZero());
1844 ParallelMath::ConditionalSet(dx, allZero, ParallelMath::MakeFloat(1.f));
1845
1846 ParallelMath::Int16CompFlag sectorAssignments[16];
1847 for (int px = 0; px < 16; px++)
1848 sectorAssignments[px] = ParallelMath::FloatFlagToInt16(ParallelMath::Less(chromaDelta[px][0] * dx + chromaDelta[px][1] * dy, ParallelMath::MakeFloatZero()));
1849
1850 if (!ParallelMath::AllSet(allTransparent))
1851 {
1852 EncodeTMode(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, options);
1853
1854 // Flip sector assignments
1855 for (int px = 0; px < 16; px++)
1856 sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
1857
1858 EncodeTMode(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, options);
1859
1860 EncodeHMode(outputBuffer, bestError, sectorAssignments, pixels, internalData->m_h, preWeightedPixels, options);
1861
1862 CompressETC1BlockInternal(bestError, outputBuffer, pixels, preWeightedPixels, internalData->m_drs, options, true);
1863 }
1864
1865 if (ParallelMath::AnySet(anyTransparent))
1866 {
1867 if (!ParallelMath::AllSet(allTransparent))
1868 {
1869 // Flip sector assignments
1870 for (int px = 0; px < 16; px++)
1871 sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
1872 }
1873
1874 // Reset the error of any transparent blocks to max and retry with punchthrough modes
1875 ParallelMath::ConditionalSet(bestError, ParallelMath::Int16FlagToFloat(anyTransparent), ParallelMath::MakeFloat(FLT_MAX));
1876
1877 EncodeVirtualTModePunchthrough(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, pixelIsTransparent, anyTransparent, allTransparent, options);
1878
1879 // Flip sector assignments
1880 for (int px = 0; px < 16; px++)
1881 sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
1882
1883 EncodeVirtualTModePunchthrough(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, pixelIsTransparent, anyTransparent, allTransparent, options);
1884
1885 CompressETC1PunchthroughBlockInternal(bestError, outputBuffer, pixels, preWeightedPixels, pixelIsTransparent, static_cast<ETC2CompressionDataInternal*>(compressionData)->m_drs, options);
1886 }
1887}
1888
1889void cvtt::Internal::ETCComputer::CompressETC2AlphaBlock(uint8_t *outputBuffer, const PixelBlockU8 *pixelBlocks, const Options &options)
1890{
1891 MUInt15 pixels[16];
1892
1893 for (int px = 0; px < 16; px++)
1894 {
1895 for (int block = 0; block < ParallelMath::ParallelSize; block++)
1896 ParallelMath::PutUInt15(pixels[px], block, pixelBlocks[block].m_pixels[px][3]);
1897 }
1898
1899 CompressETC2AlphaBlockInternal(outputBuffer, pixels, false, false, options);
1900}
1901
1902void cvtt::Internal::ETCComputer::CompressETC2AlphaBlockInternal(uint8_t *outputBuffer, const MUInt15 pixels[16], bool is11Bit, bool isSigned, const Options &options)
1903{
1904 MUInt15 minAlpha = ParallelMath::MakeUInt15(is11Bit ? 2047 : 255);
1905 MUInt15 maxAlpha = ParallelMath::MakeUInt15(0);
1906
1907 for (int px = 0; px < 16; px++)
1908 {
1909 minAlpha = ParallelMath::Min(minAlpha, pixels[px]);
1910 maxAlpha = ParallelMath::Max(maxAlpha, pixels[px]);
1911 }
1912
1913 MUInt15 alphaSpan = maxAlpha - minAlpha;
1914 MUInt15 alphaSpanMidpointTimes2 = maxAlpha + minAlpha;
1915
1916 MUInt31 bestTotalError = ParallelMath::MakeUInt31(0x7fffffff);
1917 MUInt15 bestTableIndex = ParallelMath::MakeUInt15(0);
1918 MUInt15 bestBaseCodeword = ParallelMath::MakeUInt15(0);
1919 MUInt15 bestMultiplier = ParallelMath::MakeUInt15(0);
1920 MUInt15 bestIndexes[16];
1921
1922 for (int px = 0; px < 16; px++)
1923 bestIndexes[px] = ParallelMath::MakeUInt15(0);
1924
1925 const int numAlphaRanges = 10;
1926 for (uint16_t tableIndex = 0; tableIndex < 16; tableIndex++)
1927 {
1928 for (int r = 0; r < numAlphaRanges; r++)
1929 {
1930 int subrange = r % 3;
1931 int mainRange = r / 3;
1932
1933 int16_t maxOffset = Tables::ETC2::g_alphaModifierTablePositive[tableIndex][3 - mainRange - (subrange & 1)];
1934 int16_t minOffset = -Tables::ETC2::g_alphaModifierTablePositive[tableIndex][3 - mainRange - ((subrange >> 1) & 1)] - 1;
1935 uint16_t offsetSpan = static_cast<uint16_t>(maxOffset - minOffset);
1936
1937 MSInt16 vminOffset = ParallelMath::MakeSInt16(minOffset);
1938 MUInt15 vmaxOffset = ParallelMath::MakeUInt15(maxOffset);
1939 MUInt15 voffsetSpan = ParallelMath::MakeUInt15(offsetSpan);
1940
1941 MUInt15 minMultiplier = ParallelMath::MakeUInt15(0);
1942 for (int block = 0; block < ParallelMath::ParallelSize; block++)
1943 {
1944 uint16_t singleAlphaSpan = ParallelMath::Extract(alphaSpan, block);
1945
1946 uint16_t lowMultiplier = singleAlphaSpan / offsetSpan;
1947 ParallelMath::PutUInt15(minMultiplier, block, lowMultiplier);
1948 }
1949
1950 if (is11Bit)
1951 {
1952 // Clamps this to valid multipliers under 15 and rounds down to nearest multiple of 8
1953 minMultiplier = ParallelMath::Min(minMultiplier, ParallelMath::MakeUInt15(112)) & ParallelMath::MakeUInt15(120);
1954 }
1955 else
1956 {
1957 // We cap at 1 and 14 so both multipliers are valid and dividable
1958 // Cases where offset span is 0 should be caught by multiplier 1 of table 13
1959 minMultiplier = ParallelMath::Max(ParallelMath::Min(minMultiplier, ParallelMath::MakeUInt15(14)), ParallelMath::MakeUInt15(1));
1960 }
1961
1962 for (uint16_t multiplierOffset = 0; multiplierOffset < 2; multiplierOffset++)
1963 {
1964 MUInt15 multiplier = minMultiplier;
1965
1966 if (is11Bit)
1967 {
1968 if (multiplierOffset == 1)
1969 multiplier = multiplier + ParallelMath::MakeUInt15(8);
1970 else
1971 multiplier = ParallelMath::Max(multiplier, ParallelMath::MakeUInt15(1));
1972 }
1973 else
1974 {
1975 if (multiplierOffset == 1)
1976 multiplier = multiplier + ParallelMath::MakeUInt15(1);
1977 }
1978
1979 MSInt16 multipliedMinOffset = ParallelMath::CompactMultiply(ParallelMath::LosslessCast<MSInt16>::Cast(multiplier), vminOffset);
1980 MUInt15 multipliedMaxOffset = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(multiplier, vmaxOffset));
1981
1982 // codeword = (maxOffset + minOffset + minAlpha + maxAlpha) / 2
1983 MSInt16 unclampedBaseAlphaTimes2 = ParallelMath::LosslessCast<MSInt16>::Cast(alphaSpanMidpointTimes2) - ParallelMath::LosslessCast<MSInt16>::Cast(multipliedMaxOffset) - multipliedMinOffset;
1984
1985 MUInt15 baseAlpha;
1986 if (is11Bit)
1987 {
1988 // In unsigned, 4 is added to the unquantized alpha, so compensating for that cancels the 4 we have to add to do rounding.
1989 if (isSigned)
1990 unclampedBaseAlphaTimes2 = unclampedBaseAlphaTimes2 + ParallelMath::MakeSInt16(8);
1991
1992 // -128 is illegal for some reason
1993 MSInt16 minBaseAlphaTimes2 = isSigned ? ParallelMath::MakeSInt16(16) : ParallelMath::MakeSInt16(0);
1994
1995 MUInt15 clampedBaseAlphaTimes2 = ParallelMath::Min(ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(unclampedBaseAlphaTimes2, minBaseAlphaTimes2)), ParallelMath::MakeUInt15(4095));
1996 baseAlpha = ParallelMath::RightShift(clampedBaseAlphaTimes2, 1) & ParallelMath::MakeUInt15(2040);
1997
1998 if (!isSigned)
1999 baseAlpha = baseAlpha + ParallelMath::MakeUInt15(4);
2000 }
2001 else
2002 {
2003 MUInt15 clampedBaseAlphaTimes2 = ParallelMath::Min(ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(unclampedBaseAlphaTimes2, ParallelMath::MakeSInt16(0))), ParallelMath::MakeUInt15(510));
2004 baseAlpha = ParallelMath::RightShift(clampedBaseAlphaTimes2 + ParallelMath::MakeUInt15(1), 1);
2005 }
2006
2007 MUInt15 indexes[16];
2008 MUInt31 totalError = ParallelMath::MakeUInt31(0);
2009 for (int px = 0; px < 16; px++)
2010 {
2011 MUInt15 quantizedValues;
2012 QuantizeETC2Alpha(tableIndex, pixels[px], baseAlpha, multiplier, is11Bit, isSigned, indexes[px], quantizedValues);
2013
2014 if (is11Bit)
2015 {
2016 MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(quantizedValues) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px]);
2017 MSInt32 deltaSq = ParallelMath::XMultiply(delta, delta);
2018 totalError = totalError + ParallelMath::LosslessCast<MUInt31>::Cast(deltaSq);
2019 }
2020 else
2021 totalError = totalError + ParallelMath::ToUInt31(ParallelMath::SqDiffUInt8(quantizedValues, pixels[px]));
2022 }
2023
2024 ParallelMath::Int16CompFlag isBetter = ParallelMath::Int32FlagToInt16(ParallelMath::Less(totalError, bestTotalError));
2025 if (ParallelMath::AnySet(isBetter))
2026 {
2027 ParallelMath::ConditionalSet(bestTotalError, isBetter, totalError);
2028 ParallelMath::ConditionalSet(bestTableIndex, isBetter, ParallelMath::MakeUInt15(tableIndex));
2029 ParallelMath::ConditionalSet(bestBaseCodeword, isBetter, baseAlpha);
2030 ParallelMath::ConditionalSet(bestMultiplier, isBetter, multiplier);
2031
2032 for (int px = 0; px < 16; px++)
2033 ParallelMath::ConditionalSet(bestIndexes[px], isBetter, indexes[px]);
2034 }
2035
2036 // TODO: Do one refine pass
2037 }
2038 }
2039 }
2040
2041 if (is11Bit)
2042 {
2043 bestMultiplier = ParallelMath::RightShift(bestMultiplier, 3);
2044
2045 if (isSigned)
2046 bestBaseCodeword = bestBaseCodeword ^ ParallelMath::MakeUInt15(0x80);
2047 }
2048
2049 for (int block = 0; block < ParallelMath::ParallelSize; block++)
2050 {
2051 uint8_t *output = outputBuffer + block * 8;
2052
2053 output[0] = static_cast<uint8_t>(ParallelMath::Extract(bestBaseCodeword, block));
2054
2055 ParallelMath::ScalarUInt16 multiplier = ParallelMath::Extract(bestMultiplier, block);
2056 ParallelMath::ScalarUInt16 tableIndex = ParallelMath::Extract(bestTableIndex, block);
2057
2058 output[1] = static_cast<uint8_t>((multiplier << 4) | tableIndex);
2059
2060 static const int pixelSelectorOrder[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
2061
2062 ParallelMath::ScalarUInt16 indexes[16];
2063 for (int px = 0; px < 16; px++)
2064 indexes[pixelSelectorOrder[px]] = ParallelMath::Extract(bestIndexes[px], block);
2065
2066 int outputOffset = 2;
2067 int outputBits = 0;
2068 int numOutputBits = 0;
2069 for (int s = 0; s < 16; s++)
2070 {
2071 outputBits = (outputBits << 3) | indexes[s];
2072 numOutputBits += 3;
2073
2074 if (numOutputBits >= 8)
2075 {
2076 output[outputOffset++] = static_cast<uint8_t>(outputBits >> (numOutputBits - 8));
2077 numOutputBits -= 8;
2078
2079 outputBits &= ((1 << numOutputBits) - 1);
2080 }
2081 }
2082
2083 assert(outputOffset == 8 && numOutputBits == 0);
2084 }
2085}
2086
2087void cvtt::Internal::ETCComputer::CompressEACBlock(uint8_t *outputBuffer, const PixelBlockScalarS16 *inputBlocks, bool isSigned, const Options &options)
2088{
2089 MUInt15 pixels[16];
2090 for (int px = 0; px < 16; px++)
2091 {
2092 MSInt16 adjustedPixel;
2093 for (int block = 0; block < ParallelMath::ParallelSize; block++)
2094 ParallelMath::PutSInt16(adjustedPixel, block, inputBlocks[block].m_pixels[px]);
2095
2096 // We use a slightly shifted range here so we can keep the unquantized base color in a UInt15
2097 // That is, signed range is 1..2047, and unsigned range is 0..2047
2098 if (isSigned)
2099 {
2100 adjustedPixel = ParallelMath::Min(adjustedPixel, ParallelMath::MakeSInt16(1023)) + ParallelMath::MakeSInt16(1024);
2101 adjustedPixel = ParallelMath::Max(ParallelMath::MakeSInt16(1), adjustedPixel);
2102 }
2103 else
2104 {
2105 adjustedPixel = ParallelMath::Min(adjustedPixel, ParallelMath::MakeSInt16(2047));
2106 adjustedPixel = ParallelMath::Max(ParallelMath::MakeSInt16(0), adjustedPixel);
2107 }
2108
2109
2110 pixels[px] = ParallelMath::LosslessCast<MUInt15>::Cast(adjustedPixel);
2111 }
2112
2113 CompressETC2AlphaBlockInternal(outputBuffer, pixels, true, isSigned, options);
2114}
2115
2116void cvtt::Internal::ETCComputer::CompressETC1Block(uint8_t *outputBuffer, const PixelBlockU8 *inputBlocks, ETC1CompressionData *compressionData, const Options &options)
2117{
2118 DifferentialResolveStorage &drs = static_cast<ETC1CompressionDataInternal*>(compressionData)->m_drs;
2119 MFloat bestTotalError = ParallelMath::MakeFloat(FLT_MAX);
2120
2121 MUInt15 pixels[16][3];
2122 MFloat preWeightedPixels[16][3];
2123 ExtractBlocks(pixels, preWeightedPixels, inputBlocks, options);
2124
2125 CompressETC1BlockInternal(bestTotalError, outputBuffer, pixels, preWeightedPixels, drs, options, false);
2126}
2127
2128void cvtt::Internal::ETCComputer::ExtractBlocks(MUInt15 pixels[16][3], MFloat preWeightedPixels[16][3], const PixelBlockU8 *inputBlocks, const Options &options)
2129{
2130 bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
2131 bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
2132
2133 for (int px = 0; px < 16; px++)
2134 {
2135 for (int ch = 0; ch < 3; ch++)
2136 {
2137 for (int block = 0; block < ParallelMath::ParallelSize; block++)
2138 ParallelMath::PutUInt15(pixels[px][ch], block, inputBlocks[block].m_pixels[px][ch]);
2139 }
2140
2141 if (isFakeBT709)
2142 ConvertToFakeBT709(preWeightedPixels[px], pixels[px]);
2143 else if (isUniform)
2144 {
2145 for (int ch = 0; ch < 3; ch++)
2146 preWeightedPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
2147 }
2148 else
2149 {
2150 preWeightedPixels[px][0] = ParallelMath::ToFloat(pixels[px][0]) * options.redWeight;
2151 preWeightedPixels[px][1] = ParallelMath::ToFloat(pixels[px][1]) * options.greenWeight;
2152 preWeightedPixels[px][2] = ParallelMath::ToFloat(pixels[px][2]) * options.blueWeight;
2153 }
2154 }
2155}
2156
2157void cvtt::Internal::ETCComputer::ResolveHalfBlockFakeBT709RoundingAccurate(MUInt15 quantized[3], const MUInt15 sectorCumulative[3], bool isDifferential)
2158{
2159 for (int ch = 0; ch < 3; ch++)
2160 {
2161 const MUInt15& cu15 = sectorCumulative[ch];
2162
2163 if (isDifferential)
2164 {
2165 //quantized[ch] = (cu * 31 + (cu >> 3)) >> 11;
2166 quantized[ch] = ParallelMath::ToUInt15(
2167 ParallelMath::RightShift(
2168 (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3))
2169 , 11)
2170 );
2171 }
2172 else
2173 {
2174 //quantized[ch] = (cu * 30 + (cu >> 3)) >> 12;
2175 quantized[ch] = ParallelMath::ToUInt15(
2176 ParallelMath::RightShift(
2177 (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15 << 1) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3))
2178 , 12)
2179 );
2180 }
2181 }
2182
2183 MFloat lowOctantRGBFloat[3];
2184 MFloat highOctantRGBFloat[3];
2185
2186 for (int ch = 0; ch < 3; ch++)
2187 {
2188 MUInt15 unquantized;
2189 MUInt15 unquantizedNext;
2190 if (isDifferential)
2191 {
2192 unquantized = (quantized[ch] << 3) | ParallelMath::RightShift(quantized[ch], 2);
2193 MUInt15 quantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(31), quantized[ch] + ParallelMath::MakeUInt15(1));
2194 unquantizedNext = (quantizedNext << 3) | ParallelMath::RightShift(quantizedNext, 2);
2195 }
2196 else
2197 {
2198 unquantized = (quantized[ch] << 4) | quantized[ch];
2199 unquantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantized + ParallelMath::MakeUInt15(17));
2200 }
2201 lowOctantRGBFloat[ch] = ParallelMath::ToFloat(unquantized << 3);
2202 highOctantRGBFloat[ch] = ParallelMath::ToFloat(unquantizedNext << 3);
2203 }
2204
2205 MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
2206 MUInt15 bestOctant = ParallelMath::MakeUInt15(0);
2207
2208 MFloat cumulativeYUV[3];
2209 ConvertToFakeBT709(cumulativeYUV, sectorCumulative);
2210
2211 for (uint16_t octant = 0; octant < 8; octant++)
2212 {
2213 const MFloat &r = (octant & 1) ? highOctantRGBFloat[0] : lowOctantRGBFloat[0];
2214 const MFloat &g = (octant & 2) ? highOctantRGBFloat[1] : lowOctantRGBFloat[1];
2215 const MFloat &b = (octant & 4) ? highOctantRGBFloat[2] : lowOctantRGBFloat[2];
2216
2217 MFloat octantYUV[3];
2218 ConvertToFakeBT709(octantYUV, r, g, b);
2219
2220 MFloat delta[3];
2221 for (int ch = 0; ch < 3; ch++)
2222 delta[ch] = octantYUV[ch] - cumulativeYUV[ch];
2223
2224 MFloat error = delta[0] * delta[0] + delta[1] + delta[1] + delta[2] * delta[2];
2225 ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
2226 ParallelMath::ConditionalSet(bestOctant, errorBetter, ParallelMath::MakeUInt15(octant));
2227 bestError = ParallelMath::Min(error, bestError);
2228 }
2229
2230 for (int ch = 0; ch < 3; ch++)
2231 quantized[ch] = quantized[ch] + (ParallelMath::RightShift(bestOctant, ch) & ParallelMath::MakeUInt15(1));
2232}
2233
2234void cvtt::Internal::ETCComputer::ResolveHalfBlockFakeBT709RoundingFast(MUInt15 quantized[3], const MUInt15 sectorCumulative[3], bool isDifferential)
2235{
2236 // sectorCumulative range is 0..2040 (11 bits)
2237 MUInt15 roundingOffset = ParallelMath::MakeUInt15(0);
2238
2239 MUInt15 rOffset;
2240 MUInt15 gOffset;
2241 MUInt15 bOffset;
2242 MUInt15 quantizedBase[3];
2243 MUInt15 upperBound;
2244
2245 MUInt15 sectorCumulativeFillIn[3];
2246 for (int ch = 0; ch < 3; ch++)
2247 sectorCumulativeFillIn[ch] = sectorCumulative[ch] + ParallelMath::RightShift(sectorCumulative[ch], 8);
2248
2249 if (isDifferential)
2250 {
2251 rOffset = (sectorCumulativeFillIn[0] << 6) & ParallelMath::MakeUInt15(0xf00);
2252 gOffset = (sectorCumulativeFillIn[1] << 4) & ParallelMath::MakeUInt15(0x0f0);
2253 bOffset = ParallelMath::RightShift(sectorCumulativeFillIn[2], 2) & ParallelMath::MakeUInt15(0x00f);
2254
2255 for (int ch = 0; ch < 3; ch++)
2256 quantizedBase[ch] = ParallelMath::RightShift(sectorCumulativeFillIn[ch], 6);
2257
2258 upperBound = ParallelMath::MakeUInt15(31);
2259 }
2260 else
2261 {
2262 rOffset = (sectorCumulativeFillIn[0] << 5) & ParallelMath::MakeUInt15(0xf00);
2263 gOffset = (sectorCumulativeFillIn[1] << 1) & ParallelMath::MakeUInt15(0x0f0);
2264 bOffset = ParallelMath::RightShift(sectorCumulativeFillIn[2], 3) & ParallelMath::MakeUInt15(0x00f);
2265
2266 for (int ch = 0; ch < 3; ch++)
2267 quantizedBase[ch] = ParallelMath::RightShift(sectorCumulativeFillIn[ch], 7);
2268
2269 upperBound = ParallelMath::MakeUInt15(15);
2270 }
2271
2272 MUInt15 lookupIndex = (rOffset | gOffset | bOffset);
2273
2274 MUInt15 octant;
2275 for (int block = 0; block < ParallelMath::ParallelSize; block++)
2276 ParallelMath::PutUInt15(octant, block, Tables::FakeBT709::g_rounding16[ParallelMath::Extract(lookupIndex, block)]);
2277
2278 quantizedBase[0] = quantizedBase[0] + (octant & ParallelMath::MakeUInt15(1));
2279 quantizedBase[1] = quantizedBase[1] + (ParallelMath::RightShift(octant, 1) & ParallelMath::MakeUInt15(1));
2280 quantizedBase[2] = quantizedBase[2] + (ParallelMath::RightShift(octant, 2) & ParallelMath::MakeUInt15(1));
2281
2282 for (int ch = 0; ch < 3; ch++)
2283 quantized[ch] = ParallelMath::Min(quantizedBase[ch], upperBound);
2284}
2285
2286void cvtt::Internal::ETCComputer::ResolveTHFakeBT709Rounding(MUInt15 quantized[3], const MUInt15 targets[3], const MUInt15 &granularity)
2287{
2288 MFloat lowOctantRGBFloat[3];
2289 MFloat highOctantRGBFloat[3];
2290
2291 for (int ch = 0; ch < 3; ch++)
2292 {
2293 MUInt15 unquantized = (quantized[ch] << 4) | quantized[ch];
2294 MUInt15 unquantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantized + ParallelMath::MakeUInt15(17));
2295
2296 lowOctantRGBFloat[ch] = ParallelMath::ToFloat(ParallelMath::CompactMultiply(unquantized, granularity) << 1);
2297 highOctantRGBFloat[ch] = ParallelMath::ToFloat(ParallelMath::CompactMultiply(unquantizedNext, granularity) << 1);
2298 }
2299
2300 MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
2301 MUInt15 bestOctant = ParallelMath::MakeUInt15(0);
2302
2303 MFloat cumulativeYUV[3];
2304 ConvertToFakeBT709(cumulativeYUV, ParallelMath::ToFloat(targets[0]), ParallelMath::ToFloat(targets[1]), ParallelMath::ToFloat(targets[2]));
2305
2306 for (uint16_t octant = 0; octant < 8; octant++)
2307 {
2308 const MFloat &r = (octant & 1) ? highOctantRGBFloat[0] : lowOctantRGBFloat[0];
2309 const MFloat &g = (octant & 2) ? highOctantRGBFloat[1] : lowOctantRGBFloat[1];
2310 const MFloat &b = (octant & 4) ? highOctantRGBFloat[2] : lowOctantRGBFloat[2];
2311
2312 MFloat octantYUV[3];
2313 ConvertToFakeBT709(octantYUV, r, g, b);
2314
2315 MFloat delta[3];
2316 for (int ch = 0; ch < 3; ch++)
2317 delta[ch] = octantYUV[ch] - cumulativeYUV[ch];
2318
2319 MFloat error = delta[0] * delta[0] + delta[1] + delta[1] + delta[2] * delta[2];
2320 ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
2321 ParallelMath::ConditionalSet(bestOctant, errorBetter, ParallelMath::MakeUInt15(octant));
2322 bestError = ParallelMath::Min(error, bestError);
2323 }
2324
2325 for (int ch = 0; ch < 3; ch++)
2326 quantized[ch] = quantized[ch] + (ParallelMath::RightShift(bestOctant, ch) & ParallelMath::MakeUInt15(1));
2327}
2328
2329void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MUInt15 color[3])
2330{
2331 MFloat floatRGB[3];
2332 for (int ch = 0; ch < 3; ch++)
2333 floatRGB[ch] = ParallelMath::ToFloat(color[ch]);
2334
2335 ConvertToFakeBT709(yuv, floatRGB);
2336}
2337
2338void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MFloat color[3])
2339{
2340 ConvertToFakeBT709(yuv, color[0], color[1], color[2]);
2341}
2342
2343void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MFloat &pr, const MFloat &pg, const MFloat &pb)
2344{
2345 MFloat r = pr;
2346 MFloat g = pg;
2347 MFloat b = pb;
2348
2349 yuv[0] = r * 0.368233989135369f + g * 1.23876274963149f + b * 0.125054068802017f;
2350 yuv[1] = r * 0.5f - g * 0.4541529f - b * 0.04584709f;
2351 yuv[2] = r * -0.081014709086133f - g * 0.272538676238785f + b * 0.353553390593274f;
2352}
2353
2354void cvtt::Internal::ETCComputer::ConvertFromFakeBT709(MFloat rgb[3], const MFloat yuv[3])
2355{
2356 MFloat yy = yuv[0] * 0.57735026466774571071f;
2357 MFloat u = yuv[1];
2358 MFloat v = yuv[2];
2359
2360 rgb[0] = yy + u * 1.5748000207960953486f;
2361 rgb[1] = yy - u * 0.46812425854364753669f - v * 0.26491652528157560861f;
2362 rgb[2] = yy + v * 2.6242146882856944069f;
2363}
2364
2365
2366void cvtt::Internal::ETCComputer::QuantizeETC2Alpha(int tableIndex, const MUInt15& value, const MUInt15& baseValue, const MUInt15& multiplier, bool is11Bit, bool isSigned, MUInt15& outIndexes, MUInt15& outQuantizedValues)
2367{
2368 MSInt16 offset = ParallelMath::LosslessCast<MSInt16>::Cast(value) - ParallelMath::LosslessCast<MSInt16>::Cast(baseValue);
2369 MSInt16 offsetTimes2 = offset + offset;
2370
2371 // ETC2's offset tables all have a reflect about 0.5*multiplier
2372 MSInt16 offsetAboutReflectorTimes2 = offsetTimes2 + ParallelMath::LosslessCast<MSInt16>::Cast(multiplier);
2373
2374 MUInt15 absOffsetAboutReflectorTimes2 = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Abs(offsetAboutReflectorTimes2));
2375 MUInt15 lookupIndex = ParallelMath::RightShift(absOffsetAboutReflectorTimes2, 1);
2376
2377 MUInt15 positiveIndex;
2378 MUInt15 positiveOffsetUnmultiplied;
2379 for (int block = 0; block < ParallelMath::ParallelSize; block++)
2380 {
2381 uint16_t blockLookupIndex = ParallelMath::Extract(lookupIndex, block) / ParallelMath::Extract(multiplier, block);
2382 if (blockLookupIndex >= Tables::ETC2::g_alphaRoundingTableWidth)
2383 blockLookupIndex = Tables::ETC2::g_alphaRoundingTableWidth - 1;
2384 uint16_t index = Tables::ETC2::g_alphaRoundingTables[tableIndex][blockLookupIndex];
2385 ParallelMath::PutUInt15(positiveIndex, block, index);
2386 ParallelMath::PutUInt15(positiveOffsetUnmultiplied, block, Tables::ETC2::g_alphaModifierTablePositive[tableIndex][index]);
2387
2388 // TODO: This is suboptimal when the offset is capped. We should detect 0 and 255 values and always map them to the maximum offsets.
2389 // Doing that will also affect refinement though.
2390 }
2391
2392 MSInt16 signBits = ParallelMath::RightShift(offsetAboutReflectorTimes2, 15);
2393 MSInt16 offsetUnmultiplied = ParallelMath::LosslessCast<MSInt16>::Cast(positiveOffsetUnmultiplied) ^ signBits;
2394 MSInt16 quantizedOffset = ParallelMath::CompactMultiply(offsetUnmultiplied, multiplier);
2395
2396 MSInt16 offsetValue = ParallelMath::LosslessCast<MSInt16>::Cast(baseValue) + quantizedOffset;
2397
2398 if (is11Bit)
2399 {
2400 if (isSigned)
2401 outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(2047), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(1), offsetValue)));
2402 else
2403 outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(2047), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), offsetValue)));
2404 }
2405 else
2406 outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(255), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), offsetValue)));
2407
2408 MUInt15 indexSub = ParallelMath::LosslessCast<MUInt15>::Cast(signBits) & ParallelMath::MakeUInt15(4);
2409
2410 outIndexes = positiveIndex + ParallelMath::MakeUInt15(4) - indexSub;
2411}
2412
2413
2414void cvtt::Internal::ETCComputer::EmitTModeBlock(uint8_t *outputBuffer, const ParallelMath::ScalarUInt16 lineColor[3], const ParallelMath::ScalarUInt16 isolatedColor[3], int32_t packedSelectors, ParallelMath::ScalarUInt16 table, bool opaque)
2415{
2416 static const int selectorOrder[] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
2417
2418 uint32_t lowBits = 0;
2419 uint32_t highBits = 0;
2420
2421 int rh = ((isolatedColor[0] >> 2) & 3);
2422 int rl = (isolatedColor[0] & 3);
2423
2424 if (rh + rl < 4)
2425 {
2426 // Overflow low
2427 highBits |= 1 << (58 - 32);
2428 }
2429 else
2430 {
2431 // Overflow high
2432 highBits |= 7 << (61 - 32);
2433 }
2434
2435 highBits |= rh << (59 - 32);
2436 highBits |= rl << (56 - 32);
2437 highBits |= isolatedColor[1] << (52 - 32);
2438 highBits |= isolatedColor[2] << (48 - 32);
2439 highBits |= lineColor[0] << (44 - 32);
2440 highBits |= lineColor[1] << (40 - 32);
2441 highBits |= lineColor[2] << (36 - 32);
2442 highBits |= ((table >> 1) & 3) << (34 - 32);
2443 if (opaque)
2444 highBits |= 1 << (33 - 32);
2445 highBits |= (table & 1) << (32 - 32);
2446
2447 for (int px = 0; px < 16; px++)
2448 {
2449 int sel = (packedSelectors >> (2 * selectorOrder[px])) & 3;
2450 if ((sel & 0x1) != 0)
2451 lowBits |= (1 << px);
2452 if ((sel & 0x2) != 0)
2453 lowBits |= (1 << (16 + px));
2454 }
2455
2456 for (int i = 0; i < 4; i++)
2457 outputBuffer[i] = (highBits >> (24 - i * 8)) & 0xff;
2458 for (int i = 0; i < 4; i++)
2459 outputBuffer[i + 4] = (lowBits >> (24 - i * 8)) & 0xff;
2460}
2461
2462void cvtt::Internal::ETCComputer::EmitHModeBlock(uint8_t *outputBuffer, const ParallelMath::ScalarUInt16 blockColors[2], ParallelMath::ScalarUInt16 sectorBits, ParallelMath::ScalarUInt16 signBits, ParallelMath::ScalarUInt16 table, bool opaque)
2463{
2464 if (blockColors[0] == blockColors[1])
2465 {
2466 // Base colors are the same.
2467 // If the table low bit isn't 1, then we can't encode this, because swapping the block colors will have no effect
2468 // on their order.
2469 // Instead, we encode this as T mode where all of the indexes are on the line.
2470
2471 ParallelMath::ScalarUInt16 lineColor[3];
2472 ParallelMath::ScalarUInt16 isolatedColor[3];
2473
2474 lineColor[0] = isolatedColor[0] = (blockColors[0] >> 10) & 0x1f;
2475 lineColor[1] = isolatedColor[1] = (blockColors[0] >> 5) & 0x1f;
2476 lineColor[2] = isolatedColor[2] = (blockColors[0] >> 0) & 0x1f;
2477
2478 int32_t packedSelectors = 0x55555555;
2479 for (int px = 0; px < 16; px++)
2480 packedSelectors |= ((signBits >> px) & 1) << ((px * 2) + 1);
2481
2482 EmitTModeBlock(outputBuffer, lineColor, isolatedColor, packedSelectors, table, opaque);
2483 return;
2484 }
2485
2486 static const int selectorOrder[] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
2487
2488 int16_t colors[2][3];
2489 for (int sector = 0; sector < 2; sector++)
2490 {
2491 for (int ch = 0; ch < 3; ch++)
2492 colors[sector][ch] = (blockColors[sector] >> ((2 - ch) * 5)) & 15;
2493 }
2494
2495 uint32_t lowBits = 0;
2496 uint32_t highBits = 0;
2497
2498 if (((table & 1) == 1) != (blockColors[0] > blockColors[1]))
2499 {
2500 for (int ch = 0; ch < 3; ch++)
2501 std::swap(colors[0][ch], colors[1][ch]);
2502 sectorBits ^= 0xffff;
2503 }
2504
2505 int r1 = colors[0][0];
2506 int g1a = colors[0][1] >> 1;
2507 int g1b = (colors[0][1] & 1);
2508 int b1a = colors[0][2] >> 3;
2509 int b1b = colors[0][2] & 7;
2510 int r2 = colors[1][0];
2511 int g2 = colors[1][1];
2512 int b2 = colors[1][2];
2513
2514 // Avoid overflowing R
2515 if ((g1a & 4) != 0 && r1 + g1a < 8)
2516 highBits |= 1 << (63 - 32);
2517
2518 int fakeDG = b1b >> 1;
2519 int fakeG = b1a | (g1b << 1);
2520
2521 if (fakeG + fakeDG < 4)
2522 {
2523 // Overflow low
2524 highBits |= 1 << (50 - 32);
2525 }
2526 else
2527 {
2528 // Overflow high
2529 highBits |= 7 << (53 - 32);
2530 }
2531
2532 int da = (table >> 2) & 1;
2533 int db = (table >> 1) & 1;
2534
2535 highBits |= r1 << (59 - 32);
2536 highBits |= g1a << (56 - 32);
2537 highBits |= g1b << (52 - 32);
2538 highBits |= b1a << (51 - 32);
2539 highBits |= b1b << (47 - 32);
2540 highBits |= r2 << (43 - 32);
2541 highBits |= g2 << (39 - 32);
2542 highBits |= b2 << (35 - 32);
2543 highBits |= da << (34 - 32);
2544 if (opaque)
2545 highBits |= 1 << (33 - 32);
2546 highBits |= db << (32 - 32);
2547
2548 for (int px = 0; px < 16; px++)
2549 {
2550 int sectorBit = (sectorBits >> selectorOrder[px]) & 1;
2551 int signBit = (signBits >> selectorOrder[px]) & 1;
2552
2553 lowBits |= (signBit << px);
2554 lowBits |= (sectorBit << (16 + px));
2555 }
2556
2557 uint8_t *output = outputBuffer;
2558
2559 for (int i = 0; i < 4; i++)
2560 output[i] = (highBits >> (24 - i * 8)) & 0xff;
2561 for (int i = 0; i < 4; i++)
2562 output[i + 4] = (lowBits >> (24 - i * 8)) & 0xff;
2563}
2564
2565void cvtt::Internal::ETCComputer::EmitETC1Block(uint8_t *outputBuffer, int blockBestFlip, int blockBestD, const int blockBestColors[2][3], const int blockBestTables[2], const ParallelMath::ScalarUInt16 blockBestSelectors[2], bool transparent)
2566{
2567 uint32_t highBits = 0;
2568 uint32_t lowBits = 0;
2569
2570 if (blockBestD == 0)
2571 {
2572 highBits |= blockBestColors[0][0] << 28;
2573 highBits |= blockBestColors[1][0] << 24;
2574 highBits |= blockBestColors[0][1] << 20;
2575 highBits |= blockBestColors[1][1] << 16;
2576 highBits |= blockBestColors[0][2] << 12;
2577 highBits |= blockBestColors[1][2] << 8;
2578 }
2579 else
2580 {
2581 highBits |= blockBestColors[0][0] << 27;
2582 highBits |= ((blockBestColors[1][0] - blockBestColors[0][0]) & 7) << 24;
2583 highBits |= blockBestColors[0][1] << 19;
2584 highBits |= ((blockBestColors[1][1] - blockBestColors[0][1]) & 7) << 16;
2585 highBits |= blockBestColors[0][2] << 11;
2586 highBits |= ((blockBestColors[1][2] - blockBestColors[0][2]) & 7) << 8;
2587 }
2588
2589 highBits |= (blockBestTables[0] << 5);
2590 highBits |= (blockBestTables[1] << 2);
2591 if (!transparent)
2592 highBits |= (blockBestD << 1);
2593 highBits |= blockBestFlip;
2594
2595 const uint8_t modifierCodes[4] = { 3, 2, 0, 1 };
2596
2597 uint8_t unpackedSelectors[16];
2598 uint8_t unpackedSelectorCodes[16];
2599 for (int sector = 0; sector < 2; sector++)
2600 {
2601 int blockSectorBestSelectors = blockBestSelectors[sector];
2602
2603 for (int px = 0; px < 8; px++)
2604 {
2605 int selector = (blockSectorBestSelectors >> (2 * px)) & 3;
2606 unpackedSelectorCodes[g_flipTables[blockBestFlip][sector][px]] = modifierCodes[selector];
2607 unpackedSelectors[g_flipTables[blockBestFlip][sector][px]] = selector;
2608 }
2609 }
2610
2611 const int pixelSelectorOrder[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
2612
2613 int lowBitOffset = 0;
2614 for (int sb = 0; sb < 2; sb++)
2615 for (int px = 0; px < 16; px++)
2616 lowBits |= ((unpackedSelectorCodes[pixelSelectorOrder[px]] >> sb) & 1) << (px + sb * 16);
2617
2618 for (int i = 0; i < 4; i++)
2619 outputBuffer[i] = (highBits >> (24 - i * 8)) & 0xff;
2620 for (int i = 0; i < 4; i++)
2621 outputBuffer[i + 4] = (lowBits >> (24 - i * 8)) & 0xff;
2622}
2623
2624void cvtt::Internal::ETCComputer::CompressETC1BlockInternal(MFloat &bestTotalError, uint8_t *outputBuffer, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], DifferentialResolveStorage &drs, const Options &options, bool punchthrough)
2625{
2626 int numTries = 0;
2627
2628 MUInt15 zeroU15 = ParallelMath::MakeUInt15(0);
2629 MUInt16 zeroU16 = ParallelMath::MakeUInt16(0);
2630
2631 MUInt15 bestColors[2] = { zeroU15, zeroU15 };
2632 MUInt16 bestSelectors[2] = { zeroU16, zeroU16 };
2633 MUInt15 bestTables[2] = { zeroU15, zeroU15 };
2634 MUInt15 bestFlip = zeroU15;
2635 MUInt15 bestD = zeroU15;
2636
2637 MUInt15 sectorPixels[2][2][8][3];
2638 MFloat sectorPreWeightedPixels[2][2][8][3];
2639 MUInt15 sectorCumulative[2][2][3];
2640
2641 ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
2642
2643 for (int flip = 0; flip < 2; flip++)
2644 {
2645 for (int sector = 0; sector < 2; sector++)
2646 {
2647 for (int ch = 0; ch < 3; ch++)
2648 sectorCumulative[flip][sector][ch] = zeroU15;
2649
2650 for (int px = 0; px < 8; px++)
2651 {
2652 for (int ch = 0; ch < 3; ch++)
2653 {
2654 MUInt15 pixelChannelValue = pixels[g_flipTables[flip][sector][px]][ch];
2655 sectorPixels[flip][sector][px][ch] = pixelChannelValue;
2656 sectorPreWeightedPixels[flip][sector][px][ch] = preWeightedPixels[g_flipTables[flip][sector][px]][ch];
2657 sectorCumulative[flip][sector][ch] = sectorCumulative[flip][sector][ch] + pixelChannelValue;
2658 }
2659 }
2660 }
2661 }
2662
2663 static const MSInt16 modifierTables[8][4] =
2664 {
2665 { ParallelMath::MakeSInt16(-8), ParallelMath::MakeSInt16(-2), ParallelMath::MakeSInt16(2), ParallelMath::MakeSInt16(8) },
2666 { ParallelMath::MakeSInt16(-17), ParallelMath::MakeSInt16(-5), ParallelMath::MakeSInt16(5), ParallelMath::MakeSInt16(17) },
2667 { ParallelMath::MakeSInt16(-29), ParallelMath::MakeSInt16(-9), ParallelMath::MakeSInt16(9), ParallelMath::MakeSInt16(29) },
2668 { ParallelMath::MakeSInt16(-42), ParallelMath::MakeSInt16(-13), ParallelMath::MakeSInt16(13), ParallelMath::MakeSInt16(42) },
2669 { ParallelMath::MakeSInt16(-60), ParallelMath::MakeSInt16(-18), ParallelMath::MakeSInt16(18), ParallelMath::MakeSInt16(60) },
2670 { ParallelMath::MakeSInt16(-80), ParallelMath::MakeSInt16(-24), ParallelMath::MakeSInt16(24), ParallelMath::MakeSInt16(80) },
2671 { ParallelMath::MakeSInt16(-106), ParallelMath::MakeSInt16(-33), ParallelMath::MakeSInt16(33), ParallelMath::MakeSInt16(106) },
2672 { ParallelMath::MakeSInt16(-183), ParallelMath::MakeSInt16(-47), ParallelMath::MakeSInt16(47), ParallelMath::MakeSInt16(183) },
2673 };
2674
2675 bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
2676
2677 int minD = punchthrough ? 1 : 0;
2678
2679 for (int flip = 0; flip < 2; flip++)
2680 {
2681 drs.diffNumAttempts[0] = drs.diffNumAttempts[1] = zeroU15;
2682
2683 MFloat bestIndError[2] = { ParallelMath::MakeFloat(FLT_MAX), ParallelMath::MakeFloat(FLT_MAX) };
2684 MUInt16 bestIndSelectors[2] = { ParallelMath::MakeUInt16(0), ParallelMath::MakeUInt16(0) };
2685 MUInt15 bestIndColors[2] = { zeroU15, zeroU15 };
2686 MUInt15 bestIndTable[2] = { zeroU15, zeroU15 };
2687
2688 for (int d = minD; d < 2; d++)
2689 {
2690 for (int sector = 0; sector < 2; sector++)
2691 {
2692 const int16_t *potentialOffsets = cvtt::Tables::ETC1::g_potentialOffsets4;
2693
2694 for (int table = 0; table < 8; table++)
2695 {
2696 int16_t numOffsets = *potentialOffsets++;
2697
2698 MUInt15 possibleColors[cvtt::Tables::ETC1::g_maxPotentialOffsets];
2699
2700 MUInt15 quantized[3];
2701 for (int oi = 0; oi < numOffsets; oi++)
2702 {
2703 if (!isFakeBT709)
2704 {
2705 for (int ch = 0; ch < 3; ch++)
2706 {
2707 // cu is in range 0..2040
2708 MUInt15 cu15 = ParallelMath::Min(
2709 ParallelMath::MakeUInt15(2040),
2710 ParallelMath::ToUInt15(
2711 ParallelMath::Max(
2712 ParallelMath::MakeSInt16(0),
2713 ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + ParallelMath::MakeSInt16(potentialOffsets[oi])
2714 )
2715 )
2716 );
2717
2718 if (d == 1)
2719 {
2720 //quantized[ch] = (cu * 31 + (cu >> 3) + 1024) >> 11;
2721 quantized[ch] = ParallelMath::ToUInt15(
2722 ParallelMath::RightShift(
2723 (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3)) + ParallelMath::MakeUInt16(1024)
2724 , 11)
2725 );
2726 }
2727 else
2728 {
2729 //quantized[ch] = (cu * 30 + (cu >> 3) + 2048) >> 12;
2730 quantized[ch] = ParallelMath::ToUInt15(
2731 ParallelMath::RightShift(
2732 (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15 << 1) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3)) + ParallelMath::MakeUInt16(2048)
2733 , 12)
2734 );
2735 }
2736 }
2737 }
2738 else
2739 {
2740 MUInt15 offsetCumulative[3];
2741 for (int ch = 0; ch < 3; ch++)
2742 {
2743 // cu is in range 0..2040
2744 MUInt15 cu15 = ParallelMath::Min(
2745 ParallelMath::MakeUInt15(2040),
2746 ParallelMath::ToUInt15(
2747 ParallelMath::Max(
2748 ParallelMath::MakeSInt16(0),
2749 ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + ParallelMath::MakeSInt16(potentialOffsets[oi])
2750 )
2751 )
2752 );
2753
2754 offsetCumulative[ch] = cu15;
2755 }
2756
2757 if ((options.flags & cvtt::Flags::ETC_FakeBT709Accurate) != 0)
2758 ResolveHalfBlockFakeBT709RoundingAccurate(quantized, offsetCumulative, d == 1);
2759 else
2760 ResolveHalfBlockFakeBT709RoundingFast(quantized, offsetCumulative, d == 1);
2761 }
2762
2763 possibleColors[oi] = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10);
2764 }
2765
2766 potentialOffsets += numOffsets;
2767
2768 ParallelMath::UInt15 numUniqueColors;
2769 for (int block = 0; block < ParallelMath::ParallelSize; block++)
2770 {
2771 uint16_t blockNumUniqueColors = 1;
2772 for (int i = 1; i < numOffsets; i++)
2773 {
2774 uint16_t color = ParallelMath::Extract(possibleColors[i], block);
2775 if (color != ParallelMath::Extract(possibleColors[blockNumUniqueColors - 1], block))
2776 ParallelMath::PutUInt15(possibleColors[blockNumUniqueColors++], block, color);
2777 }
2778
2779 ParallelMath::PutUInt15(numUniqueColors, block, blockNumUniqueColors);
2780 }
2781
2782 int maxUniqueColors = ParallelMath::Extract(numUniqueColors, 0);
2783 for (int block = 1; block < ParallelMath::ParallelSize; block++)
2784 maxUniqueColors = std::max<int>(maxUniqueColors, ParallelMath::Extract(numUniqueColors, block));
2785
2786 for (int block = 0; block < ParallelMath::ParallelSize; block++)
2787 {
2788 uint16_t fillColor = ParallelMath::Extract(possibleColors[0], block);
2789 for (int i = ParallelMath::Extract(numUniqueColors, block); i < maxUniqueColors; i++)
2790 ParallelMath::PutUInt15(possibleColors[i], block, fillColor);
2791 }
2792
2793 for (int i = 0; i < maxUniqueColors; i++)
2794 {
2795 MFloat error = ParallelMath::MakeFloatZero();
2796 MUInt16 selectors = ParallelMath::MakeUInt16(0);
2797 MUInt15 quantized = possibleColors[i];
2798 TestHalfBlock(error, selectors, quantized, sectorPixels[flip][sector], sectorPreWeightedPixels[flip][sector], modifierTables[table], d == 1, options);
2799
2800 if (d == 0)
2801 {
2802 ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestIndError[sector]));
2803 if (ParallelMath::AnySet(errorBetter))
2804 {
2805 bestIndError[sector] = ParallelMath::Min(error, bestIndError[sector]);
2806 ParallelMath::ConditionalSet(bestIndSelectors[sector], errorBetter, selectors);
2807 ParallelMath::ConditionalSet(bestIndColors[sector], errorBetter, quantized);
2808 ParallelMath::ConditionalSet(bestIndTable[sector], errorBetter, ParallelMath::MakeUInt15(table));
2809 }
2810 }
2811 else
2812 {
2813 ParallelMath::Int16CompFlag isInBounds = ParallelMath::Less(ParallelMath::MakeUInt15(i), numUniqueColors);
2814
2815 MUInt15 storageIndexes = drs.diffNumAttempts[sector];
2816 drs.diffNumAttempts[sector] = drs.diffNumAttempts[sector] + ParallelMath::SelectOrZero(isInBounds, ParallelMath::MakeUInt15(1));
2817
2818 for (int block = 0; block < ParallelMath::ParallelSize; block++)
2819 {
2820 int storageIndex = ParallelMath::Extract(storageIndexes, block);
2821
2822 ParallelMath::PutFloat(drs.diffErrors[sector][storageIndex], block, ParallelMath::Extract(error, block));
2823 ParallelMath::PutUInt16(drs.diffSelectors[sector][storageIndex], block, ParallelMath::Extract(selectors, block));
2824 ParallelMath::PutUInt15(drs.diffColors[sector][storageIndex], block, ParallelMath::Extract(quantized, block));
2825 ParallelMath::PutUInt15(drs.diffTables[sector][storageIndex], block, table);
2826 }
2827 }
2828 }
2829 }
2830 }
2831
2832 if (d == 0)
2833 {
2834 MFloat bestIndErrorTotal = bestIndError[0] + bestIndError[1];
2835 ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(bestIndErrorTotal, bestTotalError));
2836 if (ParallelMath::AnySet(errorBetter))
2837 {
2838 bestIsThisMode = bestIsThisMode | errorBetter;
2839
2840 bestTotalError = ParallelMath::Min(bestTotalError, bestIndErrorTotal);
2841 ParallelMath::ConditionalSet(bestFlip, errorBetter, ParallelMath::MakeUInt15(flip));
2842 ParallelMath::ConditionalSet(bestD, errorBetter, ParallelMath::MakeUInt15(d));
2843 for (int sector = 0; sector < 2; sector++)
2844 {
2845 ParallelMath::ConditionalSet(bestColors[sector], errorBetter, bestIndColors[sector]);
2846 ParallelMath::ConditionalSet(bestSelectors[sector], errorBetter, bestIndSelectors[sector]);
2847 ParallelMath::ConditionalSet(bestTables[sector], errorBetter, bestIndTable[sector]);
2848 }
2849 }
2850 }
2851 else
2852 {
2853 ParallelMath::Int16CompFlag canIgnoreSector[2] = { ParallelMath::MakeBoolInt16(false), ParallelMath::MakeBoolInt16(false) };
2854 FindBestDifferentialCombination(flip, d, canIgnoreSector, bestIsThisMode, bestTotalError, bestFlip, bestD, bestColors, bestSelectors, bestTables, drs);
2855 }
2856 }
2857 }
2858
2859 for (int block = 0; block < ParallelMath::ParallelSize; block++)
2860 {
2861 if (!ParallelMath::Extract(bestIsThisMode, block))
2862 continue;
2863
2864 uint32_t highBits = 0;
2865 uint32_t lowBits = 0;
2866
2867 int blockBestFlip = ParallelMath::Extract(bestFlip, block);
2868 int blockBestD = ParallelMath::Extract(bestD, block);
2869 int blockBestTables[2] = { ParallelMath::Extract(bestTables[0], block), ParallelMath::Extract(bestTables[1], block) };
2870 ParallelMath::ScalarUInt16 blockBestSelectors[2] = { ParallelMath::Extract(bestSelectors[0], block), ParallelMath::Extract(bestSelectors[1], block) };
2871
2872 int colors[2][3];
2873 for (int sector = 0; sector < 2; sector++)
2874 {
2875 int sectorColor = ParallelMath::Extract(bestColors[sector], block);
2876 for (int ch = 0; ch < 3; ch++)
2877 colors[sector][ch] = (sectorColor >> (ch * 5)) & 31;
2878 }
2879
2880 EmitETC1Block(outputBuffer + block * 8, blockBestFlip, blockBestD, colors, blockBestTables, blockBestSelectors, false);
2881 }
2882}
2883
2884
2885void cvtt::Internal::ETCComputer::CompressETC1PunchthroughBlockInternal(MFloat &bestTotalError, uint8_t *outputBuffer, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const ParallelMath::Int16CompFlag isTransparent[16], DifferentialResolveStorage &drs, const Options &options)
2886{
2887 int numTries = 0;
2888
2889 MUInt15 zeroU15 = ParallelMath::MakeUInt15(0);
2890 MUInt16 zeroU16 = ParallelMath::MakeUInt16(0);
2891
2892 MUInt15 bestColors[2] = { zeroU15, zeroU15 };
2893 MUInt16 bestSelectors[2] = { zeroU16, zeroU16 };
2894 MUInt15 bestTables[2] = { zeroU15, zeroU15 };
2895 MUInt15 bestFlip = zeroU15;
2896
2897 MUInt15 sectorPixels[2][2][8][3];
2898 ParallelMath::Int16CompFlag sectorTransparent[2][2][8];
2899 MFloat sectorPreWeightedPixels[2][2][8][3];
2900 MUInt15 sectorCumulative[2][2][3];
2901
2902 ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
2903
2904 for (int flip = 0; flip < 2; flip++)
2905 {
2906 for (int sector = 0; sector < 2; sector++)
2907 {
2908 for (int ch = 0; ch < 3; ch++)
2909 sectorCumulative[flip][sector][ch] = zeroU15;
2910
2911 for (int px = 0; px < 8; px++)
2912 {
2913 for (int ch = 0; ch < 3; ch++)
2914 {
2915 MUInt15 pixelChannelValue = pixels[g_flipTables[flip][sector][px]][ch];
2916 sectorPixels[flip][sector][px][ch] = pixelChannelValue;
2917 sectorPreWeightedPixels[flip][sector][px][ch] = preWeightedPixels[g_flipTables[flip][sector][px]][ch];
2918 sectorCumulative[flip][sector][ch] = sectorCumulative[flip][sector][ch] + pixelChannelValue;
2919 }
2920
2921 sectorTransparent[flip][sector][px] = isTransparent[g_flipTables[flip][sector][px]];
2922 }
2923 }
2924 }
2925
2926 static const MUInt15 modifiers[8] =
2927 {
2928 ParallelMath::MakeUInt15(8),
2929 ParallelMath::MakeUInt15(17),
2930 ParallelMath::MakeUInt15(29),
2931 ParallelMath::MakeUInt15(42),
2932 ParallelMath::MakeUInt15(60),
2933 ParallelMath::MakeUInt15(80),
2934 ParallelMath::MakeUInt15(106),
2935 ParallelMath::MakeUInt15(183),
2936 };
2937
2938 bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
2939
2940 const int maxSectorCumulativeOffsets = 17;
2941
2942 for (int flip = 0; flip < 2; flip++)
2943 {
2944 ParallelMath::Int16CompFlag canIgnoreSector[2] = { ParallelMath::MakeBoolInt16(true), ParallelMath::MakeBoolInt16(false) };
2945
2946 for (int sector = 0; sector < 2; sector++)
2947 for (int px = 0; px < 8; px++)
2948 canIgnoreSector[sector] = canIgnoreSector[sector] & sectorTransparent[flip][sector][px];
2949
2950 drs.diffNumAttempts[0] = drs.diffNumAttempts[1] = zeroU15;
2951
2952 for (int sector = 0; sector < 2; sector++)
2953 {
2954 MUInt15 sectorNumOpaque = ParallelMath::MakeUInt15(0);
2955 for (int px = 0; px < 8; px++)
2956 sectorNumOpaque = sectorNumOpaque + ParallelMath::SelectOrZero(sectorTransparent[flip][sector][px], ParallelMath::MakeUInt15(1));
2957
2958 int sectorMaxOpaque = 0;
2959 for (int block = 0; block < ParallelMath::ParallelSize; block++)
2960 sectorMaxOpaque = std::max<int>(sectorMaxOpaque, ParallelMath::Extract(sectorNumOpaque, block));
2961
2962 int sectorNumOpaqueMultipliers = sectorMaxOpaque * 2 + 1;
2963
2964 MUInt15 sectorNumOpaqueDenominator = ParallelMath::Max(ParallelMath::MakeUInt15(1), sectorNumOpaque) << 8;
2965 MUInt15 sectorNumOpaqueAddend = sectorNumOpaque << 7;
2966
2967 MSInt16 sectorNumOpaqueSigned = ParallelMath::LosslessCast<MSInt16>::Cast(sectorNumOpaque);
2968 MSInt16 negSectorNumOpaqueSigned = ParallelMath::MakeSInt16(0) - sectorNumOpaqueSigned;
2969
2970 MUInt15 sectorCumulativeMax = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(ParallelMath::MakeUInt15(255), sectorNumOpaque));
2971
2972 for (int table = 0; table < 8; table++)
2973 {
2974 MUInt15 possibleColors[maxSectorCumulativeOffsets];
2975
2976 MUInt15 quantized[3];
2977 for (int om = -sectorMaxOpaque; om <= sectorMaxOpaque; om++)
2978 {
2979 MSInt16 clampedOffsetMult = ParallelMath::Max(ParallelMath::Min(ParallelMath::MakeSInt16(om), sectorNumOpaqueSigned), negSectorNumOpaqueSigned);
2980 MSInt16 offset = ParallelMath::CompactMultiply(clampedOffsetMult, modifiers[table]);
2981
2982 for (int ch = 0; ch < 3; ch++)
2983 {
2984 // cu is in range 0..255*numOpaque (at most 0..2040)
2985 MUInt15 cu15 = ParallelMath::Min(
2986 sectorCumulativeMax,
2987 ParallelMath::ToUInt15(
2988 ParallelMath::Max(
2989 ParallelMath::MakeSInt16(0),
2990 ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + offset
2991 )
2992 )
2993 );
2994
2995 //quantized[ch] = (cu * 31 + (cu >> 3) + (numOpaque * 128)) / (numOpaque * 256)
2996 MUInt16 cuTimes31 = (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15);
2997 MUInt15 cuDiv8 = ParallelMath::RightShift(cu15, 3);
2998 MUInt16 numerator = cuTimes31 + ParallelMath::LosslessCast<MUInt16>::Cast(cuDiv8 + sectorNumOpaqueAddend);
2999 for (int block = 0; block < ParallelMath::ParallelSize; block++)
3000 ParallelMath::PutUInt15(quantized[ch], block, ParallelMath::Extract(numerator, block) / ParallelMath::Extract(sectorNumOpaqueDenominator, block));
3001 }
3002
3003 possibleColors[om + sectorMaxOpaque] = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10);
3004 }
3005
3006 ParallelMath::UInt15 numUniqueColors;
3007 for (int block = 0; block < ParallelMath::ParallelSize; block++)
3008 {
3009 uint16_t blockNumUniqueColors = 1;
3010 for (int i = 1; i < sectorNumOpaqueMultipliers; i++)
3011 {
3012 uint16_t color = ParallelMath::Extract(possibleColors[i], block);
3013 if (color != ParallelMath::Extract(possibleColors[blockNumUniqueColors - 1], block))
3014 ParallelMath::PutUInt15(possibleColors[blockNumUniqueColors++], block, color);
3015 }
3016
3017 ParallelMath::PutUInt15(numUniqueColors, block, blockNumUniqueColors);
3018 }
3019
3020 int maxUniqueColors = ParallelMath::Extract(numUniqueColors, 0);
3021 for (int block = 1; block < ParallelMath::ParallelSize; block++)
3022 maxUniqueColors = std::max<int>(maxUniqueColors, ParallelMath::Extract(numUniqueColors, block));
3023
3024 for (int block = 0; block < ParallelMath::ParallelSize; block++)
3025 {
3026 uint16_t fillColor = ParallelMath::Extract(possibleColors[0], block);
3027 for (int i = ParallelMath::Extract(numUniqueColors, block); i < maxUniqueColors; i++)
3028 ParallelMath::PutUInt15(possibleColors[i], block, fillColor);
3029 }
3030
3031 for (int i = 0; i < maxUniqueColors; i++)
3032 {
3033 MFloat error = ParallelMath::MakeFloatZero();
3034 MUInt16 selectors = ParallelMath::MakeUInt16(0);
3035 MUInt15 quantized = possibleColors[i];
3036 TestHalfBlockPunchthrough(error, selectors, quantized, sectorPixels[flip][sector], sectorPreWeightedPixels[flip][sector], sectorTransparent[flip][sector], modifiers[table], options);
3037
3038 ParallelMath::Int16CompFlag isInBounds = ParallelMath::Less(ParallelMath::MakeUInt15(i), numUniqueColors);
3039
3040 MUInt15 storageIndexes = drs.diffNumAttempts[sector];
3041 drs.diffNumAttempts[sector] = drs.diffNumAttempts[sector] + ParallelMath::SelectOrZero(isInBounds, ParallelMath::MakeUInt15(1));
3042
3043 for (int block = 0; block < ParallelMath::ParallelSize; block++)
3044 {
3045 int storageIndex = ParallelMath::Extract(storageIndexes, block);
3046
3047 ParallelMath::PutFloat(drs.diffErrors[sector][storageIndex], block, ParallelMath::Extract(error, block));
3048 ParallelMath::PutUInt16(drs.diffSelectors[sector][storageIndex], block, ParallelMath::Extract(selectors, block));
3049 ParallelMath::PutUInt15(drs.diffColors[sector][storageIndex], block, ParallelMath::Extract(quantized, block));
3050 ParallelMath::PutUInt15(drs.diffTables[sector][storageIndex], block, table);
3051 }
3052 }
3053 }
3054 }
3055
3056 MUInt15 bestDDummy = ParallelMath::MakeUInt15(0);
3057 FindBestDifferentialCombination(flip, 1, canIgnoreSector, bestIsThisMode, bestTotalError, bestFlip, bestDDummy, bestColors, bestSelectors, bestTables, drs);
3058 }
3059
3060 for (int block = 0; block < ParallelMath::ParallelSize; block++)
3061 {
3062 if (!ParallelMath::Extract(bestIsThisMode, block))
3063 continue;
3064
3065 int blockBestColors[2][3];
3066 int blockBestTables[2];
3067 ParallelMath::ScalarUInt16 blockBestSelectors[2];
3068 for (int sector = 0; sector < 2; sector++)
3069 {
3070 int sectorColor = ParallelMath::Extract(bestColors[sector], block);
3071 for (int ch = 0; ch < 3; ch++)
3072 blockBestColors[sector][ch] = (sectorColor >> (ch * 5)) & 31;
3073
3074 blockBestTables[sector] = ParallelMath::Extract(bestTables[sector], block);
3075 blockBestSelectors[sector] = ParallelMath::Extract(bestSelectors[sector], block);
3076 }
3077
3078 EmitETC1Block(outputBuffer + block * 8, ParallelMath::Extract(bestFlip, block), 1, blockBestColors, blockBestTables, blockBestSelectors, true);
3079 }
3080}
3081
3082
3083cvtt::ETC1CompressionData *cvtt::Internal::ETCComputer::AllocETC1Data(cvtt::Kernels::allocFunc_t allocFunc, void *context)
3084{
3085 void *buffer = allocFunc(context, sizeof(cvtt::Internal::ETCComputer::ETC1CompressionDataInternal));
3086 if (!buffer)
3087 return NULL;
3088 new (buffer) cvtt::Internal::ETCComputer::ETC1CompressionDataInternal(context);
3089 return static_cast<ETC1CompressionData*>(buffer);
3090}
3091
3092void cvtt::Internal::ETCComputer::ReleaseETC1Data(ETC1CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc)
3093{
3094 cvtt::Internal::ETCComputer::ETC1CompressionDataInternal* internalData = static_cast<cvtt::Internal::ETCComputer::ETC1CompressionDataInternal*>(compressionData);
3095 void *context = internalData->m_context;
3096 internalData->~ETC1CompressionDataInternal();
3097 freeFunc(context, compressionData, sizeof(cvtt::Internal::ETCComputer::ETC1CompressionDataInternal));
3098}
3099
3100cvtt::ETC2CompressionData *cvtt::Internal::ETCComputer::AllocETC2Data(cvtt::Kernels::allocFunc_t allocFunc, void *context, const cvtt::Options &options)
3101{
3102 void *buffer = allocFunc(context, sizeof(cvtt::Internal::ETCComputer::ETC2CompressionDataInternal));
3103 if (!buffer)
3104 return NULL;
3105 new (buffer) cvtt::Internal::ETCComputer::ETC2CompressionDataInternal(context, options);
3106 return static_cast<ETC2CompressionData*>(buffer);
3107}
3108
3109void cvtt::Internal::ETCComputer::ReleaseETC2Data(ETC2CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc)
3110{
3111 cvtt::Internal::ETCComputer::ETC2CompressionDataInternal* internalData = static_cast<cvtt::Internal::ETCComputer::ETC2CompressionDataInternal*>(compressionData);
3112 void *context = internalData->m_context;
3113 internalData->~ETC2CompressionDataInternal();
3114 freeFunc(context, compressionData, sizeof(cvtt::Internal::ETCComputer::ETC2CompressionDataInternal));
3115}
3116
3117cvtt::Internal::ETCComputer::ETC2CompressionDataInternal::ETC2CompressionDataInternal(void *context, const cvtt::Options &options)
3118 : m_context(context)
3119{
3120 const float cd[3] = { options.redWeight, options.greenWeight, options.blueWeight };
3121 const float rotCD[3] = { cd[1], cd[2], cd[0] };
3122
3123 const float offs = -(rotCD[0] * cd[0] + rotCD[1] * cd[1] + rotCD[2] * cd[2]) / (cd[0] * cd[0] + cd[1] * cd[1] + cd[2] * cd[2]);
3124
3125 const float chromaAxis0[3] = { rotCD[0] + cd[0] * offs, rotCD[1] + cd[1] * offs, rotCD[2] + cd[2] * offs };
3126
3127 const float chromaAxis1Unnormalized[3] =
3128 {
3129 chromaAxis0[1] * cd[2] - chromaAxis0[2] * cd[1],
3130 chromaAxis0[2] * cd[0] - chromaAxis0[0] * cd[2],
3131 chromaAxis0[0] * cd[1] - chromaAxis0[1] * cd[0]
3132 };
3133
3134 const float ca0LengthSq = (chromaAxis0[0] * chromaAxis0[0] + chromaAxis0[1] * chromaAxis0[1] + chromaAxis0[2] * chromaAxis0[2]);
3135 const float ca1UNLengthSq = (chromaAxis1Unnormalized[0] * chromaAxis1Unnormalized[0] + chromaAxis1Unnormalized[1] * chromaAxis1Unnormalized[1] + chromaAxis1Unnormalized[2] * chromaAxis1Unnormalized[2]);
3136 const float lengthRatio = static_cast<float>(std::sqrt(ca0LengthSq / ca1UNLengthSq));
3137
3138 const float chromaAxis1[3] = { chromaAxis1Unnormalized[0] * lengthRatio, chromaAxis1Unnormalized[1] * lengthRatio, chromaAxis1Unnormalized[2] * lengthRatio };
3139
3140 for (int i = 0; i < 3; i++)
3141 {
3142 m_chromaSideAxis0[i] = chromaAxis0[i];
3143 m_chromaSideAxis1[i] = chromaAxis1[i];
3144 }
3145}
3146
3147#endif
3148