ConvectionKernels_ETC.cpp source code [Godot/thirdparty/cvtt/ConvectionKernels_ETC.cpp]

1	/*
2	Convection Texture Tools
3	Copyright (c) 2018-2019 Eric Lasota
4
5	Permission is hereby granted, free of charge, to any person obtaining
6	a copy of this software and associated documentation files (the
7	"Software"), to deal in the Software without restriction, including
8	without limitation the rights to use, copy, modify, merge, publish,
9	distribute, sublicense, and/or sell copies of the Software, and to
10	permit persons to whom the Software is furnished to do so, subject
11	to the following conditions:
12
13	The above copyright notice and this permission notice shall be included
14	in all copies or substantial portions of the Software.
15
16	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24	-------------------------------------------------------------------------------------
25
26	Portions based on DirectX Texture Library (DirectXTex)
27
28	Copyright (c) Microsoft Corporation. All rights reserved.
29	Licensed under the MIT License.
30
31	http://go.microsoft.com/fwlink/?LinkId=248926
32	*/
33	#include "ConvectionKernels_Config.h"
34
35	#if !defined(CVTT_SINGLE_FILE) \|\| defined(CVTT_SINGLE_FILE_IMPL)
36
37	#include "ConvectionKernels.h"
38	#include "ConvectionKernels_ETC.h"
39	#include "ConvectionKernels_ETC1.h"
40	#include "ConvectionKernels_ETC2.h"
41	#include "ConvectionKernels_ETC2_Rounding.h"
42	#include "ConvectionKernels_ParallelMath.h"
43	#include "ConvectionKernels_FakeBT709_Rounding.h"
44
45	#include <cmath>
46
47	const int cvtt::Internal::ETCComputer::g_flipTables[`2`][`2`][`8`] =
48	{
49	{
50	{ `0`, `1`, `4`, `5`, `8`, `9`, `12`, `13` },
51	{ `2`, `3`, `6`, `7`, `10`, `11`, `14`, `15` }
52	},
53	{
54	{ `0`, `1`, `2`, `3`, `4`, `5`, `6`, `7` },
55	{ `8`, `9`, `10`, `11`, `12`, `13`, `14`, `15` }
56	},
57	};
58
59	cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorUniform(const MUInt15 pixelA[`3`], const MUInt15 pixelB[`3`])
60	{
61	MSInt16 d0 = ParallelMath::LosslessCast<MSInt16>::Cast(pixelA[`0`]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixelB[`0`]);
62	MFloat fd0 = ParallelMath::ToFloat(d0);
63	MFloat error = fd0 * fd0;
64	for (int ch = `1`; ch < `3`; ch++)
65	{
66	MSInt16 d = ParallelMath::LosslessCast<MSInt16>::Cast(pixelA[ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixelB[ch]);
67	MFloat fd = ParallelMath::ToFloat(d);
68	error = error + fd * fd;
69	}
70	return error;
71	}
72
73	cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorWeighted(const MUInt15 reconstructed[`3`], const MFloat preWeightedPixel[`3`], const Options options)
74	{
75	MFloat dr = ParallelMath::ToFloat(reconstructed[`0`]) * options.redWeight - preWeightedPixel[`0`];
76	MFloat dg = ParallelMath::ToFloat(reconstructed[`1`]) * options.greenWeight - preWeightedPixel[`1`];
77	MFloat db = ParallelMath::ToFloat(reconstructed[`2`]) * options.blueWeight - preWeightedPixel[`2`];
78
79	return dr * dr + dg * dg + db * db;
80	}
81
82	cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorFakeBT709(const MUInt15 reconstructed[`3`], const MFloat preWeightedPixel[`3`])
83	{
84	MFloat yuv[`3`];
85	ConvertToFakeBT709(yuv, reconstructed);
86
87	MFloat dy = yuv[`0`] - preWeightedPixel[`0`];
88	MFloat du = yuv[`1`] - preWeightedPixel[`1`];
89	MFloat dv = yuv[`2`] - preWeightedPixel[`2`];
90
91	return dy * dy + du * du + dv * dv;
92	}
93
94	void cvtt::Internal::ETCComputer::TestHalfBlock(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[`8`][`3`], const MFloat preWeightedPixels[`8`][`3`], const MSInt16 modifiers[`4`], bool isDifferential, const Options &options)
95	{
96	MUInt15 quantized[`3`];
97	MUInt15 unquantized[`3`];
98
99	for (int ch = `0`; ch < `3`; ch++)
100	{
101	quantized[ch] = (ParallelMath::RightShift(quantizedPackedColor, (ch * `5`)) & ParallelMath::MakeUInt15(`31`));
102
103	if (isDifferential)
104	unquantized[ch] = (quantized[ch] << `3`) \| ParallelMath::RightShift(quantized[ch], `2`);
105	else
106	unquantized[ch] = (quantized[ch] << `4`) \| quantized[ch];
107	}
108
109	MUInt16 selectors = ParallelMath::MakeUInt16(`0`);
110	MFloat totalError = ParallelMath::MakeFloatZero();
111
112	MUInt15 u15_255 = ParallelMath::MakeUInt15(`255`);
113	MSInt16 s16_zero = ParallelMath::MakeSInt16(`0`);
114
115	MUInt15 unquantizedModified[`4`][`3`];
116	for (unsigned int s = `0`; s < `4`; s++)
117	for (int ch = `0`; ch < `3`; ch++)
118	unquantizedModified[s][ch] = ParallelMath::Min(ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::ToSInt16(unquantized[ch]) + modifiers[s], s16_zero)), u15_255);
119
120	bool isUniform = ((options.flags & cvtt::Flags::Uniform) != `0`);
121	bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != `0`);
122
123	for (int px = `0`; px < `8`; px++)
124	{
125	MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
126	MUInt16 bestSelector = ParallelMath::MakeUInt16(`0`);
127
128	for (unsigned int s = `0`; s < `4`; s++)
129	{
130	MFloat error;
131	if (isFakeBT709)
132	error = ComputeErrorFakeBT709(unquantizedModified[s], preWeightedPixels[px]);
133	else if (isUniform)
134	error = ComputeErrorUniform(pixels[px], unquantizedModified[s]);
135	else
136	error = ComputeErrorWeighted(unquantizedModified[s], preWeightedPixels[px], options);
137
138	ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
139	bestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt16(s), bestSelector);
140	bestError = ParallelMath::Min(error, bestError);
141	}
142
143	totalError = totalError + bestError;
144	selectors = selectors \| (bestSelector << (px * `2`));
145	}
146
147	outError = totalError;
148	outSelectors = selectors;
149	}
150
151	void cvtt::Internal::ETCComputer::TestHalfBlockPunchthrough(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[`8`][`3`], const MFloat preWeightedPixels[`8`][`3`], const ParallelMath::Int16CompFlag isTransparent[`8`], const MUInt15 modifier, const Options &options)
152	{
153	MUInt15 quantized[`3`];
154	MUInt15 unquantized[`3`];
155
156	for (int ch = `0`; ch < `3`; ch++)
157	{
158	quantized[ch] = (ParallelMath::RightShift(quantizedPackedColor, (ch * `5`)) & ParallelMath::MakeUInt15(`31`));
159	unquantized[ch] = (quantized[ch] << `3`) \| ParallelMath::RightShift(quantized[ch], `2`);
160	}
161
162	MUInt16 selectors = ParallelMath::MakeUInt16(`0`);
163	MFloat totalError = ParallelMath::MakeFloatZero();
164
165	MUInt15 u15_255 = ParallelMath::MakeUInt15(`255`);
166	MSInt16 s16_zero = ParallelMath::MakeSInt16(`0`);
167
168	MUInt15 unquantizedModified[`3`][`3`];
169	for (int ch = `0`; ch < `3`; ch++)
170	{
171	unquantizedModified[`0`][ch] = ParallelMath::Max(unquantized[ch], modifier) - modifier;
172	unquantizedModified[`1`][ch] = unquantized[ch];
173	unquantizedModified[`2`][ch] = ParallelMath::Min(unquantized[ch] + modifier, u15_255);
174	}
175
176	bool isUniform = ((options.flags & cvtt::Flags::Uniform) != `0`);
177	bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != `0`);
178
179	for (int px = `0`; px < `8`; px++)
180	{
181	ParallelMath::FloatCompFlag isTransparentFloat = ParallelMath::Int16FlagToFloat(isTransparent[px]);
182
183	MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
184	MUInt15 bestSelector = ParallelMath::MakeUInt15(`0`);
185
186	for (unsigned int s = `0`; s < `3`; s++)
187	{
188	MFloat error;
189	if (isFakeBT709)
190	error = ComputeErrorFakeBT709(unquantizedModified[s], preWeightedPixels[px]);
191	else if (isUniform)
192	error = ComputeErrorUniform(pixels[px], unquantizedModified[s]);
193	else
194	error = ComputeErrorWeighted(unquantizedModified[s], preWeightedPixels[px], options);
195
196	ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
197	bestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(s), bestSelector);
198	bestError = ParallelMath::Min(error, bestError);
199	}
200
201	// Annoying quirk: The ETC encoding machinery assumes that selectors are in the table order in the spec, which isn't
202	// the same as their encoding bits, so the transparent index is actually 1 and the valid indexes are 0, 2, and 3.
203
204	// Remap selector 1 to 2, and 2 to 3
205	bestSelector = ParallelMath::Min(ParallelMath::MakeUInt15(`3`), bestSelector << `1`);
206
207	// Mark zero transparent as
208	ParallelMath::ConditionalSet(bestError, isTransparentFloat, ParallelMath::MakeFloatZero());
209	ParallelMath::ConditionalSet(bestSelector, isTransparent[px], ParallelMath::MakeUInt15(`1`));
210
211	totalError = totalError + bestError;
212	selectors = selectors \| (ParallelMath::LosslessCast<MUInt16>::Cast(bestSelector) << (px * `2`));
213	}
214
215	outError = totalError;
216	outSelectors = selectors;
217	}
218
219	void cvtt::Internal::ETCComputer::FindBestDifferentialCombination(int flip, int d, const ParallelMath::Int16CompFlag canIgnoreSector[`2`], ParallelMath::Int16CompFlag& bestIsThisMode, MFloat& bestTotalError, MUInt15& bestFlip, MUInt15& bestD, MUInt15 bestColors[`2`], MUInt16 bestSelectors[`2`], MUInt15 bestTables[`2`], DifferentialResolveStorage &drs)
220	{
221	// We do this part scalar because most of the cost benefit of parallelization is in error evaluation,
222	// and this code has a LOT of early-outs and disjointed index lookups that vary heavily between blocks
223	// and save a lot of time.
224	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
225	{
226	bool canIgnore[`2`] = { ParallelMath::Extract(canIgnoreSector[`0`], block), ParallelMath::Extract(canIgnoreSector[`1`], block) };
227	bool canIgnoreEither = canIgnore[`0`] \|\| canIgnore[`1`];
228	float blockBestTotalError = ParallelMath::Extract(bestTotalError, block);
229	float bestDiffErrors[`2`] = { FLT_MAX, FLT_MAX };
230	uint16_t bestDiffSelectors[`2`] = { `0`, `0` };
231	uint16_t bestDiffColors[`2`] = { `0`, `0` };
232	uint16_t bestDiffTables[`2`] = { `0`, `0` };
233	for (int sector = `0`; sector < `2`; sector++)
234	{
235	unsigned int sectorNumAttempts = ParallelMath::Extract(drs.diffNumAttempts[sector], block);
236	for (unsigned int i = `0`; i < sectorNumAttempts; i++)
237	{
238	float error = ParallelMath::Extract(drs.diffErrors[sector][i], block);
239	if (error < bestDiffErrors[sector])
240	{
241	bestDiffErrors[sector] = error;
242	bestDiffSelectors[sector] = ParallelMath::Extract(drs.diffSelectors[sector][i], block);
243	bestDiffColors[sector] = ParallelMath::Extract(drs.diffColors[sector][i], block);
244	bestDiffTables[sector] = ParallelMath::Extract(drs.diffTables[sector][i], block);
245	}
246	}
247	}
248
249	if (canIgnore[`0`])
250	bestDiffColors[`0`] = bestDiffColors[`1`];
251	else if (canIgnore[`1`])
252	bestDiffColors[`1`] = bestDiffColors[`0`];
253
254	// The best differential possibilities must be better than the best total error
255	if (bestDiffErrors[`0`] + bestDiffErrors[`1`] < blockBestTotalError)
256	{
257	// Fast path if the best possible case is legal
258	if (canIgnoreEither \|\| ETCDifferentialIsLegalScalar(bestDiffColors[`0`], bestDiffColors[`1`]))
259	{
260	ParallelMath::PutBoolInt16(bestIsThisMode, block, true);
261	ParallelMath::PutFloat(bestTotalError, block, bestDiffErrors[`0`] + bestDiffErrors[`1`]);
262	ParallelMath::PutUInt15(bestFlip, block, flip);
263	ParallelMath::PutUInt15(bestD, block, d);
264	for (int sector = `0`; sector < `2`; sector++)
265	{
266	ParallelMath::PutUInt15(bestColors[sector], block, bestDiffColors[sector]);
267	ParallelMath::PutUInt16(bestSelectors[sector], block, bestDiffSelectors[sector]);
268	ParallelMath::PutUInt15(bestTables[sector], block, bestDiffTables[sector]);
269	}
270	}
271	else
272	{
273	// Slow path: Sort the possible cases by quality, and search valid combinations
274	// TODO: Pre-flatten the error lists so this is nicer to cache
275	unsigned int numSortIndexes[`2`] = { `0`, `0` };
276	for (int sector = `0`; sector < `2`; sector++)
277	{
278	unsigned int sectorNumAttempts = ParallelMath::Extract(drs.diffNumAttempts[sector], block);
279
280	for (unsigned int i = `0`; i < sectorNumAttempts; i++)
281	{
282	if (ParallelMath::Extract(drs.diffErrors[sector][i], block) < blockBestTotalError)
283	drs.attemptSortIndexes[sector][numSortIndexes[sector]++] = i;
284	}
285
286	struct SortPredicate
287	{
288	const MFloat *diffErrors;
289	int block;
290
291	bool operator()(uint16_t a, uint16_t b) const
292	{
293	float errorA = ParallelMath::Extract(diffErrors[a], block);
294	float errorB = ParallelMath::Extract(diffErrors[b], block);
295
296	if (errorA < errorB)
297	return true;
298	if (errorA > errorB)
299	return false;
300
301	return a < b;
302	}
303	};
304
305	SortPredicate sp;
306	sp.diffErrors = drs.diffErrors[sector];
307	sp.block = block;
308
309	std::sort<uint16_t, const* SortPredicate&>(drs.attemptSortIndexes[sector], drs.attemptSortIndexes[sector] + numSortIndexes[sector], sp);
310	}
311
312	int scannedElements = `0`;
313	for (unsigned int i = `0`; i < numSortIndexes[`0`]; i++)
314	{
315	unsigned int attemptIndex0 = drs.attemptSortIndexes[`0`][i];
316	float error0 = ParallelMath::Extract(drs.diffErrors[`0`][attemptIndex0], block);
317
318	scannedElements++;
319
320	if (error0 >= blockBestTotalError)
321	break;
322
323	float maxError1 = ParallelMath::Extract(bestTotalError, block) - error0;
324	uint16_t diffColor0 = ParallelMath::Extract(drs.diffColors[`0`][attemptIndex0], block);
325
326	if (maxError1 < bestDiffErrors[`1`])
327	break;
328
329	for (unsigned int j = `0`; j < numSortIndexes[`1`]; j++)
330	{
331	unsigned int attemptIndex1 = drs.attemptSortIndexes[`1`][j];
332	float error1 = ParallelMath::Extract(drs.diffErrors[`1`][attemptIndex1], block);
333
334	scannedElements++;
335
336	if (error1 >= maxError1)
337	break;
338
339	uint16_t diffColor1 = ParallelMath::Extract(drs.diffColors[`1`][attemptIndex1], block);
340
341	if (ETCDifferentialIsLegalScalar(diffColor0, diffColor1))
342	{
343	blockBestTotalError = error0 + error1;
344
345	ParallelMath::PutBoolInt16(bestIsThisMode, block, true);
346	ParallelMath::PutFloat(bestTotalError, block, blockBestTotalError);
347	ParallelMath::PutUInt15(bestFlip, block, flip);
348	ParallelMath::PutUInt15(bestD, block, d);
349	ParallelMath::PutUInt15(bestColors[`0`], block, diffColor0);
350	ParallelMath::PutUInt15(bestColors[`1`], block, diffColor1);
351	ParallelMath::PutUInt16(bestSelectors[`0`], block, ParallelMath::Extract(drs.diffSelectors[`0`][attemptIndex0], block));
352	ParallelMath::PutUInt16(bestSelectors[`1`], block, ParallelMath::Extract(drs.diffSelectors[`1`][attemptIndex1], block));
353	ParallelMath::PutUInt15(bestTables[`0`], block, ParallelMath::Extract(drs.diffTables[`0`][attemptIndex0], block));
354	ParallelMath::PutUInt15(bestTables[`1`], block, ParallelMath::Extract(drs.diffTables[`1`][attemptIndex1], block));
355	break;
356	}
357	}
358	}
359	}
360	}
361	}
362	}
363
364	cvtt::ParallelMath::Int16CompFlag cvtt::Internal::ETCComputer::ETCDifferentialIsLegalForChannel(const MUInt15 &a, const MUInt15 &b)
365	{
366	MSInt16 diff = ParallelMath::LosslessCast<MSInt16>::Cast(b) - ParallelMath::LosslessCast<MSInt16>::Cast(a);
367
368	return ParallelMath::Less(ParallelMath::MakeSInt16(-`5`), diff) & ParallelMath::Less(diff, ParallelMath::MakeSInt16(`4`));
369	}
370
371	cvtt::ParallelMath::Int16CompFlag cvtt::Internal::ETCComputer::ETCDifferentialIsLegal(const MUInt15 &a, const MUInt15 &b)
372	{
373	MUInt15 mask = ParallelMath::MakeUInt15(`31`);
374
375	return ETCDifferentialIsLegalForChannel(ParallelMath::RightShift(a, `10`), ParallelMath::RightShift(b, `10`))
376	& ETCDifferentialIsLegalForChannel(ParallelMath::RightShift(a, `5`) & mask, ParallelMath::RightShift(b, `5`) & mask)
377	& ETCDifferentialIsLegalForChannel(a & mask, b & mask);
378	}
379
380	bool cvtt::Internal::ETCComputer::ETCDifferentialIsLegalForChannelScalar(const uint16_t &a, const uint16_t &b)
381	{
382	int16_t diff = static_cast<int16_t>(b) - static_cast<int16_t>(a);
383
384	return (-`4` <= diff) && (diff <= `3`);
385	}
386
387	bool cvtt::Internal::ETCComputer::ETCDifferentialIsLegalScalar(const uint16_t &a, const uint16_t &b)
388	{
389	MUInt15 mask = ParallelMath::MakeUInt15(`31`);
390
391	return ETCDifferentialIsLegalForChannelScalar((a >> `10`), (b >> `10`))
392	& ETCDifferentialIsLegalForChannelScalar((a >> `5`) & `31`, (b >> `5`) & `31`)
393	& ETCDifferentialIsLegalForChannelScalar(a & `31`, b & `31`);
394	}
395
396	void cvtt::Internal::ETCComputer::EncodeTMode(uint8_t outputBuffer, MFloat &bestError, const* ParallelMath::Int16CompFlag isIsolated[`16`], const MUInt15 pixels[`16`][`3`], const MFloat preWeightedPixels[`16`][`3`], const Options &options)
397	{
398	bool isUniform = ((options.flags & cvtt::Flags::Uniform) != `0`);
399	bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != `0`);
400
401	ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
402
403	MUInt15 isolatedTotal[`3`] = { ParallelMath::MakeUInt15(`0`), ParallelMath::MakeUInt15(`0`), ParallelMath::MakeUInt15(`0`) };
404	MUInt15 lineTotal[`3`] = { ParallelMath::MakeUInt15(`0`), ParallelMath::MakeUInt15(`0`), ParallelMath::MakeUInt15(`0`) };
405
406	MUInt15 numPixelsIsolated = ParallelMath::MakeUInt15(`0`);
407
408	// To speed this up, we compute line total as the sum, then subtract out isolated
409	for (unsigned int px = `0`; px < `16`; px++)
410	{
411	for (int ch = `0`; ch < `3`; ch++)
412	{
413	isolatedTotal[ch] = isolatedTotal[ch] + ParallelMath::SelectOrZero(isIsolated[px], pixels[px][ch]);
414	lineTotal[ch] = lineTotal[ch] + pixels[px][ch];
415	}
416	numPixelsIsolated = numPixelsIsolated + ParallelMath::SelectOrZero(isIsolated[px], ParallelMath::MakeUInt15(`1`));
417	}
418
419	for (int ch = `0`; ch < `3`; ch++)
420	lineTotal[ch] = lineTotal[ch] - isolatedTotal[ch];
421
422	MUInt15 numPixelsLine = ParallelMath::MakeUInt15(`16`) - numPixelsIsolated;
423
424	MUInt15 isolatedAverageQuantized[`3`];
425	MUInt15 isolatedAverageTargets[`3`];
426	{
427	int divisors[ParallelMath::ParallelSize];
428	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
429	divisors[block] = ParallelMath::Extract(numPixelsIsolated, block) * `34`;
430
431	MUInt15 addend = (numPixelsIsolated << `4`) \| numPixelsIsolated;
432	for (int ch = `0`; ch < `3`; ch++)
433	{
434	// isolatedAverageQuantized[ch] = (isolatedTotal[ch] 2 + numPixelsIsolated * 17) / (numPixelsIsolated * 34);*
435
436	MUInt15 numerator = isolatedTotal[ch] + isolatedTotal[ch];
437	if (!isFakeBT709)
438	numerator = numerator + addend;
439
440	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
441	{
442	int divisor = divisors[block];
443	if (divisor == `0`)
444	ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, `0`);
445	else
446	ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, ParallelMath::Extract(numerator, block) / divisor);
447	}
448
449	isolatedAverageTargets[ch] = numerator;
450	}
451	}
452
453	if (isFakeBT709)
454	ResolveTHFakeBT709Rounding(isolatedAverageQuantized, isolatedAverageTargets, numPixelsIsolated);
455
456	MUInt15 isolatedColor[`3`];
457	for (int ch = `0`; ch < `3`; ch++)
458	isolatedColor[ch] = (isolatedAverageQuantized[ch]) \| (isolatedAverageQuantized[ch] << `4`);
459
460	MFloat isolatedError[`16`];
461	for (int px = `0`; px < `16`; px++)
462	{
463	if (isFakeBT709)
464	isolatedError[px] = ComputeErrorFakeBT709(isolatedColor, preWeightedPixels[px]);
465	else if (isUniform)
466	isolatedError[px] = ComputeErrorUniform(pixels[px], isolatedColor);
467	else
468	isolatedError[px] = ComputeErrorWeighted(isolatedColor, preWeightedPixels[px], options);
469	}
470
471	MSInt32 bestSelectors = ParallelMath::MakeSInt32(`0`);
472	MUInt15 bestTable = ParallelMath::MakeUInt15(`0`);
473	MUInt15 bestLineColor = ParallelMath::MakeUInt15(`0`);
474
475	MSInt16 maxLine = ParallelMath::LosslessCast<MSInt16>::Cast(numPixelsLine);
476	MSInt16 minLine = ParallelMath::MakeSInt16(`0`) - maxLine;
477
478	int16_t clusterMaxLine = `0`;
479	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
480	{
481	int16_t blockMaxLine = ParallelMath::Extract(maxLine, block);
482	if (blockMaxLine > clusterMaxLine)
483	clusterMaxLine = blockMaxLine;
484	}
485
486	int16_t clusterMinLine = -clusterMaxLine;
487
488	int lineDivisors[ParallelMath::ParallelSize];
489	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
490	lineDivisors[block] = ParallelMath::Extract(numPixelsLine, block) * `34`;
491
492	MUInt15 lineAddend = (numPixelsLine << `4`) \| numPixelsLine;
493
494	for (int table = `0`; table < `8`; table++)
495	{
496	int numUniqueColors[ParallelMath::ParallelSize];
497	MUInt15 uniqueQuantizedColors[`31`];
498
499	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
500	numUniqueColors[block] = `0`;
501
502	MUInt15 modifier = ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]);
503	MUInt15 modifierOffset = (modifier + modifier);
504
505	for (int16_t offsetPremultiplier = clusterMinLine; offsetPremultiplier <= clusterMaxLine; offsetPremultiplier++)
506	{
507	MSInt16 clampedOffsetPremultiplier = ParallelMath::Max(minLine, ParallelMath::Min(maxLine, ParallelMath::MakeSInt16(offsetPremultiplier)));
508	MSInt16 modifierAddend = ParallelMath::CompactMultiply(clampedOffsetPremultiplier, modifierOffset);
509
510	MUInt15 quantized[`3`];
511	if (isFakeBT709)
512	{
513	MUInt15 targets[`3`];
514	for (int ch = `0`; ch < `3`; ch++)
515	{
516	//quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] 2 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));*
517	MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(`0`), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch]) + modifierAddend));
518	MUInt15 divided = ParallelMath::MakeUInt15(`0`);
519	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
520	{
521	int divisor = lineDivisors[block];
522	if (divisor == `0`)
523	ParallelMath::PutUInt15(divided, block, `0`);
524	else
525	ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
526	}
527	quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(`15`), divided);
528	targets[ch] = numerator;
529	}
530
531	ResolveTHFakeBT709Rounding(quantized, targets, numPixelsLine);
532	}
533	else
534	{
535	for (int ch = `0`; ch < `3`; ch++)
536	{
537	//quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] 2 + numDAIILine * 17 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));*
538	MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(`0`), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch] + lineAddend) + modifierAddend));
539	MUInt15 divided = ParallelMath::MakeUInt15(`0`);
540	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
541	{
542	int divisor = lineDivisors[block];
543	if (divisor == `0`)
544	ParallelMath::PutUInt15(divided, block, `0`);
545	else
546	ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
547	}
548	quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(`15`), divided);
549	}
550	}
551
552	MUInt15 packedColor = quantized[`0`] \| (quantized[`1`] << `5`) \| (quantized[`2`] << `10`);
553
554	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
555	{
556	uint16_t blockPackedColor = ParallelMath::Extract(packedColor, block);
557	if (numUniqueColors[block] == `0` \|\| blockPackedColor != ParallelMath::Extract(uniqueQuantizedColors[numUniqueColors[block] - `1`], block))
558	ParallelMath::PutUInt15(uniqueQuantizedColors[numUniqueColors[block]++], block, blockPackedColor);
559	}
560	}
561
562	// Stripe unfilled unique colors
563	int maxUniqueColors = `0`;
564	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
565	{
566	if (numUniqueColors[block] > maxUniqueColors)
567	maxUniqueColors = numUniqueColors[block];
568	}
569
570	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
571	{
572	uint16_t fillColor = ParallelMath::Extract(uniqueQuantizedColors[`0`], block);
573
574	int numUnique = numUniqueColors[block];
575	for (int fill = numUnique + `1`; fill < maxUniqueColors; fill++)
576	ParallelMath::PutUInt15(uniqueQuantizedColors[fill], block, fillColor);
577	}
578
579	for (int ci = `0`; ci < maxUniqueColors; ci++)
580	{
581	MUInt15 lineColors[`3`][`3`];
582	for (int ch = `0`; ch < `3`; ch++)
583	{
584	MUInt15 quantizedChannel = (ParallelMath::RightShift(uniqueQuantizedColors[ci], (ch * `5`)) & ParallelMath::MakeUInt15(`15`));
585
586	MUInt15 unquantizedColor = (quantizedChannel << `4`) \| quantizedChannel;
587	lineColors[`0`][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(`255`), unquantizedColor + modifier);
588	lineColors[`1`][ch] = unquantizedColor;
589	lineColors[`2`][ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(`0`), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
590	}
591
592	MSInt32 selectors = ParallelMath::MakeSInt32(`0`);
593	MFloat error = ParallelMath::MakeFloatZero();
594	for (int px = `0`; px < `16`; px++)
595	{
596	MFloat pixelError = isolatedError[px];
597
598	MUInt15 pixelBestSelector = ParallelMath::MakeUInt15(`0`);
599	for (int i = `0`; i < `3`; i++)
600	{
601	MFloat error = isUniform ? ComputeErrorUniform(lineColors[i], pixels[px]) : ComputeErrorWeighted(lineColors[i], preWeightedPixels[px], options);
602	ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, pixelError);
603	pixelError = ParallelMath::Min(error, pixelError);
604	pixelBestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(i + `1`), pixelBestSelector);
605	}
606
607	error = error + pixelError;
608	selectors = selectors \| (ParallelMath::ToInt32(pixelBestSelector) << (px * `2`));
609	}
610
611	ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
612	bestError = ParallelMath::Min(error, bestError);
613
614	if (ParallelMath::AnySet(errorBetter))
615	{
616	ParallelMath::ConditionalSet(bestLineColor, errorBetter, uniqueQuantizedColors[ci]);
617	ParallelMath::ConditionalSet(bestSelectors, errorBetter, selectors);
618	ParallelMath::ConditionalSet(bestTable, errorBetter, ParallelMath::MakeUInt15(table));
619	bestIsThisMode = bestIsThisMode \| errorBetter;
620	}
621	}
622	}
623
624	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
625	{
626	if (ParallelMath::Extract(bestIsThisMode, block))
627	{
628	uint32_t lowBits = `0`;
629	uint32_t highBits = `0`;
630
631	uint16_t blockBestLineColor = ParallelMath::Extract(bestLineColor, block);
632	ParallelMath::ScalarUInt16 blockIsolatedAverageQuantized[`3`];
633
634	for (int ch = `0`; ch < `3`; ch++)
635	blockIsolatedAverageQuantized[ch] = ParallelMath::Extract(isolatedAverageQuantized[ch], block);
636
637	uint16_t blockBestTable = ParallelMath::Extract(bestTable, block);
638	int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);
639
640	ParallelMath::ScalarUInt16 lineColor[`3`];
641	for (int ch = `0`; ch < `3`; ch++)
642	lineColor[ch] = (blockBestLineColor >> (ch * `5`)) & `15`;
643
644	EmitTModeBlock(outputBuffer + block * `8`, lineColor, blockIsolatedAverageQuantized, blockBestSelectors, blockBestTable, true);
645	}
646	}
647	}
648
649	void cvtt::Internal::ETCComputer::EncodeHMode(uint8_t outputBuffer, MFloat &bestError, const* ParallelMath::Int16CompFlag groupings[`16`], const MUInt15 pixels[`16`][`3`], HModeEval &he, const MFloat preWeightedPixels[`16`][`3`], const Options &options)
650	{
651	bool isUniform = ((options.flags & cvtt::Flags::Uniform) != `0`);
652	bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != `0`);
653
654	MUInt15 zero15 = ParallelMath::MakeUInt15(`0`);
655
656	MUInt15 counts[`2`] = { zero15, zero15 };
657
658	ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
659
660	MUInt15 totals[`2`][`3`] =
661	{
662	{ zero15, zero15, zero15 },
663	{ zero15, zero15, zero15 }
664	};
665
666	for (unsigned int px = `0`; px < `16`; px++)
667	{
668	for (int ch = `0`; ch < `3`; ch++)
669	{
670	totals[`0`][ch] = totals[`0`][ch] + pixels[px][ch];
671	totals[`1`][ch] = totals[`1`][ch] + ParallelMath::SelectOrZero(groupings[px], pixels[px][ch]);
672	}
673	counts[`1`] = counts[`1`] + ParallelMath::SelectOrZero(groupings[px], ParallelMath::MakeUInt15(`1`));
674	}
675
676	for (int ch = `0`; ch < `3`; ch++)
677	totals[`0`][ch] = totals[`0`][ch] - totals[`1`][ch];
678	counts[`0`] = ParallelMath::MakeUInt15(`16`) - counts[`1`];
679
680	MUInt16 bestSectorBits = ParallelMath::MakeUInt16(`0`);
681	MUInt16 bestSignBits = ParallelMath::MakeUInt16(`0`);
682	MUInt15 bestColors[`2`] = { zero15, zero15 };
683	MUInt15 bestTable = ParallelMath::MakeUInt15(`0`);
684
685	for (int table = `0`; table < `8`; table++)
686	{
687	MUInt15 numUniqueColors = zero15;
688
689	int modifier = cvtt::Tables::ETC1::g_thModifierTable[table];
690
691	for (int sector = `0`; sector < `2`; sector++)
692	{
693	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
694	{
695	int blockNumUniqueColors = `0`;
696	uint16_t blockUniqueQuantizedColors[`31`];
697
698	int maxOffsetMultiplier = ParallelMath::Extract(counts[sector], block);
699	int minOffsetMultiplier = -maxOffsetMultiplier;
700
701	int modifierOffset = modifier * `2`;
702
703	int blockSectorCounts = ParallelMath::Extract(counts[sector], block);
704	int blockSectorTotals[`3`];
705	for (int ch = `0`; ch < `3`; ch++)
706	blockSectorTotals[ch] = ParallelMath::Extract(totals[sector][ch], block);
707
708	for (int offsetPremultiplier = minOffsetMultiplier; offsetPremultiplier <= maxOffsetMultiplier; offsetPremultiplier++)
709	{
710	// TODO: This isn't ideal for FakeBT709
711	int16_t quantized[`3`];
712	for (int ch = `0`; ch < `3`; ch++)
713	{
714	if (blockSectorCounts == `0`)
715	quantized[ch] = `0`;
716	else
717	quantized[ch] = std::min<int16_t>(`15`, std::max<int16_t>(`0`, (blockSectorTotals[ch] * `2` + blockSectorCounts * `17` + modifierOffset * offsetPremultiplier)) / (blockSectorCounts * `34`));
718	}
719
720	uint16_t packedColor = (quantized[`0`] << `10`) \| (quantized[`1`] << `5`) \| quantized[`2`];
721	if (blockNumUniqueColors == `0` \|\| packedColor != blockUniqueQuantizedColors[blockNumUniqueColors - `1`])
722	{
723	assert(blockNumUniqueColors < `32`);
724	blockUniqueQuantizedColors[blockNumUniqueColors++] = packedColor;
725	}
726	}
727
728	ParallelMath::PutUInt15(he.numUniqueColors[sector], block, blockNumUniqueColors);
729
730	int baseIndex = `0`;
731	if (sector == `1`)
732	baseIndex = ParallelMath::Extract(he.numUniqueColors[`0`], block);
733
734	for (int i = `0`; i < blockNumUniqueColors; i++)
735	ParallelMath::PutUInt15(he.uniqueQuantizedColors[baseIndex + i], block, blockUniqueQuantizedColors[i]);
736	}
737	}
738
739	MUInt15 totalColors = he.numUniqueColors[`0`] + he.numUniqueColors[`1`];
740	int maxErrorColors = `0`;
741	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
742	maxErrorColors = std::max<int>(maxErrorColors, ParallelMath::Extract(totalColors, block));
743
744	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
745	{
746	int lastColor = ParallelMath::Extract(totalColors, block);
747	uint16_t stripeColor = ParallelMath::Extract(he.uniqueQuantizedColors[`0`], block);
748	for (int i = lastColor; i < maxErrorColors; i++)
749	ParallelMath::PutUInt15(he.uniqueQuantizedColors[i], block, stripeColor);
750	}
751
752	for (int ci = `0`; ci < maxErrorColors; ci++)
753	{
754	MUInt15 fifteen = ParallelMath::MakeUInt15(`15`);
755	MUInt15 twoFiftyFive = ParallelMath::MakeUInt15(`255`);
756	MSInt16 zeroS16 = ParallelMath::MakeSInt16(`0`);
757
758	MUInt15 colors[`2`][`3`];
759	for (int ch = `0`; ch < `3`; ch++)
760	{
761	MUInt15 quantizedChannel = ParallelMath::RightShift(he.uniqueQuantizedColors[ci], ((`2` - ch) * `5`)) & fifteen;
762
763	MUInt15 unquantizedColor = (quantizedChannel << `4`) \| quantizedChannel;
764	colors[`0`][ch] = ParallelMath::Min(twoFiftyFive, unquantizedColor + modifier);
765	colors[`1`][ch] = ParallelMath::ToUInt15(ParallelMath::Max(zeroS16, ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::MakeSInt16(modifier)));
766	}
767
768	MUInt16 signBits = ParallelMath::MakeUInt16(`0`);
769	for (int px = `0`; px < `16`; px++)
770	{
771	MFloat errors[`2`];
772	for (int i = `0`; i < `2`; i++)
773	{
774	if (isFakeBT709)
775	errors[i] = ComputeErrorFakeBT709(colors[i], preWeightedPixels[px]);
776	else if (isUniform)
777	errors[i] = ComputeErrorUniform(colors[i], pixels[px]);
778	else
779	errors[i] = ComputeErrorWeighted(colors[i], preWeightedPixels[px], options);
780	}
781
782	ParallelMath::Int16CompFlag errorOneLess = ParallelMath::FloatFlagToInt16(ParallelMath::Less(errors[`1`], errors[`0`]));
783	he.errors[ci][px] = ParallelMath::Min(errors[`0`], errors[`1`]);
784	signBits = signBits \| ParallelMath::SelectOrZero(errorOneLess, ParallelMath::MakeUInt16(`1` << px));
785	}
786	he.signBits[ci] = signBits;
787	}
788
789	int maxUniqueColorCombos = `0`;
790	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
791	{
792	int numUniqueColorCombos = ParallelMath::Extract(he.numUniqueColors[`0`], block) * ParallelMath::Extract(he.numUniqueColors[`1`], block);
793	if (numUniqueColorCombos > maxUniqueColorCombos)
794	maxUniqueColorCombos = numUniqueColorCombos;
795	}
796
797	MUInt15 indexes[`2`] = { zero15, zero15 };
798	MUInt15 maxIndex[`2`] = { he.numUniqueColors[`0`] - ParallelMath::MakeUInt15(`1`), he.numUniqueColors[`1`] - ParallelMath::MakeUInt15(`1`) };
799
800	int block1Starts[ParallelMath::ParallelSize];
801	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
802	block1Starts[block] = ParallelMath::Extract(he.numUniqueColors[`0`], block);
803
804	for (int combo = `0`; combo < maxUniqueColorCombos; combo++)
805	{
806	MUInt15 index0 = indexes[`0`] + ParallelMath::MakeUInt15(`1`);
807	ParallelMath::Int16CompFlag index0Overflow = ParallelMath::Less(maxIndex[`0`], index0);
808	ParallelMath::ConditionalSet(index0, index0Overflow, ParallelMath::MakeUInt15(`0`));
809
810	MUInt15 index1 = ParallelMath::Min(maxIndex[`1`], indexes[`1`] + ParallelMath::SelectOrZero(index0Overflow, ParallelMath::MakeUInt15(`1`)));
811	indexes[`0`] = index0;
812	indexes[`1`] = index1;
813
814	int ci0[ParallelMath::ParallelSize];
815	int ci1[ParallelMath::ParallelSize];
816	MUInt15 color0;
817	MUInt15 color1;
818
819	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
820	{
821	ci0[block] = ParallelMath::Extract(index0, block);
822	ci1[block] = ParallelMath::Extract(index1, block) + block1Starts[block];
823	ParallelMath::PutUInt15(color0, block, ParallelMath::Extract(he.uniqueQuantizedColors[ci0[block]], block));
824	ParallelMath::PutUInt15(color1, block, ParallelMath::Extract(he.uniqueQuantizedColors[ci1[block]], block));
825	}
826
827	MFloat totalError = ParallelMath::MakeFloatZero();
828	MUInt16 sectorBits = ParallelMath::MakeUInt16(`0`);
829	MUInt16 signBits = ParallelMath::MakeUInt16(`0`);
830	for (int px = `0`; px < `16`; px++)
831	{
832	MFloat errorCI0;
833	MFloat errorCI1;
834	MUInt16 signBits0;
835	MUInt16 signBits1;
836
837	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
838	{
839	ParallelMath::PutFloat(errorCI0, block, ParallelMath::Extract(he.errors[ci0[block]][px], block));
840	ParallelMath::PutFloat(errorCI1, block, ParallelMath::Extract(he.errors[ci1[block]][px], block));
841	ParallelMath::PutUInt16(signBits0, block, ParallelMath::Extract(he.signBits[ci0[block]], block));
842	ParallelMath::PutUInt16(signBits1, block, ParallelMath::Extract(he.signBits[ci1[block]], block));
843	}
844
845	totalError = totalError + ParallelMath::Min(errorCI0, errorCI1);
846
847	MUInt16 bitPosition = ParallelMath::MakeUInt16(`1` << px);
848
849	ParallelMath::Int16CompFlag error1Better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(errorCI1, errorCI0));
850
851	sectorBits = sectorBits \| ParallelMath::SelectOrZero(error1Better, bitPosition);
852	signBits = signBits \| (bitPosition & ParallelMath::Select(error1Better, signBits1, signBits0));
853	}
854
855	ParallelMath::FloatCompFlag totalErrorBetter = ParallelMath::Less(totalError, bestError);
856	ParallelMath::Int16CompFlag totalErrorBetter16 = ParallelMath::FloatFlagToInt16(totalErrorBetter);
857	if (ParallelMath::AnySet(totalErrorBetter16))
858	{
859	bestIsThisMode = bestIsThisMode \| totalErrorBetter16;
860	ParallelMath::ConditionalSet(bestTable, totalErrorBetter16, ParallelMath::MakeUInt15(table));
861	ParallelMath::ConditionalSet(bestColors[`0`], totalErrorBetter16, color0);
862	ParallelMath::ConditionalSet(bestColors[`1`], totalErrorBetter16, color1);
863	ParallelMath::ConditionalSet(bestSectorBits, totalErrorBetter16, sectorBits);
864	ParallelMath::ConditionalSet(bestSignBits, totalErrorBetter16, signBits);
865	bestError = ParallelMath::Min(totalError, bestError);
866	}
867	}
868	}
869
870	if (ParallelMath::AnySet(bestIsThisMode))
871	{
872	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
873	{
874	if (!ParallelMath::Extract(bestIsThisMode, block))
875	continue;
876
877	ParallelMath::ScalarUInt16 blockBestColors[`2`] = { ParallelMath::Extract(bestColors[`0`], block), ParallelMath::Extract(bestColors[`1`], block) };
878	ParallelMath::ScalarUInt16 blockBestSectorBits = ParallelMath::Extract(bestSectorBits, block);
879	ParallelMath::ScalarUInt16 blockBestSignBits = ParallelMath::Extract(bestSignBits, block);
880	ParallelMath::ScalarUInt16 blockBestTable = ParallelMath::Extract(bestTable, block);
881
882	EmitHModeBlock(outputBuffer + block * `8`, blockBestColors, blockBestSectorBits, blockBestSignBits, blockBestTable, true);
883	}
884	}
885	}
886
887	void cvtt::Internal::ETCComputer::EncodeVirtualTModePunchthrough(uint8_t outputBuffer, MFloat &bestError, const* ParallelMath::Int16CompFlag isIsolatedBase[`16`], const MUInt15 pixels[`16`][`3`], const MFloat preWeightedPixels[`16`][`3`], const ParallelMath::Int16CompFlag isTransparent[`16`], const ParallelMath::Int16CompFlag& anyTransparent, const ParallelMath::Int16CompFlag& allTransparent, const Options &options)
888	{
889	// We treat T and H mode as the same mode ("Virtual T mode") with punchthrough, because of how the colors work:
890	//
891	// T mode: C1, C2+M, Transparent, C2-M
892	// H mode: C1+M, C1-M, Transparent, C2-M
893	//
894	// So in either case, we have 2 colors +/- a modifier, and a third unique color, which is basically T mode except without the middle color.
895	// The only thing that matters is whether it's better to store the isolated color as T mode color 1, or store it offset in H mode color 2.
896	//
897	// Sometimes it won't even be possible to store it in H mode color 2 because the table low bit derives from a numeric comparison of the colors,
898	// but unlike opaque blocks, we can't flip them.
899	bool isUniform = ((options.flags & cvtt::Flags::Uniform) != `0`);
900	bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != `0`);
901
902	ParallelMath::FloatCompFlag isTransparentF[`16`];
903	for (int px = `0`; px < `16`; px++)
904	isTransparentF[px] = ParallelMath::Int16FlagToFloat(isTransparent[px]);
905
906	ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
907	ParallelMath::Int16CompFlag bestIsHMode = ParallelMath::MakeBoolInt16(false);
908
909	MUInt15 isolatedTotal[`3`] = { ParallelMath::MakeUInt15(`0`), ParallelMath::MakeUInt15(`0`), ParallelMath::MakeUInt15(`0`) };
910	MUInt15 lineTotal[`3`] = { ParallelMath::MakeUInt15(`0`), ParallelMath::MakeUInt15(`0`), ParallelMath::MakeUInt15(`0`) };
911
912	MUInt15 numPixelsIsolated = ParallelMath::MakeUInt15(`0`);
913	MUInt15 numPixelsLine = ParallelMath::MakeUInt15(`0`);
914
915	ParallelMath::Int16CompFlag isIsolated[`16`];
916	ParallelMath::Int16CompFlag isLine[`16`];
917
918	for (unsigned int px = `0`; px < `16`; px++)
919	{
920	ParallelMath::Int16CompFlag isOpaque = ParallelMath::Not(isTransparent[px]);
921	isIsolated[px] = isIsolatedBase[px] & isOpaque;
922	isLine[px] = ParallelMath::Not(isIsolatedBase[px]) & isOpaque;
923	}
924
925	for (unsigned int px = `0`; px < `16`; px++)
926	{
927	for (int ch = `0`; ch < `3`; ch++)
928	{
929	isolatedTotal[ch] = isolatedTotal[ch] + ParallelMath::SelectOrZero(isIsolated[px], pixels[px][ch]);
930	lineTotal[ch] = lineTotal[ch] + ParallelMath::SelectOrZero(isLine[px], pixels[px][ch]);
931	}
932	numPixelsIsolated = numPixelsIsolated + ParallelMath::SelectOrZero(isIsolated[px], ParallelMath::MakeUInt15(`1`));
933	numPixelsLine = numPixelsLine + ParallelMath::SelectOrZero(isLine[px], ParallelMath::MakeUInt15(`1`));
934	}
935
936	MUInt15 isolatedAverageQuantized[`3`];
937	MUInt15 hModeIsolatedQuantized[`8`][`3`];
938	MUInt15 isolatedAverageTargets[`3`];
939	{
940	int divisors[ParallelMath::ParallelSize];
941	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
942	divisors[block] = ParallelMath::Extract(numPixelsIsolated, block) * `34`;
943
944	MUInt15 addend = (numPixelsIsolated << `4`) \| numPixelsIsolated;
945	for (int ch = `0`; ch < `3`; ch++)
946	{
947	// isolatedAverageQuantized[ch] = (isolatedTotal[ch] 2 + numPixelsIsolated * 17) / (numPixelsIsolated * 34);*
948
949	MUInt15 numerator = isolatedTotal[ch] + isolatedTotal[ch];
950	if (!isFakeBT709)
951	numerator = numerator + addend;
952
953	MUInt15 hModeIsolatedNumerators[`8`];
954	for (int table = `0`; table < `8`; table++)
955	{
956	// FIXME: Handle fake BT.709 correctly
957	MUInt15 offsetTotal = isolatedTotal[ch] + ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]), numPixelsIsolated));
958
959	hModeIsolatedNumerators[table] = (offsetTotal + offsetTotal) + addend;
960	}
961
962	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
963	{
964	int divisor = divisors[block];
965	if (divisor == `0`)
966	{
967	ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, `0`);
968	for (int table = `0`; table < `8`; table++)
969	ParallelMath::PutUInt15(hModeIsolatedQuantized[table][ch], block, `0`);
970	}
971	else
972	{
973	ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, ParallelMath::Extract(numerator, block) / divisor);
974	for (int table = `0`; table < `8`; table++)
975	ParallelMath::PutUInt15(hModeIsolatedQuantized[table][ch], block, ParallelMath::Extract(hModeIsolatedNumerators[table], block) / divisor);
976	}
977	}
978
979	isolatedAverageTargets[ch] = numerator;
980	}
981	}
982
983	if (isFakeBT709)
984	ResolveTHFakeBT709Rounding(isolatedAverageQuantized, isolatedAverageTargets, numPixelsIsolated);
985
986	for (int table = `0`; table < `8`; table++)
987	for (int ch = `0`; ch < `3`; ch++)
988	hModeIsolatedQuantized[table][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(`15`), hModeIsolatedQuantized[table][ch]);
989
990	MUInt15 isolatedColor[`3`];
991	for (int ch = `0`; ch < `3`; ch++)
992	isolatedColor[ch] = (isolatedAverageQuantized[ch]) \| (isolatedAverageQuantized[ch] << `4`);
993
994	MFloat isolatedError[`16`];
995	for (int px = `0`; px < `16`; px++)
996	{
997	if (isFakeBT709)
998	isolatedError[px] = ComputeErrorFakeBT709(isolatedColor, preWeightedPixels[px]);
999	else if (isUniform)
1000	isolatedError[px] = ComputeErrorUniform(pixels[px], isolatedColor);
1001	else
1002	isolatedError[px] = ComputeErrorWeighted(isolatedColor, preWeightedPixels[px], options);
1003
1004	ParallelMath::ConditionalSet(isolatedError[px], isTransparentF[px], ParallelMath::MakeFloatZero());
1005	}
1006
1007	MSInt32 bestSelectors = ParallelMath::MakeSInt32(`0`);
1008	MUInt15 bestTable = ParallelMath::MakeUInt15(`0`);
1009	MUInt15 bestLineColor = ParallelMath::MakeUInt15(`0`);
1010	MUInt15 bestIsolatedColor = ParallelMath::MakeUInt15(`0`);
1011	MUInt15 bestHModeColor2 = ParallelMath::MakeUInt15(`0`);
1012	ParallelMath::Int16CompFlag bestUseHMode = ParallelMath::MakeBoolInt16(false);
1013
1014	MSInt16 maxLine = ParallelMath::LosslessCast<MSInt16>::Cast(numPixelsLine);
1015	MSInt16 minLine = ParallelMath::MakeSInt16(`0`) - maxLine;
1016
1017	int16_t clusterMaxLine = `0`;
1018	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
1019	{
1020	int16_t blockMaxLine = ParallelMath::Extract(maxLine, block);
1021	if (blockMaxLine > clusterMaxLine)
1022	clusterMaxLine = blockMaxLine;
1023	}
1024
1025	int16_t clusterMinLine = -clusterMaxLine;
1026
1027	int lineDivisors[ParallelMath::ParallelSize];
1028	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
1029	lineDivisors[block] = ParallelMath::Extract(numPixelsLine, block) * `34`;
1030
1031	MUInt15 lineAddend = (numPixelsLine << `4`) \| numPixelsLine;
1032
1033	for (int table = `0`; table < `8`; table++)
1034	{
1035	int numUniqueColors[ParallelMath::ParallelSize];
1036	MUInt15 uniqueQuantizedColors[`31`];
1037
1038	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
1039	numUniqueColors[block] = `0`;
1040
1041	MUInt15 modifier = ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]);
1042	MUInt15 modifierOffset = (modifier + modifier);
1043
1044	for (int16_t offsetPremultiplier = clusterMinLine; offsetPremultiplier <= clusterMaxLine; offsetPremultiplier += `2`)
1045	{
1046	MSInt16 clampedOffsetPremultiplier = ParallelMath::Max(minLine, ParallelMath::Min(maxLine, ParallelMath::MakeSInt16(offsetPremultiplier)));
1047	MSInt16 modifierAddend = ParallelMath::CompactMultiply(clampedOffsetPremultiplier, modifierOffset);
1048
1049	MUInt15 quantized[`3`];
1050	if (isFakeBT709)
1051	{
1052	MUInt15 targets[`3`];
1053	for (int ch = `0`; ch < `3`; ch++)
1054	{
1055	//quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] 2 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));*
1056	MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(`0`), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch]) + modifierAddend));
1057	MUInt15 divided = ParallelMath::MakeUInt15(`0`);
1058	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
1059	{
1060	int divisor = lineDivisors[block];
1061	if (divisor == `0`)
1062	ParallelMath::PutUInt15(divided, block, `0`);
1063	else
1064	ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
1065	}
1066	quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(`15`), divided);
1067	targets[ch] = numerator;
1068	}
1069
1070	ResolveTHFakeBT709Rounding(quantized, targets, numPixelsLine);
1071	}
1072	else
1073	{
1074	for (int ch = `0`; ch < `3`; ch++)
1075	{
1076	//quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] 2 + numDAIILine * 17 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));*
1077	MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(`0`), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch] + lineAddend) + modifierAddend));
1078	MUInt15 divided = ParallelMath::MakeUInt15(`0`);
1079	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
1080	{
1081	int divisor = lineDivisors[block];
1082	if (divisor == `0`)
1083	ParallelMath::PutUInt15(divided, block, `0`);
1084	else
1085	ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
1086	}
1087	quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(`15`), divided);
1088	}
1089	}
1090
1091	MUInt15 packedColor = (quantized[`0`] << `10`) \| (quantized[`1`] << `5`) \| quantized[`2`];
1092
1093	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
1094	{
1095	uint16_t blockPackedColor = ParallelMath::Extract(packedColor, block);
1096	if (numUniqueColors[block] == `0` \|\| blockPackedColor != ParallelMath::Extract(uniqueQuantizedColors[numUniqueColors[block] - `1`], block))
1097	ParallelMath::PutUInt15(uniqueQuantizedColors[numUniqueColors[block]++], block, blockPackedColor);
1098	}
1099	}
1100
1101	// Stripe unfilled unique colors
1102	int maxUniqueColors = `0`;
1103	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
1104	{
1105	if (numUniqueColors[block] > maxUniqueColors)
1106	maxUniqueColors = numUniqueColors[block];
1107	}
1108
1109	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
1110	{
1111	uint16_t fillColor = ParallelMath::Extract(uniqueQuantizedColors[`0`], block);
1112
1113	int numUnique = numUniqueColors[block];
1114	for (int fill = numUnique + `1`; fill < maxUniqueColors; fill++)
1115	ParallelMath::PutUInt15(uniqueQuantizedColors[fill], block, fillColor);
1116	}
1117
1118	MFloat hModeErrors[`16`];
1119	MUInt15 hModeUnquantizedColor[`3`];
1120	for (int ch = `0`; ch < `3`; ch++)
1121	{
1122	MUInt15 quantizedChannel = hModeIsolatedQuantized[table][ch];
1123
1124	MUInt15 unquantizedCh = (quantizedChannel << `4`) \| quantizedChannel;
1125	hModeUnquantizedColor[ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(`0`), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedCh) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
1126	}
1127
1128	for (int px = `0`; px < `16`; px++)
1129	{
1130	hModeErrors[px] = isUniform ? ComputeErrorUniform(hModeUnquantizedColor, pixels[px]) : ComputeErrorWeighted(hModeUnquantizedColor, preWeightedPixels[px], options);
1131	ParallelMath::ConditionalSet(hModeErrors[px], isTransparentF[px], ParallelMath::MakeFloatZero());
1132	}
1133
1134	MUInt15 packedHModeColor2 = (hModeIsolatedQuantized[table][`0`] << `10`) \| (hModeIsolatedQuantized[table][`1`] << `5`) \| hModeIsolatedQuantized[table][`2`];
1135	ParallelMath::Int16CompFlag tableLowBitIsZero = ((table & `1`) == `0`) ? ParallelMath::MakeBoolInt16(true) : ParallelMath::MakeBoolInt16(false);
1136
1137	for (int ci = `0`; ci < maxUniqueColors; ci++)
1138	{
1139	MUInt15 lineColors[`2`][`3`];
1140	for (int ch = `0`; ch < `3`; ch++)
1141	{
1142	MUInt15 quantizedChannel = (ParallelMath::RightShift(uniqueQuantizedColors[ci], `10` - (ch * `5`)) & ParallelMath::MakeUInt15(`15`));
1143
1144	MUInt15 unquantizedColor = (quantizedChannel << `4`) \| quantizedChannel;
1145	lineColors[`0`][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(`255`), unquantizedColor + modifier);
1146	lineColors[`1`][ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(`0`), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
1147	}
1148
1149	MUInt15 bestLineSelector[`16`];
1150	MFloat bestLineError[`16`];
1151	for (int px = `0`; px < `16`; px++)
1152	{
1153	MFloat lineErrors[`2`];
1154	for (int i = `0`; i < `2`; i++)
1155	lineErrors[i] = isUniform ? ComputeErrorUniform(lineColors[i], pixels[px]) : ComputeErrorWeighted(lineColors[i], preWeightedPixels[px], options);
1156
1157	ParallelMath::Int16CompFlag firstIsBetter = ParallelMath::FloatFlagToInt16(ParallelMath::LessOrEqual(lineErrors[`0`], lineErrors[`1`]));
1158	bestLineSelector[px] = ParallelMath::Select(firstIsBetter, ParallelMath::MakeUInt15(`1`), ParallelMath::MakeUInt15(`3`));
1159	bestLineError[px] = ParallelMath::Min(lineErrors[`0`], lineErrors[`1`]);
1160
1161	ParallelMath::ConditionalSet(bestLineError[px], isTransparentF[px], ParallelMath::MakeFloatZero());
1162	}
1163
1164	// One case considered here was if it was possible to force H mode to be valid when the line color is unused.
1165	// That case isn't actually useful because it's equivalent to the isolated color being unused at maximum offset,
1166	// which is always checked after a swap.
1167	MFloat tModeError = ParallelMath::MakeFloatZero();
1168	MFloat hModeError = ParallelMath::MakeFloatZero();
1169	for (int px = `0`; px < `16`; px++)
1170	{
1171	tModeError = tModeError + ParallelMath::Min(bestLineError[px], isolatedError[px]);
1172	hModeError = hModeError + ParallelMath::Min(bestLineError[px], hModeErrors[px]);
1173	}
1174
1175	ParallelMath::FloatCompFlag hLessError = ParallelMath::Less(hModeError, tModeError);
1176
1177	MUInt15 packedHModeColor1 = uniqueQuantizedColors[ci];
1178
1179	ParallelMath::Int16CompFlag hModeTableLowBitMustBeZero = ParallelMath::Less(packedHModeColor1, packedHModeColor2);
1180
1181	ParallelMath::Int16CompFlag hModeIsLegal = ParallelMath::Equal(hModeTableLowBitMustBeZero, tableLowBitIsZero);
1182	ParallelMath::Int16CompFlag useHMode = ParallelMath::FloatFlagToInt16(hLessError) & hModeIsLegal;
1183
1184	MFloat roundBestError = tModeError;
1185	ParallelMath::ConditionalSet(roundBestError, ParallelMath::Int16FlagToFloat(useHMode), hModeError);
1186
1187	ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(roundBestError, bestError));
1188	ParallelMath::FloatCompFlag useHModeF = ParallelMath::Int16FlagToFloat(useHMode);
1189
1190	if (ParallelMath::AnySet(errorBetter))
1191	{
1192	MSInt32 selectors = ParallelMath::MakeSInt32(`0`);
1193	for (int px = `0`; px < `16`; px++)
1194	{
1195	MUInt15 selector = bestLineSelector[px];
1196
1197	MFloat isolatedPixelError = ParallelMath::Select(useHModeF, hModeErrors[px], isolatedError[px]);
1198	ParallelMath::Int16CompFlag isolatedBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(isolatedPixelError, bestLineError[px]));
1199
1200	ParallelMath::ConditionalSet(selector, isolatedBetter, ParallelMath::MakeUInt15(`0`));
1201	ParallelMath::ConditionalSet(selector, isTransparent[px], ParallelMath::MakeUInt15(`2`));
1202	selectors = selectors \| (ParallelMath::ToInt32(selector) << (px * `2`));
1203	}
1204
1205	bestError = ParallelMath::Min(bestError, roundBestError);
1206	ParallelMath::ConditionalSet(bestLineColor, errorBetter, uniqueQuantizedColors[ci]);
1207	ParallelMath::ConditionalSet(bestSelectors, errorBetter, selectors);
1208	ParallelMath::ConditionalSet(bestTable, errorBetter, ParallelMath::MakeUInt15(table));
1209	ParallelMath::ConditionalSet(bestIsHMode, errorBetter, useHMode);
1210	ParallelMath::ConditionalSet(bestHModeColor2, errorBetter, packedHModeColor2);
1211
1212	bestIsThisMode = bestIsThisMode \| errorBetter;
1213	}
1214	}
1215	}
1216
1217	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
1218	{
1219	if (ParallelMath::Extract(bestIsThisMode, block))
1220	{
1221	uint32_t lowBits = `0`;
1222	uint32_t highBits = `0`;
1223
1224	uint16_t blockBestLineColor = ParallelMath::Extract(bestLineColor, block);
1225	ParallelMath::ScalarUInt16 blockIsolatedAverageQuantized[`3`];
1226
1227	for (int ch = `0`; ch < `3`; ch++)
1228	blockIsolatedAverageQuantized[ch] = ParallelMath::Extract(isolatedAverageQuantized[ch], block);
1229
1230	uint16_t blockBestTable = ParallelMath::Extract(bestTable, block);
1231	int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);
1232
1233	ParallelMath::ScalarUInt16 lineColor[`3`];
1234	for (int ch = `0`; ch < `3`; ch++)
1235	lineColor[ch] = (blockBestLineColor >> (`10` - (ch * `5`))) & `15`;
1236
1237	if (ParallelMath::Extract(bestIsHMode, block))
1238	{
1239	// T mode: C1, C2+M, Transparent, C2-M
1240	// H mode: C1+M, C1-M, Transparent, C2-M
1241	static const ParallelMath::ScalarUInt16 selectorRemapSector[`4`] = { `1`, `0`, `1`, `0` };
1242	static const ParallelMath::ScalarUInt16 selectorRemapSign[`4`] = { `1`, `0`, `0`, `1` };
1243
1244	// Remap selectors
1245	ParallelMath::ScalarUInt16 signBits = `0`;
1246	ParallelMath::ScalarUInt16 sectorBits = `0`;
1247	int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);
1248	for (int px = `0`; px < `16`; px++)
1249	{
1250	int32_t selector = (blockBestSelectors >> (px * `2`)) & `3`;
1251	sectorBits \|= (selectorRemapSector[selector] << px);
1252	signBits \|= (selectorRemapSign[selector] << px);
1253	}
1254
1255	ParallelMath::ScalarUInt16 blockColors[`2`] = { blockBestLineColor, ParallelMath::Extract(bestHModeColor2, block) };
1256
1257	EmitHModeBlock(outputBuffer + block * `8`, blockColors, sectorBits, signBits, blockBestTable, false);
1258	}
1259	else
1260	EmitTModeBlock(outputBuffer + block * `8`, lineColor, blockIsolatedAverageQuantized, blockBestSelectors, blockBestTable, false);
1261	}
1262	}
1263	}
1264
1265
1266	cvtt::ParallelMath::UInt15 cvtt::Internal::ETCComputer::DecodePlanarCoeff(const MUInt15 &coeff, int ch)
1267	{
1268	if (ch == `1`)
1269	return (coeff << `1`) \| (ParallelMath::RightShift(coeff, `6`));
1270	else
1271	return (coeff << `2`) \| (ParallelMath::RightShift(coeff, `4`));
1272	}
1273
1274	void cvtt::Internal::ETCComputer::EncodePlanar(uint8_t outputBuffer, MFloat &bestError, const* MUInt15 pixels[`16`][`3`], const MFloat preWeightedPixels[`16`][`3`], const Options &options)
1275	{
1276	// NOTE: If it's desired to do this in another color space, the best way to do it would probably be
1277	// to do everything in that color space and then transform it back to RGB.
1278
1279	// We compute H = (H-O)/4 and V= (V-O)/4 to simplify the math
1280
1281	// error = (xH + yV + O - C)^2
1282	MFloat h[`3`] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
1283	MFloat v[`3`] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
1284	MFloat o[`3`] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
1285
1286	bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != `0`);
1287	bool isUniform = ((options.flags & cvtt::Flags::Uniform) != `0`);
1288
1289	MFloat totalError = ParallelMath::MakeFloatZero();
1290	MUInt15 bestCoeffs[`3`][`3`]; // [Channel][Coeff]
1291	for (int ch = `0`; ch < `3`; ch++)
1292	{
1293	float fhh = `0.f`;
1294	float fho = `0.f`;
1295	float fhv = `0.f`;
1296	float foo = `0.f`;
1297	float fov = `0.f`;
1298	float fvv = `0.f`;
1299	MFloat fc = ParallelMath::MakeFloatZero();
1300	MFloat fh = ParallelMath::MakeFloatZero();
1301	MFloat fv = ParallelMath::MakeFloatZero();
1302	MFloat fo = ParallelMath::MakeFloatZero();
1303
1304	float &foh = fho;
1305	float &fvh = fhv;
1306	float &fvo = fov;
1307
1308	for (int px = `0`; px < `16`; px++)
1309	{
1310	float x = static_cast<float>(px % `4`);
1311	float y = static_cast<float>(px / `4`);
1312	MFloat c = isFakeBT709 ? preWeightedPixels[px][ch] : ParallelMath::ToFloat(pixels[px][ch]);
1313
1314	// (xH + yV + O - C)^2
1315	fhh += x * x;
1316	fhv += x * y;
1317	fho += x;
1318	fh = fh - c * x;
1319
1320	fvh += y * x;
1321	fvv += y * y;
1322	fvo += y;
1323	fv = fv - c * y;
1324
1325	foh += x;
1326	fov += y;
1327	foo += `1`;
1328	fo = fo - c;
1329
1330	fh = fh - c * x;
1331	fv = fv - c * y;
1332	fo = fo - c;
1333	fc = fc + c * c;
1334	}
1335
//float totalError = fhh * h * h + fho * h * o + fhv * h * v + foo * o * o + fov * o * v + fvv * v * v + fh * h + fv * v + fo * o + fc;

// error = fhh*h^2 + fho*h*o + fhv*h*v + foo*o^2 + fov*o*v + fvv*v^2 + fh*h + fv*v + fo*o + fc
// derror/dh = 2*fhh*h + fho*o + fhv*v + fh
// derror/dv = fhv*h + fov*o + 2*fvv*v + fv
// derror/do = fho*h + 2*foo*o + fov*v + fo
1342
1343	// Solve system of equations
1344	// h o v 1 = 0
1345	// -------
1346	// d e f g R0
1347	// i j k l R1
1348	// m n p q R2
1349
1350	float d = `2.0f` * fhh;
1351	float e = fho;
1352	float f = fhv;
1353	MFloat gD = fh;
1354
1355	float i = fhv;
1356	float j = fov;
1357	float k = `2.0f` * fvv;
1358	MFloat lD = fv;
1359
1360	float m = fho;
1361	float n = `2.0f` * foo;
1362	float p = fov;
1363	MFloat qD = fo;
1364
1365	{
1366	// Factor out first column from R1 and R2
1367	float r0to1 = -i / d;
1368	float r0to2 = -m / d;
1369
1370	// 0 j1 k1 l1D
1371	float j1 = j + r0to1 * e;
1372	float k1 = k + r0to1 * f;
1373	MFloat l1D = lD + gD * r0to1;
1374
1375	// 0 n1 p1 q1D
1376	float n1 = n + r0to2 * e;
1377	float p1 = p + r0to2 * f;
1378	MFloat q1D = qD + gD * r0to2;
1379
1380	// Factor out third column from R2
1381	float r1to2 = -p1 / k1;
1382
1383	// 0 n2 0 q2D
1384	float n2 = n1 + r1to2 * j1;
1385	MFloat q2D = q1D + l1D * r1to2;
1386
1387	o[ch] = -q2D / n2;
1388
1389	// Factor out second column from R1
1390	// 0 n2 0 q2D
1391
1392	float r2to1 = -j1 / n2;
1393
1394	// 0 0 k1 l2D
1395	// 0 n2 0 q2D
1396	MFloat l2D = l1D + q2D * r2to1;
1397
1398	float elim2 = -f / k1;
1399	float elim1 = -e / n2;
1400
1401	// d 0 0 g2D
1402	MFloat g2D = gD + l2D * elim2 + q2D * elim1;
1403
// n2*o + q2 = 0
1405	// o = -q2 / n2
1406	h[ch] = -g2D / d;
1407	v[ch] = -l2D / k1;
1408	}
1409
1410	// Undo the local transformation
1411	h[ch] = h[ch] * `4.0f` + o[ch];
1412	v[ch] = v[ch] * `4.0f` + o[ch];
1413	}
1414
1415	if (isFakeBT709)
1416	{
1417	MFloat oRGB[`3`];
1418	MFloat hRGB[`3`];
1419	MFloat vRGB[`3`];
1420
1421	ConvertFromFakeBT709(oRGB, o);
1422	ConvertFromFakeBT709(hRGB, h);
1423	ConvertFromFakeBT709(vRGB, v);
1424
1425	// Twiddling in fake BT.607 is a mess, just round off for now (the precision is pretty good anyway)
1426	{
1427	ParallelMath::RoundTowardNearestForScope rtn;
1428
1429	for (int ch = `0`; ch < `3`; ch++)
1430	{
1431	MFloat fcoeffs[`3`] = { oRGB[ch], hRGB[ch], vRGB[ch] };
1432
1433	for (int c = `0`; c < `3`; c++)
1434	{
1435	MFloat coeff = ParallelMath::Max(ParallelMath::MakeFloatZero(), fcoeffs[c]);
1436	if (ch == `1`)
1437	coeff = ParallelMath::Min(ParallelMath::MakeFloat(`127.0f`), coeff * (`127.0f` / `255.0f`));
1438	else
1439	coeff = ParallelMath::Min(ParallelMath::MakeFloat(`63.0f`), coeff * (`63.0f` / `255.0f`));
1440	fcoeffs[c] = coeff;
1441	}
1442
1443	for (int c = `0`; c < `3`; c++)
1444	bestCoeffs[ch][c] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &rtn);
1445	}
1446	}
1447
1448	MUInt15 reconstructed[`16`][`3`];
1449	for (int ch = `0`; ch < `3`; ch++)
1450	{
1451	MUInt15 dO = DecodePlanarCoeff(bestCoeffs[ch][`0`], ch);
1452	MUInt15 dH = DecodePlanarCoeff(bestCoeffs[ch][`1`], ch);
1453	MUInt15 dV = DecodePlanarCoeff(bestCoeffs[ch][`2`], ch);
1454
1455	MSInt16 hMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dH) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
1456	MSInt16 vMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dV) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
1457
1458	MFloat error = ParallelMath::MakeFloatZero();
1459
1460	MSInt16 addend = ParallelMath::LosslessCast<MSInt16>::Cast(dO << `2`) + `2`;
1461
1462	for (int px = `0`; px < `16`; px++)
1463	{
1464	MUInt15 pxv = ParallelMath::MakeUInt15(px);
1465	MSInt16 x = ParallelMath::LosslessCast<MSInt16>::Cast(pxv & ParallelMath::MakeUInt15(`3`));
1466	MSInt16 y = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RightShift(pxv, `2`));
1467
1468	MSInt16 interpolated = ParallelMath::RightShift(ParallelMath::CompactMultiply(x, hMinusO) + ParallelMath::CompactMultiply(y, vMinusO) + addend, `2`);
1469	MUInt15 clampedLow = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(`0`), interpolated));
1470	reconstructed[px][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(`255`), clampedLow);
1471	}
1472	}
1473
1474	totalError = ParallelMath::MakeFloatZero();
1475	for (int px = `0`; px < `16`; px++)
1476	totalError = totalError + ComputeErrorFakeBT709(reconstructed[px], preWeightedPixels[px]);
1477	}
1478	else
1479	{
1480	for (int ch = `0`; ch < `3`; ch++)
1481	{
1482	MFloat fcoeffs[`3`] = { o[ch], h[ch], v[ch] };
1483	MUInt15 coeffRanges[`3`][`2`];
1484
1485	for (int c = `0`; c < `3`; c++)
1486	{
1487	MFloat coeff = ParallelMath::Max(ParallelMath::MakeFloatZero(), fcoeffs[c]);
1488	if (ch == `1`)
1489	coeff = ParallelMath::Min(ParallelMath::MakeFloat(`127.0f`), coeff * (`127.0f` / `255.0f`));
1490	else
1491	coeff = ParallelMath::Min(ParallelMath::MakeFloat(`63.0f`), coeff * (`63.0f` / `255.0f`));
1492	fcoeffs[c] = coeff;
1493	}
1494
1495	{
1496	ParallelMath::RoundDownForScope rd;
1497	for (int c = `0`; c < `3`; c++)
1498	coeffRanges[c][`0`] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &rd);
1499	}
1500
1501	{
1502	ParallelMath::RoundUpForScope ru;
1503	for (int c = `0`; c < `3`; c++)
1504	coeffRanges[c][`1`] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &ru);
1505	}
1506
1507	MFloat bestChannelError = ParallelMath::MakeFloat(FLT_MAX);
1508	for (int io = `0`; io < `2`; io++)
1509	{
1510	MUInt15 dO = DecodePlanarCoeff(coeffRanges[`0`][io], ch);
1511
1512	for (int ih = `0`; ih < `2`; ih++)
1513	{
1514	MUInt15 dH = DecodePlanarCoeff(coeffRanges[`1`][ih], ch);
1515	MSInt16 hMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dH) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
1516
1517	for (int iv = `0`; iv < `2`; iv++)
1518	{
1519	MUInt15 dV = DecodePlanarCoeff(coeffRanges[`2`][iv], ch);
1520	MSInt16 vMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dV) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
1521
1522	MFloat error = ParallelMath::MakeFloatZero();
1523
1524	MSInt16 addend = ParallelMath::LosslessCast<MSInt16>::Cast(dO << `2`) + `2`;
1525
1526	for (int px = `0`; px < `16`; px++)
1527	{
1528	MUInt15 pxv = ParallelMath::MakeUInt15(px);
1529	MSInt16 x = ParallelMath::LosslessCast<MSInt16>::Cast(pxv & ParallelMath::MakeUInt15(`3`));
1530	MSInt16 y = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RightShift(pxv, `2`));
1531
1532	MSInt16 interpolated = ParallelMath::RightShift(ParallelMath::CompactMultiply(x, hMinusO) + ParallelMath::CompactMultiply(y, vMinusO) + addend, `2`);
1533	MUInt15 clampedLow = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(`0`), interpolated));
1534	MUInt15 dec = ParallelMath::Min(ParallelMath::MakeUInt15(`255`), clampedLow);
1535
1536	MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(dec);
1537
1538	MFloat deltaF = ParallelMath::ToFloat(delta);
1539	error = error + deltaF * deltaF;
1540	}
1541
1542	ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestChannelError));
1543	if (ParallelMath::AnySet(errorBetter))
1544	{
1545	bestChannelError = ParallelMath::Min(error, bestChannelError);
1546	ParallelMath::ConditionalSet(bestCoeffs[ch][`0`], errorBetter, coeffRanges[`0`][io]);
1547	ParallelMath::ConditionalSet(bestCoeffs[ch][`1`], errorBetter, coeffRanges[`1`][ih]);
1548	ParallelMath::ConditionalSet(bestCoeffs[ch][`2`], errorBetter, coeffRanges[`2`][iv]);
1549	}
1550	}
1551	}
1552	}
1553
1554	if (!isUniform)
1555	{
1556	switch (ch)
1557	{
1558	case `0`:
1559	bestChannelError = bestChannelError * (options.redWeight * options.redWeight);
1560	break;
1561	case `1`:
1562	bestChannelError = bestChannelError * (options.greenWeight * options.greenWeight);
1563	break;
1564	case `2`:
1565	bestChannelError = bestChannelError * (options.blueWeight * options.blueWeight);
1566	break;
1567	default:
1568	break;
1569	}
1570	}
1571
1572	totalError = totalError + bestChannelError;
1573	}
1574	}
1575
1576	ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(totalError, bestError));
1577	if (ParallelMath::AnySet(errorBetter))
1578	{
1579	bestError = ParallelMath::Min(bestError, totalError);
1580
1581	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
1582	{
1583	if (!ParallelMath::Extract(errorBetter, block))
1584	continue;
1585
1586	int ro = ParallelMath::Extract(bestCoeffs[`0`][`0`], block);
1587	int rh = ParallelMath::Extract(bestCoeffs[`0`][`1`], block);
1588	int rv = ParallelMath::Extract(bestCoeffs[`0`][`2`], block);
1589
1590	int go = ParallelMath::Extract(bestCoeffs[`1`][`0`], block);
1591	int gh = ParallelMath::Extract(bestCoeffs[`1`][`1`], block);
1592	int gv = ParallelMath::Extract(bestCoeffs[`1`][`2`], block);
1593
1594	int bo = ParallelMath::Extract(bestCoeffs[`2`][`0`], block);
1595	int bh = ParallelMath::Extract(bestCoeffs[`2`][`1`], block);
1596	int bv = ParallelMath::Extract(bestCoeffs[`2`][`2`], block);
1597
1598	int go1 = go >> `6`;
1599	int go2 = go & `63`;
1600
1601	int bo1 = bo >> `5`;
1602	int bo2 = (bo >> `3`) & `3`;
1603	int bo3 = bo & `7`;
1604
1605	int rh1 = (rh >> `1`);
1606	int rh2 = rh & `1`;
1607
1608	int fakeR = ro >> `2`;
1609	int fakeDR = go1 \| ((ro & `3`) << `1`);
1610
1611	int fakeG = (go2 >> `2`);
1612	int fakeDG = ((go2 & `3`) << `1`) \| bo1;
1613
1614	int fakeB = bo2;
1615	int fakeDB = bo3 >> `1`;
1616
1617	uint32_t highBits = `0`;
1618	uint32_t lowBits = `0`;
1619
1620	// Avoid overflowing R
1621	if ((fakeDR & `4`) != `0` && fakeR + fakeDR < `8`)
1622	highBits \|= `1` << (`63` - `32`);
1623
1624	// Avoid overflowing G
1625	if ((fakeDG & `4`) != `0` && fakeG + fakeDG < `8`)
1626	highBits \|= `1` << (`55` - `32`);
1627
1628	// Overflow B
1629	if (fakeB + fakeDB < `4`)
1630	{
1631	// Overflow low
1632	highBits \|= `1` << (`42` - `32`);
1633	}
1634	else
1635	{
1636	// Overflow high
1637	highBits \|= `7` << (`45` - `32`);
1638	}
1639
1640	highBits \|= ro << (`57` - `32`);
1641	highBits \|= go1 << (`56` - `32`);
1642	highBits \|= go2 << (`49` - `32`);
1643	highBits \|= bo1 << (`48` - `32`);
1644	highBits \|= bo2 << (`43` - `32`);
1645	highBits \|= bo3 << (`39` - `32`);
1646	highBits \|= rh1 << (`34` - `32`);
1647	highBits \|= `1` << (`33` - `32`);
1648	highBits \|= rh2 << (`32` - `32`);
1649
1650	lowBits \|= gh << `25`;
1651	lowBits \|= bh << `19`;
1652	lowBits \|= rv << `13`;
1653	lowBits \|= gv << `6`;
1654	lowBits \|= bv << `0`;
1655
1656	for (int i = `0`; i < `4`; i++)
1657	outputBuffer[block * `8` + i] = (highBits >> (`24` - i * `8`)) & `0xff`;
1658	for (int i = `0`; i < `4`; i++)
1659	outputBuffer[block * `8` + i + `4`] = (lowBits >> (`24` - i * `8`)) & `0xff`;
1660	}
1661	}
1662	}
1663
1664	void cvtt::Internal::ETCComputer::CompressETC2Block(uint8_t outputBuffer, const* PixelBlockU8 pixelBlocks, ETC2CompressionData compressionData, const Options &options, bool punchthroughAlpha)
1665	{
1666	ParallelMath::Int16CompFlag pixelIsTransparent[`16`];
1667	ParallelMath::Int16CompFlag anyTransparent = ParallelMath::MakeBoolInt16(false);
1668	ParallelMath::Int16CompFlag allTransparent = ParallelMath::MakeBoolInt16(true);
1669
1670	if (punchthroughAlpha)
1671	{
1672	const float fThreshold = std::max<float>(std::min<float>(`1.0f`, options.threshold), `0.0f`) * `255.0f`;
1673
1674	// +1.0f is intentional, we want to take the next valid integer (even if it's 256) since everything else lower is transparent
1675	MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(std::floor(fThreshold + `1.0f`)));
1676
1677	for (int px = `0`; px < `16`; px++)
1678	{
1679	MUInt15 alpha;
1680	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
1681	ParallelMath::PutUInt15(alpha, block, pixelBlocks[block].m_pixels[px][`3`]);
1682
1683	ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(alpha, threshold);
1684	anyTransparent = (anyTransparent \| isTransparent);
1685	allTransparent = (allTransparent & isTransparent);
1686	pixelIsTransparent[px] = isTransparent;
1687	}
1688	}
1689	else
1690	{
1691	for (int px = `0`; px < `16`; px++)
1692	pixelIsTransparent[px] = ParallelMath::MakeBoolInt16(false);
1693
1694	allTransparent = anyTransparent = ParallelMath::MakeBoolInt16(false);
1695	}
1696
1697	MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
1698
1699	ETC2CompressionDataInternal* internalData = static_cast<ETC2CompressionDataInternal*>(compressionData);
1700
1701	MUInt15 pixels[`16`][`3`];
1702	MFloat preWeightedPixels[`16`][`3`];
1703	ExtractBlocks(pixels, preWeightedPixels, pixelBlocks, options);
1704
1705	if (ParallelMath::AnySet(anyTransparent))
1706	{
1707	for (int px = `0`; px < `16`; px++)
1708	{
1709	ParallelMath::Int16CompFlag flag = pixelIsTransparent[px];
1710	ParallelMath::FloatCompFlag fflag = ParallelMath::Int16FlagToFloat(flag);
1711
1712	for (int ch = `0`; ch < `3`; ch++)
1713	{
1714	ParallelMath::ConditionalSet(pixels[px][ch], flag, ParallelMath::MakeUInt15(`0`));
1715	ParallelMath::ConditionalSet(preWeightedPixels[px][ch], fflag, ParallelMath::MakeFloat(`0.0f`));
1716	}
1717	}
1718	}
1719
1720	if (!ParallelMath::AllSet(allTransparent))
1721	EncodePlanar(outputBuffer, bestError, pixels, preWeightedPixels, options);
1722
1723	MFloat chromaDelta[`16`][`2`];
1724
1725	MUInt15 numOpaque = ParallelMath::MakeUInt15(`16`);
1726	for (int px = `0`; px < `16`; px++)
1727	numOpaque = numOpaque - ParallelMath::SelectOrZero(pixelIsTransparent[px], ParallelMath::MakeUInt15(`1`));
1728
1729	if (options.flags & cvtt::Flags::Uniform)
1730	{
1731	MSInt16 chromaCoordinates3[`16`][`2`];
1732	for (int px = `0`; px < `16`; px++)
1733	{
1734	chromaCoordinates3[px][`0`] = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][`0`]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][`2`]);
1735	chromaCoordinates3[px][`1`] = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][`0`]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][`1`] << `1`) + ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][`2`]);
1736	}
1737
1738	MSInt16 chromaCoordinateCentroid[`2`] = { ParallelMath::MakeSInt16(`0`), ParallelMath::MakeSInt16(`0`) };
1739	for (int px = `0`; px < `16`; px++)
1740	{
1741	for (int ch = `0`; ch < `2`; ch++)
1742	chromaCoordinateCentroid[ch] = chromaCoordinateCentroid[ch] + chromaCoordinates3[px][ch];
1743	}
1744
1745	if (punchthroughAlpha)
1746	{
1747	for (int px = `0`; px < `16`; px++)
1748	{
1749	for (int ch = `0`; ch < `2`; ch++)
1750	{
1751	MUInt15 chromaCoordinateMultiplied = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(chromaCoordinates3[px][ch], numOpaque));
1752	MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(chromaCoordinateMultiplied) - chromaCoordinateCentroid[ch];
1753	chromaDelta[px][ch] = ParallelMath::ToFloat(delta);
1754	}
1755	}
1756	}
1757	else
1758	{
1759	for (int px = `0`; px < `16`; px++)
1760	{
1761	for (int ch = `0`; ch < `2`; ch++)
1762	chromaDelta[px][ch] = ParallelMath::ToFloat((chromaCoordinates3[px][ch] << `4`) - chromaCoordinateCentroid[ch]);
1763	}
1764	}
1765
1766	const MFloat rcpSqrt3 = ParallelMath::MakeFloat(`0.57735026918962576450914878050196f`);
1767
1768	for (int px = `0`; px < `16`; px++)
1769	chromaDelta[px][`1`] = chromaDelta[px][`1`] * rcpSqrt3;
1770	}
1771	else
1772	{
1773	const float chromaAxis0[`3`] = { internalData->m_chromaSideAxis0[`0`], internalData->m_chromaSideAxis0[`1`], internalData->m_chromaSideAxis0[`2`] };
1774	const float chromaAxis1[`3`] = { internalData->m_chromaSideAxis1[`0`], internalData->m_chromaSideAxis1[`1`], internalData->m_chromaSideAxis1[`2`] };
1775
1776	MFloat chromaCoordinates3[`16`][`2`];
1777	for (int px = `0`; px < `16`; px++)
1778	{
1779	const MFloat &px0 = preWeightedPixels[px][`0`];
1780	const MFloat &px1 = preWeightedPixels[px][`1`];
1781	const MFloat &px2 = preWeightedPixels[px][`2`];
1782
1783	chromaCoordinates3[px][`0`] = px0 * chromaAxis0[`0`] + px1 * chromaAxis0[`1`] + px2 * chromaAxis0[`2`];
1784	chromaCoordinates3[px][`1`] = px0 * chromaAxis1[`0`] + px1 * chromaAxis1[`1`] + px2 * chromaAxis1[`2`];
1785	}
1786
1787	MFloat chromaCoordinateCentroid[`2`] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
1788	for (int px = `0`; px < `16`; px++)
1789	{
1790	for (int ch = `0`; ch < `2`; ch++)
1791	chromaCoordinateCentroid[ch] = chromaCoordinateCentroid[ch] + chromaCoordinates3[px][ch];
1792	}
1793
1794	if (punchthroughAlpha)
1795	{
1796	const MFloat numOpaqueF = ParallelMath::ToFloat(numOpaque);
1797	for (int px = `0`; px < `16`; px++)
1798	{
1799	for (int ch = `0`; ch < `2`; ch++)
1800	{
1801	MFloat chromaCoordinateMultiplied = chromaCoordinates3[px][ch] * numOpaqueF;
1802	MFloat delta = chromaCoordinateMultiplied - chromaCoordinateCentroid[ch];
1803	chromaDelta[px][ch] = delta;
1804	}
1805	}
1806	}
1807	else
1808	{
1809	for (int px = `0`; px < `16`; px++)
1810	{
1811	for (int ch = `0`; ch < `2`; ch++)
1812	chromaDelta[px][ch] = chromaCoordinates3[px][ch] * `16.0f` - chromaCoordinateCentroid[ch];
1813	}
1814	}
1815	}
1816
1817
1818	MFloat covXX = ParallelMath::MakeFloatZero();
1819	MFloat covYY = ParallelMath::MakeFloatZero();
1820	MFloat covXY = ParallelMath::MakeFloatZero();
1821
1822	for (int px = `0`; px < `16`; px++)
1823	{
1824	MFloat nx = chromaDelta[px][`0`];
1825	MFloat ny = chromaDelta[px][`1`];
1826
1827	covXX = covXX + nx * nx;
1828	covYY = covYY + ny * ny;
1829	covXY = covXY + nx * ny;
1830	}
1831
1832	MFloat halfTrace = (covXX + covYY) * `0.5f`;
1833	MFloat det = covXX * covYY - covXY * covXY;
1834
1835	MFloat mm = ParallelMath::Sqrt(ParallelMath::Max(ParallelMath::MakeFloatZero(), halfTrace * halfTrace - det));
1836
1837	MFloat ev = halfTrace + mm;
1838
1839	MFloat dx = (covYY - ev + covXY);
1840	MFloat dy = -(covXX - ev + covXY);
1841
1842	// If evenly distributed, pick an arbitrary plane
1843	ParallelMath::FloatCompFlag allZero = ParallelMath::Equal(dx, ParallelMath::MakeFloatZero()) & ParallelMath::Equal(dy, ParallelMath::MakeFloatZero());
1844	ParallelMath::ConditionalSet(dx, allZero, ParallelMath::MakeFloat(`1.f`));
1845
1846	ParallelMath::Int16CompFlag sectorAssignments[`16`];
1847	for (int px = `0`; px < `16`; px++)
1848	sectorAssignments[px] = ParallelMath::FloatFlagToInt16(ParallelMath::Less(chromaDelta[px][`0`] * dx + chromaDelta[px][`1`] * dy, ParallelMath::MakeFloatZero()));
1849
1850	if (!ParallelMath::AllSet(allTransparent))
1851	{
1852	EncodeTMode(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, options);
1853
1854	// Flip sector assignments
1855	for (int px = `0`; px < `16`; px++)
1856	sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
1857
1858	EncodeTMode(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, options);
1859
1860	EncodeHMode(outputBuffer, bestError, sectorAssignments, pixels, internalData->m_h, preWeightedPixels, options);
1861
1862	CompressETC1BlockInternal(bestError, outputBuffer, pixels, preWeightedPixels, internalData->m_drs, options, true);
1863	}
1864
1865	if (ParallelMath::AnySet(anyTransparent))
1866	{
1867	if (!ParallelMath::AllSet(allTransparent))
1868	{
1869	// Flip sector assignments
1870	for (int px = `0`; px < `16`; px++)
1871	sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
1872	}
1873
1874	// Reset the error of any transparent blocks to max and retry with punchthrough modes
1875	ParallelMath::ConditionalSet(bestError, ParallelMath::Int16FlagToFloat(anyTransparent), ParallelMath::MakeFloat(FLT_MAX));
1876
1877	EncodeVirtualTModePunchthrough(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, pixelIsTransparent, anyTransparent, allTransparent, options);
1878
1879	// Flip sector assignments
1880	for (int px = `0`; px < `16`; px++)
1881	sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
1882
1883	EncodeVirtualTModePunchthrough(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, pixelIsTransparent, anyTransparent, allTransparent, options);
1884
1885	CompressETC1PunchthroughBlockInternal(bestError, outputBuffer, pixels, preWeightedPixels, pixelIsTransparent, static_cast<ETC2CompressionDataInternal*>(compressionData)->m_drs, options);
1886	}
1887	}
1888
1889	void cvtt::Internal::ETCComputer::CompressETC2AlphaBlock(uint8_t outputBuffer, const* PixelBlockU8 pixelBlocks, const* Options &options)
1890	{
1891	MUInt15 pixels[`16`];
1892
1893	for (int px = `0`; px < `16`; px++)
1894	{
1895	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
1896	ParallelMath::PutUInt15(pixels[px], block, pixelBlocks[block].m_pixels[px][`3`]);
1897	}
1898
1899	CompressETC2AlphaBlockInternal(outputBuffer, pixels, false, false, options);
1900	}
1901
1902	void cvtt::Internal::ETCComputer::CompressETC2AlphaBlockInternal(uint8_t outputBuffer, const* MUInt15 pixels[`16`], bool is11Bit, bool isSigned, const Options &options)
1903	{
1904	MUInt15 minAlpha = ParallelMath::MakeUInt15(is11Bit ? `2047` : `255`);
1905	MUInt15 maxAlpha = ParallelMath::MakeUInt15(`0`);
1906
1907	for (int px = `0`; px < `16`; px++)
1908	{
1909	minAlpha = ParallelMath::Min(minAlpha, pixels[px]);
1910	maxAlpha = ParallelMath::Max(maxAlpha, pixels[px]);
1911	}
1912
1913	MUInt15 alphaSpan = maxAlpha - minAlpha;
1914	MUInt15 alphaSpanMidpointTimes2 = maxAlpha + minAlpha;
1915
1916	MUInt31 bestTotalError = ParallelMath::MakeUInt31(`0x7fffffff`);
1917	MUInt15 bestTableIndex = ParallelMath::MakeUInt15(`0`);
1918	MUInt15 bestBaseCodeword = ParallelMath::MakeUInt15(`0`);
1919	MUInt15 bestMultiplier = ParallelMath::MakeUInt15(`0`);
1920	MUInt15 bestIndexes[`16`];
1921
1922	for (int px = `0`; px < `16`; px++)
1923	bestIndexes[px] = ParallelMath::MakeUInt15(`0`);
1924
1925	const int numAlphaRanges = `10`;
1926	for (uint16_t tableIndex = `0`; tableIndex < `16`; tableIndex++)
1927	{
1928	for (int r = `0`; r < numAlphaRanges; r++)
1929	{
1930	int subrange = r % `3`;
1931	int mainRange = r / `3`;
1932
1933	int16_t maxOffset = Tables::ETC2::g_alphaModifierTablePositive[tableIndex][`3` - mainRange - (subrange & `1`)];
1934	int16_t minOffset = -Tables::ETC2::g_alphaModifierTablePositive[tableIndex][`3` - mainRange - ((subrange >> `1`) & `1`)] - `1`;
1935	uint16_t offsetSpan = static_cast<uint16_t>(maxOffset - minOffset);
1936
1937	MSInt16 vminOffset = ParallelMath::MakeSInt16(minOffset);
1938	MUInt15 vmaxOffset = ParallelMath::MakeUInt15(maxOffset);
1939	MUInt15 voffsetSpan = ParallelMath::MakeUInt15(offsetSpan);
1940
1941	MUInt15 minMultiplier = ParallelMath::MakeUInt15(`0`);
1942	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
1943	{
1944	uint16_t singleAlphaSpan = ParallelMath::Extract(alphaSpan, block);
1945
1946	uint16_t lowMultiplier = singleAlphaSpan / offsetSpan;
1947	ParallelMath::PutUInt15(minMultiplier, block, lowMultiplier);
1948	}
1949
1950	if (is11Bit)
1951	{
1952	// Clamps this to valid multipliers under 15 and rounds down to nearest multiple of 8
1953	minMultiplier = ParallelMath::Min(minMultiplier, ParallelMath::MakeUInt15(`112`)) & ParallelMath::MakeUInt15(`120`);
1954	}
1955	else
1956	{
1957	// We cap at 1 and 14 so both multipliers are valid and dividable
1958	// Cases where offset span is 0 should be caught by multiplier 1 of table 13
1959	minMultiplier = ParallelMath::Max(ParallelMath::Min(minMultiplier, ParallelMath::MakeUInt15(`14`)), ParallelMath::MakeUInt15(`1`));
1960	}
1961
1962	for (uint16_t multiplierOffset = `0`; multiplierOffset < `2`; multiplierOffset++)
1963	{
1964	MUInt15 multiplier = minMultiplier;
1965
1966	if (is11Bit)
1967	{
1968	if (multiplierOffset == `1`)
1969	multiplier = multiplier + ParallelMath::MakeUInt15(`8`);
1970	else
1971	multiplier = ParallelMath::Max(multiplier, ParallelMath::MakeUInt15(`1`));
1972	}
1973	else
1974	{
1975	if (multiplierOffset == `1`)
1976	multiplier = multiplier + ParallelMath::MakeUInt15(`1`);
1977	}
1978
1979	MSInt16 multipliedMinOffset = ParallelMath::CompactMultiply(ParallelMath::LosslessCast<MSInt16>::Cast(multiplier), vminOffset);
1980	MUInt15 multipliedMaxOffset = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(multiplier, vmaxOffset));
1981
1982	// codeword = (maxOffset + minOffset + minAlpha + maxAlpha) / 2
1983	MSInt16 unclampedBaseAlphaTimes2 = ParallelMath::LosslessCast<MSInt16>::Cast(alphaSpanMidpointTimes2) - ParallelMath::LosslessCast<MSInt16>::Cast(multipliedMaxOffset) - multipliedMinOffset;
1984
1985	MUInt15 baseAlpha;
1986	if (is11Bit)
1987	{
1988	// In unsigned, 4 is added to the unquantized alpha, so compensating for that cancels the 4 we have to add to do rounding.
1989	if (isSigned)
1990	unclampedBaseAlphaTimes2 = unclampedBaseAlphaTimes2 + ParallelMath::MakeSInt16(`8`);
1991
1992	// -128 is illegal for some reason
1993	MSInt16 minBaseAlphaTimes2 = isSigned ? ParallelMath::MakeSInt16(`16`) : ParallelMath::MakeSInt16(`0`);
1994
1995	MUInt15 clampedBaseAlphaTimes2 = ParallelMath::Min(ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(unclampedBaseAlphaTimes2, minBaseAlphaTimes2)), ParallelMath::MakeUInt15(`4095`));
1996	baseAlpha = ParallelMath::RightShift(clampedBaseAlphaTimes2, `1`) & ParallelMath::MakeUInt15(`2040`);
1997
1998	if (!isSigned)
1999	baseAlpha = baseAlpha + ParallelMath::MakeUInt15(`4`);
2000	}
2001	else
2002	{
2003	MUInt15 clampedBaseAlphaTimes2 = ParallelMath::Min(ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(unclampedBaseAlphaTimes2, ParallelMath::MakeSInt16(`0`))), ParallelMath::MakeUInt15(`510`));
2004	baseAlpha = ParallelMath::RightShift(clampedBaseAlphaTimes2 + ParallelMath::MakeUInt15(`1`), `1`);
2005	}
2006
2007	MUInt15 indexes[`16`];
2008	MUInt31 totalError = ParallelMath::MakeUInt31(`0`);
2009	for (int px = `0`; px < `16`; px++)
2010	{
2011	MUInt15 quantizedValues;
2012	QuantizeETC2Alpha(tableIndex, pixels[px], baseAlpha, multiplier, is11Bit, isSigned, indexes[px], quantizedValues);
2013
2014	if (is11Bit)
2015	{
2016	MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(quantizedValues) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px]);
2017	MSInt32 deltaSq = ParallelMath::XMultiply(delta, delta);
2018	totalError = totalError + ParallelMath::LosslessCast<MUInt31>::Cast(deltaSq);
2019	}
2020	else
2021	totalError = totalError + ParallelMath::ToUInt31(ParallelMath::SqDiffUInt8(quantizedValues, pixels[px]));
2022	}
2023
2024	ParallelMath::Int16CompFlag isBetter = ParallelMath::Int32FlagToInt16(ParallelMath::Less(totalError, bestTotalError));
2025	if (ParallelMath::AnySet(isBetter))
2026	{
2027	ParallelMath::ConditionalSet(bestTotalError, isBetter, totalError);
2028	ParallelMath::ConditionalSet(bestTableIndex, isBetter, ParallelMath::MakeUInt15(tableIndex));
2029	ParallelMath::ConditionalSet(bestBaseCodeword, isBetter, baseAlpha);
2030	ParallelMath::ConditionalSet(bestMultiplier, isBetter, multiplier);
2031
2032	for (int px = `0`; px < `16`; px++)
2033	ParallelMath::ConditionalSet(bestIndexes[px], isBetter, indexes[px]);
2034	}
2035
2036	// TODO: Do one refine pass
2037	}
2038	}
2039	}
2040
2041	if (is11Bit)
2042	{
2043	bestMultiplier = ParallelMath::RightShift(bestMultiplier, `3`);
2044
2045	if (isSigned)
2046	bestBaseCodeword = bestBaseCodeword ^ ParallelMath::MakeUInt15(`0x80`);
2047	}
2048
2049	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
2050	{
2051	uint8_t output = outputBuffer + block `8`;
2052
2053	output[`0`] = static_cast<uint8_t>(ParallelMath::Extract(bestBaseCodeword, block));
2054
2055	ParallelMath::ScalarUInt16 multiplier = ParallelMath::Extract(bestMultiplier, block);
2056	ParallelMath::ScalarUInt16 tableIndex = ParallelMath::Extract(bestTableIndex, block);
2057
2058	output[`1`] = static_cast<uint8_t>((multiplier << `4`) \| tableIndex);
2059
2060	static const int pixelSelectorOrder[`16`] = { `0`, `4`, `8`, `12`, `1`, `5`, `9`, `13`, `2`, `6`, `10`, `14`, `3`, `7`, `11`, `15` };
2061
2062	ParallelMath::ScalarUInt16 indexes[`16`];
2063	for (int px = `0`; px < `16`; px++)
2064	indexes[pixelSelectorOrder[px]] = ParallelMath::Extract(bestIndexes[px], block);
2065
2066	int outputOffset = `2`;
2067	int outputBits = `0`;
2068	int numOutputBits = `0`;
2069	for (int s = `0`; s < `16`; s++)
2070	{
2071	outputBits = (outputBits << `3`) \| indexes[s];
2072	numOutputBits += `3`;
2073
2074	if (numOutputBits >= `8`)
2075	{
2076	output[outputOffset++] = static_cast<uint8_t>(outputBits >> (numOutputBits - `8`));
2077	numOutputBits -= `8`;
2078
2079	outputBits &= ((`1` << numOutputBits) - `1`);
2080	}
2081	}
2082
2083	assert(outputOffset == `8` && numOutputBits == `0`);
2084	}
2085	}
2086
2087	void cvtt::Internal::ETCComputer::CompressEACBlock(uint8_t outputBuffer, const* PixelBlockScalarS16 inputBlocks, bool* isSigned, const Options &options)
2088	{
2089	MUInt15 pixels[`16`];
2090	for (int px = `0`; px < `16`; px++)
2091	{
2092	MSInt16 adjustedPixel;
2093	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
2094	ParallelMath::PutSInt16(adjustedPixel, block, inputBlocks[block].m_pixels[px]);
2095
2096	// We use a slightly shifted range here so we can keep the unquantized base color in a UInt15
2097	// That is, signed range is 1..2047, and unsigned range is 0..2047
2098	if (isSigned)
2099	{
2100	adjustedPixel = ParallelMath::Min(adjustedPixel, ParallelMath::MakeSInt16(`1023`)) + ParallelMath::MakeSInt16(`1024`);
2101	adjustedPixel = ParallelMath::Max(ParallelMath::MakeSInt16(`1`), adjustedPixel);
2102	}
2103	else
2104	{
2105	adjustedPixel = ParallelMath::Min(adjustedPixel, ParallelMath::MakeSInt16(`2047`));
2106	adjustedPixel = ParallelMath::Max(ParallelMath::MakeSInt16(`0`), adjustedPixel);
2107	}
2108
2109
2110	pixels[px] = ParallelMath::LosslessCast<MUInt15>::Cast(adjustedPixel);
2111	}
2112
2113	CompressETC2AlphaBlockInternal(outputBuffer, pixels, true, isSigned, options);
2114	}
2115
2116	void cvtt::Internal::ETCComputer::CompressETC1Block(uint8_t outputBuffer, const* PixelBlockU8 inputBlocks, ETC1CompressionData compressionData, const Options &options)
2117	{
2118	DifferentialResolveStorage &drs = static_cast<ETC1CompressionDataInternal*>(compressionData)->m_drs;
2119	MFloat bestTotalError = ParallelMath::MakeFloat(FLT_MAX);
2120
2121	MUInt15 pixels[`16`][`3`];
2122	MFloat preWeightedPixels[`16`][`3`];
2123	ExtractBlocks(pixels, preWeightedPixels, inputBlocks, options);
2124
2125	CompressETC1BlockInternal(bestTotalError, outputBuffer, pixels, preWeightedPixels, drs, options, false);
2126	}
2127
2128	void cvtt::Internal::ETCComputer::ExtractBlocks(MUInt15 pixels[`16`][`3`], MFloat preWeightedPixels[`16`][`3`], const PixelBlockU8 inputBlocks, const* Options &options)
2129	{
2130	bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != `0`);
2131	bool isUniform = ((options.flags & cvtt::Flags::Uniform) != `0`);
2132
2133	for (int px = `0`; px < `16`; px++)
2134	{
2135	for (int ch = `0`; ch < `3`; ch++)
2136	{
2137	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
2138	ParallelMath::PutUInt15(pixels[px][ch], block, inputBlocks[block].m_pixels[px][ch]);
2139	}
2140
2141	if (isFakeBT709)
2142	ConvertToFakeBT709(preWeightedPixels[px], pixels[px]);
2143	else if (isUniform)
2144	{
2145	for (int ch = `0`; ch < `3`; ch++)
2146	preWeightedPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
2147	}
2148	else
2149	{
2150	preWeightedPixels[px][`0`] = ParallelMath::ToFloat(pixels[px][`0`]) * options.redWeight;
2151	preWeightedPixels[px][`1`] = ParallelMath::ToFloat(pixels[px][`1`]) * options.greenWeight;
2152	preWeightedPixels[px][`2`] = ParallelMath::ToFloat(pixels[px][`2`]) * options.blueWeight;
2153	}
2154	}
2155	}
2156
2157	void cvtt::Internal::ETCComputer::ResolveHalfBlockFakeBT709RoundingAccurate(MUInt15 quantized[`3`], const MUInt15 sectorCumulative[`3`], bool isDifferential)
2158	{
2159	for (int ch = `0`; ch < `3`; ch++)
2160	{
2161	const MUInt15& cu15 = sectorCumulative[ch];
2162
2163	if (isDifferential)
2164	{
2165	//quantized[ch] = (cu 31 + (cu >> 3)) >> 11;*
2166	quantized[ch] = ParallelMath::ToUInt15(
2167	ParallelMath::RightShift(
2168	(ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << `5`) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, `3`))
2169	, `11`)
2170	);
2171	}
2172	else
2173	{
2174	//quantized[ch] = (cu 30 + (cu >> 3)) >> 12;*
2175	quantized[ch] = ParallelMath::ToUInt15(
2176	ParallelMath::RightShift(
2177	(ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << `5`) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15 << `1`) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, `3`))
2178	, `12`)
2179	);
2180	}
2181	}
2182
2183	MFloat lowOctantRGBFloat[`3`];
2184	MFloat highOctantRGBFloat[`3`];
2185
2186	for (int ch = `0`; ch < `3`; ch++)
2187	{
2188	MUInt15 unquantized;
2189	MUInt15 unquantizedNext;
2190	if (isDifferential)
2191	{
2192	unquantized = (quantized[ch] << `3`) \| ParallelMath::RightShift(quantized[ch], `2`);
2193	MUInt15 quantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(`31`), quantized[ch] + ParallelMath::MakeUInt15(`1`));
2194	unquantizedNext = (quantizedNext << `3`) \| ParallelMath::RightShift(quantizedNext, `2`);
2195	}
2196	else
2197	{
2198	unquantized = (quantized[ch] << `4`) \| quantized[ch];
2199	unquantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(`255`), unquantized + ParallelMath::MakeUInt15(`17`));
2200	}
2201	lowOctantRGBFloat[ch] = ParallelMath::ToFloat(unquantized << `3`);
2202	highOctantRGBFloat[ch] = ParallelMath::ToFloat(unquantizedNext << `3`);
2203	}
2204
2205	MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
2206	MUInt15 bestOctant = ParallelMath::MakeUInt15(`0`);
2207
2208	MFloat cumulativeYUV[`3`];
2209	ConvertToFakeBT709(cumulativeYUV, sectorCumulative);
2210
2211	for (uint16_t octant = `0`; octant < `8`; octant++)
2212	{
2213	const MFloat &r = (octant & `1`) ? highOctantRGBFloat[`0`] : lowOctantRGBFloat[`0`];
2214	const MFloat &g = (octant & `2`) ? highOctantRGBFloat[`1`] : lowOctantRGBFloat[`1`];
2215	const MFloat &b = (octant & `4`) ? highOctantRGBFloat[`2`] : lowOctantRGBFloat[`2`];
2216
2217	MFloat octantYUV[`3`];
2218	ConvertToFakeBT709(octantYUV, r, g, b);
2219
2220	MFloat delta[`3`];
2221	for (int ch = `0`; ch < `3`; ch++)
2222	delta[ch] = octantYUV[ch] - cumulativeYUV[ch];
2223
2224	MFloat error = delta[`0`] * delta[`0`] + delta[`1`] + delta[`1`] + delta[`2`] * delta[`2`];
2225	ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
2226	ParallelMath::ConditionalSet(bestOctant, errorBetter, ParallelMath::MakeUInt15(octant));
2227	bestError = ParallelMath::Min(error, bestError);
2228	}
2229
2230	for (int ch = `0`; ch < `3`; ch++)
2231	quantized[ch] = quantized[ch] + (ParallelMath::RightShift(bestOctant, ch) & ParallelMath::MakeUInt15(`1`));
2232	}
2233
2234	void cvtt::Internal::ETCComputer::ResolveHalfBlockFakeBT709RoundingFast(MUInt15 quantized[`3`], const MUInt15 sectorCumulative[`3`], bool isDifferential)
2235	{
2236	// sectorCumulative range is 0..2040 (11 bits)
2237	MUInt15 roundingOffset = ParallelMath::MakeUInt15(`0`);
2238
2239	MUInt15 rOffset;
2240	MUInt15 gOffset;
2241	MUInt15 bOffset;
2242	MUInt15 quantizedBase[`3`];
2243	MUInt15 upperBound;
2244
2245	MUInt15 sectorCumulativeFillIn[`3`];
2246	for (int ch = `0`; ch < `3`; ch++)
2247	sectorCumulativeFillIn[ch] = sectorCumulative[ch] + ParallelMath::RightShift(sectorCumulative[ch], `8`);
2248
2249	if (isDifferential)
2250	{
2251	rOffset = (sectorCumulativeFillIn[`0`] << `6`) & ParallelMath::MakeUInt15(`0xf00`);
2252	gOffset = (sectorCumulativeFillIn[`1`] << `4`) & ParallelMath::MakeUInt15(`0x0f0`);
2253	bOffset = ParallelMath::RightShift(sectorCumulativeFillIn[`2`], `2`) & ParallelMath::MakeUInt15(`0x00f`);
2254
2255	for (int ch = `0`; ch < `3`; ch++)
2256	quantizedBase[ch] = ParallelMath::RightShift(sectorCumulativeFillIn[ch], `6`);
2257
2258	upperBound = ParallelMath::MakeUInt15(`31`);
2259	}
2260	else
2261	{
2262	rOffset = (sectorCumulativeFillIn[`0`] << `5`) & ParallelMath::MakeUInt15(`0xf00`);
2263	gOffset = (sectorCumulativeFillIn[`1`] << `1`) & ParallelMath::MakeUInt15(`0x0f0`);
2264	bOffset = ParallelMath::RightShift(sectorCumulativeFillIn[`2`], `3`) & ParallelMath::MakeUInt15(`0x00f`);
2265
2266	for (int ch = `0`; ch < `3`; ch++)
2267	quantizedBase[ch] = ParallelMath::RightShift(sectorCumulativeFillIn[ch], `7`);
2268
2269	upperBound = ParallelMath::MakeUInt15(`15`);
2270	}
2271
2272	MUInt15 lookupIndex = (rOffset \| gOffset \| bOffset);
2273
2274	MUInt15 octant;
2275	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
2276	ParallelMath::PutUInt15(octant, block, Tables::FakeBT709::g_rounding16[ParallelMath::Extract(lookupIndex, block)]);
2277
2278	quantizedBase[`0`] = quantizedBase[`0`] + (octant & ParallelMath::MakeUInt15(`1`));
2279	quantizedBase[`1`] = quantizedBase[`1`] + (ParallelMath::RightShift(octant, `1`) & ParallelMath::MakeUInt15(`1`));
2280	quantizedBase[`2`] = quantizedBase[`2`] + (ParallelMath::RightShift(octant, `2`) & ParallelMath::MakeUInt15(`1`));
2281
2282	for (int ch = `0`; ch < `3`; ch++)
2283	quantized[ch] = ParallelMath::Min(quantizedBase[ch], upperBound);
2284	}
2285
2286	void cvtt::Internal::ETCComputer::ResolveTHFakeBT709Rounding(MUInt15 quantized[`3`], const MUInt15 targets[`3`], const MUInt15 &granularity)
2287	{
2288	MFloat lowOctantRGBFloat[`3`];
2289	MFloat highOctantRGBFloat[`3`];
2290
2291	for (int ch = `0`; ch < `3`; ch++)
2292	{
2293	MUInt15 unquantized = (quantized[ch] << `4`) \| quantized[ch];
2294	MUInt15 unquantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(`255`), unquantized + ParallelMath::MakeUInt15(`17`));
2295
2296	lowOctantRGBFloat[ch] = ParallelMath::ToFloat(ParallelMath::CompactMultiply(unquantized, granularity) << `1`);
2297	highOctantRGBFloat[ch] = ParallelMath::ToFloat(ParallelMath::CompactMultiply(unquantizedNext, granularity) << `1`);
2298	}
2299
2300	MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
2301	MUInt15 bestOctant = ParallelMath::MakeUInt15(`0`);
2302
2303	MFloat cumulativeYUV[`3`];
2304	ConvertToFakeBT709(cumulativeYUV, ParallelMath::ToFloat(targets[`0`]), ParallelMath::ToFloat(targets[`1`]), ParallelMath::ToFloat(targets[`2`]));
2305
2306	for (uint16_t octant = `0`; octant < `8`; octant++)
2307	{
2308	const MFloat &r = (octant & `1`) ? highOctantRGBFloat[`0`] : lowOctantRGBFloat[`0`];
2309	const MFloat &g = (octant & `2`) ? highOctantRGBFloat[`1`] : lowOctantRGBFloat[`1`];
2310	const MFloat &b = (octant & `4`) ? highOctantRGBFloat[`2`] : lowOctantRGBFloat[`2`];
2311
2312	MFloat octantYUV[`3`];
2313	ConvertToFakeBT709(octantYUV, r, g, b);
2314
2315	MFloat delta[`3`];
2316	for (int ch = `0`; ch < `3`; ch++)
2317	delta[ch] = octantYUV[ch] - cumulativeYUV[ch];
2318
2319	MFloat error = delta[`0`] * delta[`0`] + delta[`1`] + delta[`1`] + delta[`2`] * delta[`2`];
2320	ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
2321	ParallelMath::ConditionalSet(bestOctant, errorBetter, ParallelMath::MakeUInt15(octant));
2322	bestError = ParallelMath::Min(error, bestError);
2323	}
2324
2325	for (int ch = `0`; ch < `3`; ch++)
2326	quantized[ch] = quantized[ch] + (ParallelMath::RightShift(bestOctant, ch) & ParallelMath::MakeUInt15(`1`));
2327	}
2328
2329	void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[`3`], const MUInt15 color[`3`])
2330	{
2331	MFloat floatRGB[`3`];
2332	for (int ch = `0`; ch < `3`; ch++)
2333	floatRGB[ch] = ParallelMath::ToFloat(color[ch]);
2334
2335	ConvertToFakeBT709(yuv, floatRGB);
2336	}
2337
2338	void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[`3`], const MFloat color[`3`])
2339	{
2340	ConvertToFakeBT709(yuv, color[`0`], color[`1`], color[`2`]);
2341	}
2342
2343	void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[`3`], const MFloat &pr, const MFloat &pg, const MFloat &pb)
2344	{
2345	MFloat r = pr;
2346	MFloat g = pg;
2347	MFloat b = pb;
2348
2349	yuv[`0`] = r * `0.368233989135369f` + g * `1.23876274963149f` + b * `0.125054068802017f`;
2350	yuv[`1`] = r * `0.5f` - g * `0.4541529f` - b * `0.04584709f`;
2351	yuv[`2`] = r * -`0.081014709086133f` - g * `0.272538676238785f` + b * `0.353553390593274f`;
2352	}
2353
2354	void cvtt::Internal::ETCComputer::ConvertFromFakeBT709(MFloat rgb[`3`], const MFloat yuv[`3`])
2355	{
2356	MFloat yy = yuv[`0`] * `0.57735026466774571071f`;
2357	MFloat u = yuv[`1`];
2358	MFloat v = yuv[`2`];
2359
2360	rgb[`0`] = yy + u * `1.5748000207960953486f`;
2361	rgb[`1`] = yy - u * `0.46812425854364753669f` - v * `0.26491652528157560861f`;
2362	rgb[`2`] = yy + v * `2.6242146882856944069f`;
2363	}
2364
2365
2366	void cvtt::Internal::ETCComputer::QuantizeETC2Alpha(int tableIndex, const MUInt15& value, const MUInt15& baseValue, const MUInt15& multiplier, bool is11Bit, bool isSigned, MUInt15& outIndexes, MUInt15& outQuantizedValues)
2367	{
2368	MSInt16 offset = ParallelMath::LosslessCast<MSInt16>::Cast(value) - ParallelMath::LosslessCast<MSInt16>::Cast(baseValue);
2369	MSInt16 offsetTimes2 = offset + offset;
2370
2371	// ETC2's offset tables all have a reflect about 0.5multiplier*
2372	MSInt16 offsetAboutReflectorTimes2 = offsetTimes2 + ParallelMath::LosslessCast<MSInt16>::Cast(multiplier);
2373
2374	MUInt15 absOffsetAboutReflectorTimes2 = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Abs(offsetAboutReflectorTimes2));
2375	MUInt15 lookupIndex = ParallelMath::RightShift(absOffsetAboutReflectorTimes2, `1`);
2376
2377	MUInt15 positiveIndex;
2378	MUInt15 positiveOffsetUnmultiplied;
2379	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
2380	{
2381	uint16_t blockLookupIndex = ParallelMath::Extract(lookupIndex, block) / ParallelMath::Extract(multiplier, block);
2382	if (blockLookupIndex >= Tables::ETC2::g_alphaRoundingTableWidth)
2383	blockLookupIndex = Tables::ETC2::g_alphaRoundingTableWidth - `1`;
2384	uint16_t index = Tables::ETC2::g_alphaRoundingTables[tableIndex][blockLookupIndex];
2385	ParallelMath::PutUInt15(positiveIndex, block, index);
2386	ParallelMath::PutUInt15(positiveOffsetUnmultiplied, block, Tables::ETC2::g_alphaModifierTablePositive[tableIndex][index]);
2387
2388	// TODO: This is suboptimal when the offset is capped. We should detect 0 and 255 values and always map them to the maximum offsets.
2389	// Doing that will also affect refinement though.
2390	}
2391
2392	MSInt16 signBits = ParallelMath::RightShift(offsetAboutReflectorTimes2, `15`);
2393	MSInt16 offsetUnmultiplied = ParallelMath::LosslessCast<MSInt16>::Cast(positiveOffsetUnmultiplied) ^ signBits;
2394	MSInt16 quantizedOffset = ParallelMath::CompactMultiply(offsetUnmultiplied, multiplier);
2395
2396	MSInt16 offsetValue = ParallelMath::LosslessCast<MSInt16>::Cast(baseValue) + quantizedOffset;
2397
2398	if (is11Bit)
2399	{
2400	if (isSigned)
2401	outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(`2047`), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(`1`), offsetValue)));
2402	else
2403	outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(`2047`), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(`0`), offsetValue)));
2404	}
2405	else
2406	outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(`255`), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(`0`), offsetValue)));
2407
2408	MUInt15 indexSub = ParallelMath::LosslessCast<MUInt15>::Cast(signBits) & ParallelMath::MakeUInt15(`4`);
2409
2410	outIndexes = positiveIndex + ParallelMath::MakeUInt15(`4`) - indexSub;
2411	}
2412
2413
2414	void cvtt::Internal::ETCComputer::EmitTModeBlock(uint8_t outputBuffer, const* ParallelMath::ScalarUInt16 lineColor[`3`], const ParallelMath::ScalarUInt16 isolatedColor[`3`], int32_t packedSelectors, ParallelMath::ScalarUInt16 table, bool opaque)
2415	{
2416	static const int selectorOrder[] = { `0`, `4`, `8`, `12`, `1`, `5`, `9`, `13`, `2`, `6`, `10`, `14`, `3`, `7`, `11`, `15` };
2417
2418	uint32_t lowBits = `0`;
2419	uint32_t highBits = `0`;
2420
2421	int rh = ((isolatedColor[`0`] >> `2`) & `3`);
2422	int rl = (isolatedColor[`0`] & `3`);
2423
2424	if (rh + rl < `4`)
2425	{
2426	// Overflow low
2427	highBits \|= `1` << (`58` - `32`);
2428	}
2429	else
2430	{
2431	// Overflow high
2432	highBits \|= `7` << (`61` - `32`);
2433	}
2434
2435	highBits \|= rh << (`59` - `32`);
2436	highBits \|= rl << (`56` - `32`);
2437	highBits \|= isolatedColor[`1`] << (`52` - `32`);
2438	highBits \|= isolatedColor[`2`] << (`48` - `32`);
2439	highBits \|= lineColor[`0`] << (`44` - `32`);
2440	highBits \|= lineColor[`1`] << (`40` - `32`);
2441	highBits \|= lineColor[`2`] << (`36` - `32`);
2442	highBits \|= ((table >> `1`) & `3`) << (`34` - `32`);
2443	if (opaque)
2444	highBits \|= `1` << (`33` - `32`);
2445	highBits \|= (table & `1`) << (`32` - `32`);
2446
2447	for (int px = `0`; px < `16`; px++)
2448	{
2449	int sel = (packedSelectors >> (`2` * selectorOrder[px])) & `3`;
2450	if ((sel & `0x1`) != `0`)
2451	lowBits \|= (`1` << px);
2452	if ((sel & `0x2`) != `0`)
2453	lowBits \|= (`1` << (`16` + px));
2454	}
2455
2456	for (int i = `0`; i < `4`; i++)
2457	outputBuffer[i] = (highBits >> (`24` - i * `8`)) & `0xff`;
2458	for (int i = `0`; i < `4`; i++)
2459	outputBuffer[i + `4`] = (lowBits >> (`24` - i * `8`)) & `0xff`;
2460	}
2461
2462	void cvtt::Internal::ETCComputer::EmitHModeBlock(uint8_t outputBuffer, const* ParallelMath::ScalarUInt16 blockColors[`2`], ParallelMath::ScalarUInt16 sectorBits, ParallelMath::ScalarUInt16 signBits, ParallelMath::ScalarUInt16 table, bool opaque)
2463	{
2464	if (blockColors[`0`] == blockColors[`1`])
2465	{
2466	// Base colors are the same.
2467	// If the table low bit isn't 1, then we can't encode this, because swapping the block colors will have no effect
2468	// on their order.
2469	// Instead, we encode this as T mode where all of the indexes are on the line.
2470
2471	ParallelMath::ScalarUInt16 lineColor[`3`];
2472	ParallelMath::ScalarUInt16 isolatedColor[`3`];
2473
2474	lineColor[`0`] = isolatedColor[`0`] = (blockColors[`0`] >> `10`) & `0x1f`;
2475	lineColor[`1`] = isolatedColor[`1`] = (blockColors[`0`] >> `5`) & `0x1f`;
2476	lineColor[`2`] = isolatedColor[`2`] = (blockColors[`0`] >> `0`) & `0x1f`;
2477
2478	int32_t packedSelectors = `0x55555555`;
2479	for (int px = `0`; px < `16`; px++)
2480	packedSelectors \|= ((signBits >> px) & `1`) << ((px * `2`) + `1`);
2481
2482	EmitTModeBlock(outputBuffer, lineColor, isolatedColor, packedSelectors, table, opaque);
2483	return;
2484	}
2485
2486	static const int selectorOrder[] = { `0`, `4`, `8`, `12`, `1`, `5`, `9`, `13`, `2`, `6`, `10`, `14`, `3`, `7`, `11`, `15` };
2487
2488	int16_t colors[`2`][`3`];
2489	for (int sector = `0`; sector < `2`; sector++)
2490	{
2491	for (int ch = `0`; ch < `3`; ch++)
2492	colors[sector][ch] = (blockColors[sector] >> ((`2` - ch) * `5`)) & `15`;
2493	}
2494
2495	uint32_t lowBits = `0`;
2496	uint32_t highBits = `0`;
2497
2498	if (((table & `1`) == `1`) != (blockColors[`0`] > blockColors[`1`]))
2499	{
2500	for (int ch = `0`; ch < `3`; ch++)
2501	std::swap(colors[`0`][ch], colors[`1`][ch]);
2502	sectorBits ^= `0xffff`;
2503	}
2504
2505	int r1 = colors[`0`][`0`];
2506	int g1a = colors[`0`][`1`] >> `1`;
2507	int g1b = (colors[`0`][`1`] & `1`);
2508	int b1a = colors[`0`][`2`] >> `3`;
2509	int b1b = colors[`0`][`2`] & `7`;
2510	int r2 = colors[`1`][`0`];
2511	int g2 = colors[`1`][`1`];
2512	int b2 = colors[`1`][`2`];
2513
2514	// Avoid overflowing R
2515	if ((g1a & `4`) != `0` && r1 + g1a < `8`)
2516	highBits \|= `1` << (`63` - `32`);
2517
2518	int fakeDG = b1b >> `1`;
2519	int fakeG = b1a \| (g1b << `1`);
2520
2521	if (fakeG + fakeDG < `4`)
2522	{
2523	// Overflow low
2524	highBits \|= `1` << (`50` - `32`);
2525	}
2526	else
2527	{
2528	// Overflow high
2529	highBits \|= `7` << (`53` - `32`);
2530	}
2531
2532	int da = (table >> `2`) & `1`;
2533	int db = (table >> `1`) & `1`;
2534
2535	highBits \|= r1 << (`59` - `32`);
2536	highBits \|= g1a << (`56` - `32`);
2537	highBits \|= g1b << (`52` - `32`);
2538	highBits \|= b1a << (`51` - `32`);
2539	highBits \|= b1b << (`47` - `32`);
2540	highBits \|= r2 << (`43` - `32`);
2541	highBits \|= g2 << (`39` - `32`);
2542	highBits \|= b2 << (`35` - `32`);
2543	highBits \|= da << (`34` - `32`);
2544	if (opaque)
2545	highBits \|= `1` << (`33` - `32`);
2546	highBits \|= db << (`32` - `32`);
2547
2548	for (int px = `0`; px < `16`; px++)
2549	{
2550	int sectorBit = (sectorBits >> selectorOrder[px]) & `1`;
2551	int signBit = (signBits >> selectorOrder[px]) & `1`;
2552
2553	lowBits \|= (signBit << px);
2554	lowBits \|= (sectorBit << (`16` + px));
2555	}
2556
2557	uint8_t *output = outputBuffer;
2558
2559	for (int i = `0`; i < `4`; i++)
2560	output[i] = (highBits >> (`24` - i * `8`)) & `0xff`;
2561	for (int i = `0`; i < `4`; i++)
2562	output[i + `4`] = (lowBits >> (`24` - i * `8`)) & `0xff`;
2563	}
2564
2565	void cvtt::Internal::ETCComputer::EmitETC1Block(uint8_t outputBuffer, int* blockBestFlip, int blockBestD, const int blockBestColors[`2`][`3`], const int blockBestTables[`2`], const ParallelMath::ScalarUInt16 blockBestSelectors[`2`], bool transparent)
2566	{
2567	uint32_t highBits = `0`;
2568	uint32_t lowBits = `0`;
2569
2570	if (blockBestD == `0`)
2571	{
2572	highBits \|= blockBestColors[`0`][`0`] << `28`;
2573	highBits \|= blockBestColors[`1`][`0`] << `24`;
2574	highBits \|= blockBestColors[`0`][`1`] << `20`;
2575	highBits \|= blockBestColors[`1`][`1`] << `16`;
2576	highBits \|= blockBestColors[`0`][`2`] << `12`;
2577	highBits \|= blockBestColors[`1`][`2`] << `8`;
2578	}
2579	else
2580	{
2581	highBits \|= blockBestColors[`0`][`0`] << `27`;
2582	highBits \|= ((blockBestColors[`1`][`0`] - blockBestColors[`0`][`0`]) & `7`) << `24`;
2583	highBits \|= blockBestColors[`0`][`1`] << `19`;
2584	highBits \|= ((blockBestColors[`1`][`1`] - blockBestColors[`0`][`1`]) & `7`) << `16`;
2585	highBits \|= blockBestColors[`0`][`2`] << `11`;
2586	highBits \|= ((blockBestColors[`1`][`2`] - blockBestColors[`0`][`2`]) & `7`) << `8`;
2587	}
2588
2589	highBits \|= (blockBestTables[`0`] << `5`);
2590	highBits \|= (blockBestTables[`1`] << `2`);
2591	if (!transparent)
2592	highBits \|= (blockBestD << `1`);
2593	highBits \|= blockBestFlip;
2594
2595	const uint8_t modifierCodes[`4`] = { `3`, `2`, `0`, `1` };
2596
2597	uint8_t unpackedSelectors[`16`];
2598	uint8_t unpackedSelectorCodes[`16`];
2599	for (int sector = `0`; sector < `2`; sector++)
2600	{
2601	int blockSectorBestSelectors = blockBestSelectors[sector];
2602
2603	for (int px = `0`; px < `8`; px++)
2604	{
2605	int selector = (blockSectorBestSelectors >> (`2` * px)) & `3`;
2606	unpackedSelectorCodes[g_flipTables[blockBestFlip][sector][px]] = modifierCodes[selector];
2607	unpackedSelectors[g_flipTables[blockBestFlip][sector][px]] = selector;
2608	}
2609	}
2610
2611	const int pixelSelectorOrder[`16`] = { `0`, `4`, `8`, `12`, `1`, `5`, `9`, `13`, `2`, `6`, `10`, `14`, `3`, `7`, `11`, `15` };
2612
2613	int lowBitOffset = `0`;
2614	for (int sb = `0`; sb < `2`; sb++)
2615	for (int px = `0`; px < `16`; px++)
2616	lowBits \|= ((unpackedSelectorCodes[pixelSelectorOrder[px]] >> sb) & `1`) << (px + sb * `16`);
2617
2618	for (int i = `0`; i < `4`; i++)
2619	outputBuffer[i] = (highBits >> (`24` - i * `8`)) & `0xff`;
2620	for (int i = `0`; i < `4`; i++)
2621	outputBuffer[i + `4`] = (lowBits >> (`24` - i * `8`)) & `0xff`;
2622	}
2623
2624	void cvtt::Internal::ETCComputer::CompressETC1BlockInternal(MFloat &bestTotalError, uint8_t outputBuffer, const* MUInt15 pixels[`16`][`3`], const MFloat preWeightedPixels[`16`][`3`], DifferentialResolveStorage &drs, const Options &options, bool punchthrough)
2625	{
2626	int numTries = `0`;
2627
2628	MUInt15 zeroU15 = ParallelMath::MakeUInt15(`0`);
2629	MUInt16 zeroU16 = ParallelMath::MakeUInt16(`0`);
2630
2631	MUInt15 bestColors[`2`] = { zeroU15, zeroU15 };
2632	MUInt16 bestSelectors[`2`] = { zeroU16, zeroU16 };
2633	MUInt15 bestTables[`2`] = { zeroU15, zeroU15 };
2634	MUInt15 bestFlip = zeroU15;
2635	MUInt15 bestD = zeroU15;
2636
2637	MUInt15 sectorPixels[`2`][`2`][`8`][`3`];
2638	MFloat sectorPreWeightedPixels[`2`][`2`][`8`][`3`];
2639	MUInt15 sectorCumulative[`2`][`2`][`3`];
2640
2641	ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
2642
2643	for (int flip = `0`; flip < `2`; flip++)
2644	{
2645	for (int sector = `0`; sector < `2`; sector++)
2646	{
2647	for (int ch = `0`; ch < `3`; ch++)
2648	sectorCumulative[flip][sector][ch] = zeroU15;
2649
2650	for (int px = `0`; px < `8`; px++)
2651	{
2652	for (int ch = `0`; ch < `3`; ch++)
2653	{
2654	MUInt15 pixelChannelValue = pixels[g_flipTables[flip][sector][px]][ch];
2655	sectorPixels[flip][sector][px][ch] = pixelChannelValue;
2656	sectorPreWeightedPixels[flip][sector][px][ch] = preWeightedPixels[g_flipTables[flip][sector][px]][ch];
2657	sectorCumulative[flip][sector][ch] = sectorCumulative[flip][sector][ch] + pixelChannelValue;
2658	}
2659	}
2660	}
2661	}
2662
2663	static const MSInt16 modifierTables[`8`][`4`] =
2664	{
2665	{ ParallelMath::MakeSInt16(-`8`), ParallelMath::MakeSInt16(-`2`), ParallelMath::MakeSInt16(`2`), ParallelMath::MakeSInt16(`8`) },
2666	{ ParallelMath::MakeSInt16(-`17`), ParallelMath::MakeSInt16(-`5`), ParallelMath::MakeSInt16(`5`), ParallelMath::MakeSInt16(`17`) },
2667	{ ParallelMath::MakeSInt16(-`29`), ParallelMath::MakeSInt16(-`9`), ParallelMath::MakeSInt16(`9`), ParallelMath::MakeSInt16(`29`) },
2668	{ ParallelMath::MakeSInt16(-`42`), ParallelMath::MakeSInt16(-`13`), ParallelMath::MakeSInt16(`13`), ParallelMath::MakeSInt16(`42`) },
2669	{ ParallelMath::MakeSInt16(-`60`), ParallelMath::MakeSInt16(-`18`), ParallelMath::MakeSInt16(`18`), ParallelMath::MakeSInt16(`60`) },
2670	{ ParallelMath::MakeSInt16(-`80`), ParallelMath::MakeSInt16(-`24`), ParallelMath::MakeSInt16(`24`), ParallelMath::MakeSInt16(`80`) },
2671	{ ParallelMath::MakeSInt16(-`106`), ParallelMath::MakeSInt16(-`33`), ParallelMath::MakeSInt16(`33`), ParallelMath::MakeSInt16(`106`) },
2672	{ ParallelMath::MakeSInt16(-`183`), ParallelMath::MakeSInt16(-`47`), ParallelMath::MakeSInt16(`47`), ParallelMath::MakeSInt16(`183`) },
2673	};
2674
2675	bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != `0`);
2676
2677	int minD = punchthrough ? `1` : `0`;
2678
2679	for (int flip = `0`; flip < `2`; flip++)
2680	{
2681	drs.diffNumAttempts[`0`] = drs.diffNumAttempts[`1`] = zeroU15;
2682
2683	MFloat bestIndError[`2`] = { ParallelMath::MakeFloat(FLT_MAX), ParallelMath::MakeFloat(FLT_MAX) };
2684	MUInt16 bestIndSelectors[`2`] = { ParallelMath::MakeUInt16(`0`), ParallelMath::MakeUInt16(`0`) };
2685	MUInt15 bestIndColors[`2`] = { zeroU15, zeroU15 };
2686	MUInt15 bestIndTable[`2`] = { zeroU15, zeroU15 };
2687
2688	for (int d = minD; d < `2`; d++)
2689	{
2690	for (int sector = `0`; sector < `2`; sector++)
2691	{
2692	const int16_t *potentialOffsets = cvtt::Tables::ETC1::g_potentialOffsets4;
2693
2694	for (int table = `0`; table < `8`; table++)
2695	{
2696	int16_t numOffsets = *potentialOffsets++;
2697
2698	MUInt15 possibleColors[cvtt::Tables::ETC1::g_maxPotentialOffsets];
2699
2700	MUInt15 quantized[`3`];
2701	for (int oi = `0`; oi < numOffsets; oi++)
2702	{
2703	if (!isFakeBT709)
2704	{
2705	for (int ch = `0`; ch < `3`; ch++)
2706	{
2707	// cu is in range 0..2040
2708	MUInt15 cu15 = ParallelMath::Min(
2709	ParallelMath::MakeUInt15(`2040`),
2710	ParallelMath::ToUInt15(
2711	ParallelMath::Max(
2712	ParallelMath::MakeSInt16(`0`),
2713	ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + ParallelMath::MakeSInt16(potentialOffsets[oi])
2714	)
2715	)
2716	);
2717
2718	if (d == `1`)
2719	{
2720	//quantized[ch] = (cu 31 + (cu >> 3) + 1024) >> 11;*
2721	quantized[ch] = ParallelMath::ToUInt15(
2722	ParallelMath::RightShift(
2723	(ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << `5`) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, `3`)) + ParallelMath::MakeUInt16(`1024`)
2724	, `11`)
2725	);
2726	}
2727	else
2728	{
2729	//quantized[ch] = (cu 30 + (cu >> 3) + 2048) >> 12;*
2730	quantized[ch] = ParallelMath::ToUInt15(
2731	ParallelMath::RightShift(
2732	(ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << `5`) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15 << `1`) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, `3`)) + ParallelMath::MakeUInt16(`2048`)
2733	, `12`)
2734	);
2735	}
2736	}
2737	}
2738	else
2739	{
2740	MUInt15 offsetCumulative[`3`];
2741	for (int ch = `0`; ch < `3`; ch++)
2742	{
2743	// cu is in range 0..2040
2744	MUInt15 cu15 = ParallelMath::Min(
2745	ParallelMath::MakeUInt15(`2040`),
2746	ParallelMath::ToUInt15(
2747	ParallelMath::Max(
2748	ParallelMath::MakeSInt16(`0`),
2749	ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + ParallelMath::MakeSInt16(potentialOffsets[oi])
2750	)
2751	)
2752	);
2753
2754	offsetCumulative[ch] = cu15;
2755	}
2756
2757	if ((options.flags & cvtt::Flags::ETC_FakeBT709Accurate) != `0`)
2758	ResolveHalfBlockFakeBT709RoundingAccurate(quantized, offsetCumulative, d == `1`);
2759	else
2760	ResolveHalfBlockFakeBT709RoundingFast(quantized, offsetCumulative, d == `1`);
2761	}
2762
2763	possibleColors[oi] = quantized[`0`] \| (quantized[`1`] << `5`) \| (quantized[`2`] << `10`);
2764	}
2765
2766	potentialOffsets += numOffsets;
2767
2768	ParallelMath::UInt15 numUniqueColors;
2769	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
2770	{
2771	uint16_t blockNumUniqueColors = `1`;
2772	for (int i = `1`; i < numOffsets; i++)
2773	{
2774	uint16_t color = ParallelMath::Extract(possibleColors[i], block);
2775	if (color != ParallelMath::Extract(possibleColors[blockNumUniqueColors - `1`], block))
2776	ParallelMath::PutUInt15(possibleColors[blockNumUniqueColors++], block, color);
2777	}
2778
2779	ParallelMath::PutUInt15(numUniqueColors, block, blockNumUniqueColors);
2780	}
2781
2782	int maxUniqueColors = ParallelMath::Extract(numUniqueColors, `0`);
2783	for (int block = `1`; block < ParallelMath::ParallelSize; block++)
2784	maxUniqueColors = std::max<int>(maxUniqueColors, ParallelMath::Extract(numUniqueColors, block));
2785
2786	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
2787	{
2788	uint16_t fillColor = ParallelMath::Extract(possibleColors[`0`], block);
2789	for (int i = ParallelMath::Extract(numUniqueColors, block); i < maxUniqueColors; i++)
2790	ParallelMath::PutUInt15(possibleColors[i], block, fillColor);
2791	}
2792
2793	for (int i = `0`; i < maxUniqueColors; i++)
2794	{
2795	MFloat error = ParallelMath::MakeFloatZero();
2796	MUInt16 selectors = ParallelMath::MakeUInt16(`0`);
2797	MUInt15 quantized = possibleColors[i];
2798	TestHalfBlock(error, selectors, quantized, sectorPixels[flip][sector], sectorPreWeightedPixels[flip][sector], modifierTables[table], d == `1`, options);
2799
2800	if (d == `0`)
2801	{
2802	ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestIndError[sector]));
2803	if (ParallelMath::AnySet(errorBetter))
2804	{
2805	bestIndError[sector] = ParallelMath::Min(error, bestIndError[sector]);
2806	ParallelMath::ConditionalSet(bestIndSelectors[sector], errorBetter, selectors);
2807	ParallelMath::ConditionalSet(bestIndColors[sector], errorBetter, quantized);
2808	ParallelMath::ConditionalSet(bestIndTable[sector], errorBetter, ParallelMath::MakeUInt15(table));
2809	}
2810	}
2811	else
2812	{
2813	ParallelMath::Int16CompFlag isInBounds = ParallelMath::Less(ParallelMath::MakeUInt15(i), numUniqueColors);
2814
2815	MUInt15 storageIndexes = drs.diffNumAttempts[sector];
2816	drs.diffNumAttempts[sector] = drs.diffNumAttempts[sector] + ParallelMath::SelectOrZero(isInBounds, ParallelMath::MakeUInt15(`1`));
2817
2818	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
2819	{
2820	int storageIndex = ParallelMath::Extract(storageIndexes, block);
2821
2822	ParallelMath::PutFloat(drs.diffErrors[sector][storageIndex], block, ParallelMath::Extract(error, block));
2823	ParallelMath::PutUInt16(drs.diffSelectors[sector][storageIndex], block, ParallelMath::Extract(selectors, block));
2824	ParallelMath::PutUInt15(drs.diffColors[sector][storageIndex], block, ParallelMath::Extract(quantized, block));
2825	ParallelMath::PutUInt15(drs.diffTables[sector][storageIndex], block, table);
2826	}
2827	}
2828	}
2829	}
2830	}
2831
2832	if (d == `0`)
2833	{
2834	MFloat bestIndErrorTotal = bestIndError[`0`] + bestIndError[`1`];
2835	ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(bestIndErrorTotal, bestTotalError));
2836	if (ParallelMath::AnySet(errorBetter))
2837	{
2838	bestIsThisMode = bestIsThisMode \| errorBetter;
2839
2840	bestTotalError = ParallelMath::Min(bestTotalError, bestIndErrorTotal);
2841	ParallelMath::ConditionalSet(bestFlip, errorBetter, ParallelMath::MakeUInt15(flip));
2842	ParallelMath::ConditionalSet(bestD, errorBetter, ParallelMath::MakeUInt15(d));
2843	for (int sector = `0`; sector < `2`; sector++)
2844	{
2845	ParallelMath::ConditionalSet(bestColors[sector], errorBetter, bestIndColors[sector]);
2846	ParallelMath::ConditionalSet(bestSelectors[sector], errorBetter, bestIndSelectors[sector]);
2847	ParallelMath::ConditionalSet(bestTables[sector], errorBetter, bestIndTable[sector]);
2848	}
2849	}
2850	}
2851	else
2852	{
2853	ParallelMath::Int16CompFlag canIgnoreSector[`2`] = { ParallelMath::MakeBoolInt16(false), ParallelMath::MakeBoolInt16(false) };
2854	FindBestDifferentialCombination(flip, d, canIgnoreSector, bestIsThisMode, bestTotalError, bestFlip, bestD, bestColors, bestSelectors, bestTables, drs);
2855	}
2856	}
2857	}
2858
2859	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
2860	{
2861	if (!ParallelMath::Extract(bestIsThisMode, block))
2862	continue;
2863
2864	uint32_t highBits = `0`;
2865	uint32_t lowBits = `0`;
2866
2867	int blockBestFlip = ParallelMath::Extract(bestFlip, block);
2868	int blockBestD = ParallelMath::Extract(bestD, block);
2869	int blockBestTables[`2`] = { ParallelMath::Extract(bestTables[`0`], block), ParallelMath::Extract(bestTables[`1`], block) };
2870	ParallelMath::ScalarUInt16 blockBestSelectors[`2`] = { ParallelMath::Extract(bestSelectors[`0`], block), ParallelMath::Extract(bestSelectors[`1`], block) };
2871
2872	int colors[`2`][`3`];
2873	for (int sector = `0`; sector < `2`; sector++)
2874	{
2875	int sectorColor = ParallelMath::Extract(bestColors[sector], block);
2876	for (int ch = `0`; ch < `3`; ch++)
2877	colors[sector][ch] = (sectorColor >> (ch * `5`)) & `31`;
2878	}
2879
2880	EmitETC1Block(outputBuffer + block * `8`, blockBestFlip, blockBestD, colors, blockBestTables, blockBestSelectors, false);
2881	}
2882	}
2883
2884
2885	void cvtt::Internal::ETCComputer::CompressETC1PunchthroughBlockInternal(MFloat &bestTotalError, uint8_t outputBuffer, const* MUInt15 pixels[`16`][`3`], const MFloat preWeightedPixels[`16`][`3`], const ParallelMath::Int16CompFlag isTransparent[`16`], DifferentialResolveStorage &drs, const Options &options)
2886	{
2887	int numTries = `0`;
2888
2889	MUInt15 zeroU15 = ParallelMath::MakeUInt15(`0`);
2890	MUInt16 zeroU16 = ParallelMath::MakeUInt16(`0`);
2891
2892	MUInt15 bestColors[`2`] = { zeroU15, zeroU15 };
2893	MUInt16 bestSelectors[`2`] = { zeroU16, zeroU16 };
2894	MUInt15 bestTables[`2`] = { zeroU15, zeroU15 };
2895	MUInt15 bestFlip = zeroU15;
2896
2897	MUInt15 sectorPixels[`2`][`2`][`8`][`3`];
2898	ParallelMath::Int16CompFlag sectorTransparent[`2`][`2`][`8`];
2899	MFloat sectorPreWeightedPixels[`2`][`2`][`8`][`3`];
2900	MUInt15 sectorCumulative[`2`][`2`][`3`];
2901
2902	ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
2903
2904	for (int flip = `0`; flip < `2`; flip++)
2905	{
2906	for (int sector = `0`; sector < `2`; sector++)
2907	{
2908	for (int ch = `0`; ch < `3`; ch++)
2909	sectorCumulative[flip][sector][ch] = zeroU15;
2910
2911	for (int px = `0`; px < `8`; px++)
2912	{
2913	for (int ch = `0`; ch < `3`; ch++)
2914	{
2915	MUInt15 pixelChannelValue = pixels[g_flipTables[flip][sector][px]][ch];
2916	sectorPixels[flip][sector][px][ch] = pixelChannelValue;
2917	sectorPreWeightedPixels[flip][sector][px][ch] = preWeightedPixels[g_flipTables[flip][sector][px]][ch];
2918	sectorCumulative[flip][sector][ch] = sectorCumulative[flip][sector][ch] + pixelChannelValue;
2919	}
2920
2921	sectorTransparent[flip][sector][px] = isTransparent[g_flipTables[flip][sector][px]];
2922	}
2923	}
2924	}
2925
2926	static const MUInt15 modifiers[`8`] =
2927	{
2928	ParallelMath::MakeUInt15(`8`),
2929	ParallelMath::MakeUInt15(`17`),
2930	ParallelMath::MakeUInt15(`29`),
2931	ParallelMath::MakeUInt15(`42`),
2932	ParallelMath::MakeUInt15(`60`),
2933	ParallelMath::MakeUInt15(`80`),
2934	ParallelMath::MakeUInt15(`106`),
2935	ParallelMath::MakeUInt15(`183`),
2936	};
2937
2938	bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != `0`);
2939
2940	const int maxSectorCumulativeOffsets = `17`;
2941
2942	for (int flip = `0`; flip < `2`; flip++)
2943	{
2944	ParallelMath::Int16CompFlag canIgnoreSector[`2`] = { ParallelMath::MakeBoolInt16(true), ParallelMath::MakeBoolInt16(false) };
2945
2946	for (int sector = `0`; sector < `2`; sector++)
2947	for (int px = `0`; px < `8`; px++)
2948	canIgnoreSector[sector] = canIgnoreSector[sector] & sectorTransparent[flip][sector][px];
2949
2950	drs.diffNumAttempts[`0`] = drs.diffNumAttempts[`1`] = zeroU15;
2951
2952	for (int sector = `0`; sector < `2`; sector++)
2953	{
2954	MUInt15 sectorNumOpaque = ParallelMath::MakeUInt15(`0`);
2955	for (int px = `0`; px < `8`; px++)
2956	sectorNumOpaque = sectorNumOpaque + ParallelMath::SelectOrZero(sectorTransparent[flip][sector][px], ParallelMath::MakeUInt15(`1`));
2957
2958	int sectorMaxOpaque = `0`;
2959	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
2960	sectorMaxOpaque = std::max<int>(sectorMaxOpaque, ParallelMath::Extract(sectorNumOpaque, block));
2961
2962	int sectorNumOpaqueMultipliers = sectorMaxOpaque * `2` + `1`;
2963
2964	MUInt15 sectorNumOpaqueDenominator = ParallelMath::Max(ParallelMath::MakeUInt15(`1`), sectorNumOpaque) << `8`;
2965	MUInt15 sectorNumOpaqueAddend = sectorNumOpaque << `7`;
2966
2967	MSInt16 sectorNumOpaqueSigned = ParallelMath::LosslessCast<MSInt16>::Cast(sectorNumOpaque);
2968	MSInt16 negSectorNumOpaqueSigned = ParallelMath::MakeSInt16(`0`) - sectorNumOpaqueSigned;
2969
2970	MUInt15 sectorCumulativeMax = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(ParallelMath::MakeUInt15(`255`), sectorNumOpaque));
2971
2972	for (int table = `0`; table < `8`; table++)
2973	{
2974	MUInt15 possibleColors[maxSectorCumulativeOffsets];
2975
2976	MUInt15 quantized[`3`];
2977	for (int om = -sectorMaxOpaque; om <= sectorMaxOpaque; om++)
2978	{
2979	MSInt16 clampedOffsetMult = ParallelMath::Max(ParallelMath::Min(ParallelMath::MakeSInt16(om), sectorNumOpaqueSigned), negSectorNumOpaqueSigned);
2980	MSInt16 offset = ParallelMath::CompactMultiply(clampedOffsetMult, modifiers[table]);
2981
2982	for (int ch = `0`; ch < `3`; ch++)
2983	{
2984	// cu is in range 0..255numOpaque (at most 0..2040)*
2985	MUInt15 cu15 = ParallelMath::Min(
2986	sectorCumulativeMax,
2987	ParallelMath::ToUInt15(
2988	ParallelMath::Max(
2989	ParallelMath::MakeSInt16(`0`),
2990	ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + offset
2991	)
2992	)
2993	);
2994
2995	//quantized[ch] = (cu 31 + (cu >> 3) + (numOpaque * 128)) / (numOpaque * 256)*
2996	MUInt16 cuTimes31 = (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << `5`) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15);
2997	MUInt15 cuDiv8 = ParallelMath::RightShift(cu15, `3`);
2998	MUInt16 numerator = cuTimes31 + ParallelMath::LosslessCast<MUInt16>::Cast(cuDiv8 + sectorNumOpaqueAddend);
2999	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
3000	ParallelMath::PutUInt15(quantized[ch], block, ParallelMath::Extract(numerator, block) / ParallelMath::Extract(sectorNumOpaqueDenominator, block));
3001	}
3002
3003	possibleColors[om + sectorMaxOpaque] = quantized[`0`] \| (quantized[`1`] << `5`) \| (quantized[`2`] << `10`);
3004	}
3005
3006	ParallelMath::UInt15 numUniqueColors;
3007	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
3008	{
3009	uint16_t blockNumUniqueColors = `1`;
3010	for (int i = `1`; i < sectorNumOpaqueMultipliers; i++)
3011	{
3012	uint16_t color = ParallelMath::Extract(possibleColors[i], block);
3013	if (color != ParallelMath::Extract(possibleColors[blockNumUniqueColors - `1`], block))
3014	ParallelMath::PutUInt15(possibleColors[blockNumUniqueColors++], block, color);
3015	}
3016
3017	ParallelMath::PutUInt15(numUniqueColors, block, blockNumUniqueColors);
3018	}
3019
3020	int maxUniqueColors = ParallelMath::Extract(numUniqueColors, `0`);
3021	for (int block = `1`; block < ParallelMath::ParallelSize; block++)
3022	maxUniqueColors = std::max<int>(maxUniqueColors, ParallelMath::Extract(numUniqueColors, block));
3023
3024	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
3025	{
3026	uint16_t fillColor = ParallelMath::Extract(possibleColors[`0`], block);
3027	for (int i = ParallelMath::Extract(numUniqueColors, block); i < maxUniqueColors; i++)
3028	ParallelMath::PutUInt15(possibleColors[i], block, fillColor);
3029	}
3030
3031	for (int i = `0`; i < maxUniqueColors; i++)
3032	{
3033	MFloat error = ParallelMath::MakeFloatZero();
3034	MUInt16 selectors = ParallelMath::MakeUInt16(`0`);
3035	MUInt15 quantized = possibleColors[i];
3036	TestHalfBlockPunchthrough(error, selectors, quantized, sectorPixels[flip][sector], sectorPreWeightedPixels[flip][sector], sectorTransparent[flip][sector], modifiers[table], options);
3037
3038	ParallelMath::Int16CompFlag isInBounds = ParallelMath::Less(ParallelMath::MakeUInt15(i), numUniqueColors);
3039
3040	MUInt15 storageIndexes = drs.diffNumAttempts[sector];
3041	drs.diffNumAttempts[sector] = drs.diffNumAttempts[sector] + ParallelMath::SelectOrZero(isInBounds, ParallelMath::MakeUInt15(`1`));
3042
3043	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
3044	{
3045	int storageIndex = ParallelMath::Extract(storageIndexes, block);
3046
3047	ParallelMath::PutFloat(drs.diffErrors[sector][storageIndex], block, ParallelMath::Extract(error, block));
3048	ParallelMath::PutUInt16(drs.diffSelectors[sector][storageIndex], block, ParallelMath::Extract(selectors, block));
3049	ParallelMath::PutUInt15(drs.diffColors[sector][storageIndex], block, ParallelMath::Extract(quantized, block));
3050	ParallelMath::PutUInt15(drs.diffTables[sector][storageIndex], block, table);
3051	}
3052	}
3053	}
3054	}
3055
3056	MUInt15 bestDDummy = ParallelMath::MakeUInt15(`0`);
3057	FindBestDifferentialCombination(flip, `1`, canIgnoreSector, bestIsThisMode, bestTotalError, bestFlip, bestDDummy, bestColors, bestSelectors, bestTables, drs);
3058	}
3059
3060	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
3061	{
3062	if (!ParallelMath::Extract(bestIsThisMode, block))
3063	continue;
3064
3065	int blockBestColors[`2`][`3`];
3066	int blockBestTables[`2`];
3067	ParallelMath::ScalarUInt16 blockBestSelectors[`2`];
3068	for (int sector = `0`; sector < `2`; sector++)
3069	{
3070	int sectorColor = ParallelMath::Extract(bestColors[sector], block);
3071	for (int ch = `0`; ch < `3`; ch++)
3072	blockBestColors[sector][ch] = (sectorColor >> (ch * `5`)) & `31`;
3073
3074	blockBestTables[sector] = ParallelMath::Extract(bestTables[sector], block);
3075	blockBestSelectors[sector] = ParallelMath::Extract(bestSelectors[sector], block);
3076	}
3077
3078	EmitETC1Block(outputBuffer + block * `8`, ParallelMath::Extract(bestFlip, block), `1`, blockBestColors, blockBestTables, blockBestSelectors, true);
3079	}
3080	}
3081
3082
3083	cvtt::ETC1CompressionData cvtt::Internal::ETCComputer::AllocETC1Data(cvtt::Kernels::allocFunc_t allocFunc, void* *context)
3084	{
3085	void buffer = allocFunc(context, sizeof*(cvtt::Internal::ETCComputer::ETC1CompressionDataInternal));
3086	if (!buffer)
3087	return NULL;
3088	new (buffer) cvtt::Internal::ETCComputer::ETC1CompressionDataInternal (context);
3089	return static_cast<ETC1CompressionData*>(buffer);
3090	}
3091
3092	void cvtt::Internal::ETCComputer::ReleaseETC1Data(ETC1CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc)
3093	{
3094	cvtt::Internal::ETCComputer::ETC1CompressionDataInternal* internalData = static_cast<cvtt::Internal::ETCComputer::ETC1CompressionDataInternal*>(compressionData);
3095	void *context = internalData->m_context;
3096	internalData->~ETC1CompressionDataInternal();
3097	freeFunc(context, compressionData, sizeof(cvtt::Internal::ETCComputer::ETC1CompressionDataInternal));
3098	}
3099
3100	cvtt::ETC2CompressionData cvtt::Internal::ETCComputer::AllocETC2Data(cvtt::Kernels::allocFunc_t allocFunc, void* context, const* cvtt::Options &options)
3101	{
3102	void buffer = allocFunc(context, sizeof*(cvtt::Internal::ETCComputer::ETC2CompressionDataInternal));
3103	if (!buffer)
3104	return NULL;
3105	new (buffer) cvtt::Internal::ETCComputer::ETC2CompressionDataInternal (context, options);
3106	return static_cast<ETC2CompressionData*>(buffer);
3107	}
3108
3109	void cvtt::Internal::ETCComputer::ReleaseETC2Data(ETC2CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc)
3110	{
3111	cvtt::Internal::ETCComputer::ETC2CompressionDataInternal* internalData = static_cast<cvtt::Internal::ETCComputer::ETC2CompressionDataInternal*>(compressionData);
3112	void *context = internalData->m_context;
3113	internalData->~ETC2CompressionDataInternal();
3114	freeFunc(context, compressionData, sizeof(cvtt::Internal::ETCComputer::ETC2CompressionDataInternal));
3115	}
3116
3117	cvtt::Internal::ETCComputer::ETC2CompressionDataInternal::ETC2CompressionDataInternal(void context, const* cvtt::Options &options)
3118	: m_context(context)
3119	{
3120	const float cd[`3`] = { options.redWeight, options.greenWeight, options.blueWeight };
3121	const float rotCD[`3`] = { cd[`1`], cd[`2`], cd[`0`] };
3122
3123	const float offs = -(rotCD[`0`] * cd[`0`] + rotCD[`1`] * cd[`1`] + rotCD[`2`] * cd[`2`]) / (cd[`0`] * cd[`0`] + cd[`1`] * cd[`1`] + cd[`2`] * cd[`2`]);
3124
3125	const float chromaAxis0[`3`] = { rotCD[`0`] + cd[`0`] * offs, rotCD[`1`] + cd[`1`] * offs, rotCD[`2`] + cd[`2`] * offs };
3126
3127	const float chromaAxis1Unnormalized[`3`] =
3128	{
3129	chromaAxis0[`1`] * cd[`2`] - chromaAxis0[`2`] * cd[`1`],
3130	chromaAxis0[`2`] * cd[`0`] - chromaAxis0[`0`] * cd[`2`],
3131	chromaAxis0[`0`] * cd[`1`] - chromaAxis0[`1`] * cd[`0`]
3132	};
3133
3134	const float ca0LengthSq = (chromaAxis0[`0`] * chromaAxis0[`0`] + chromaAxis0[`1`] * chromaAxis0[`1`] + chromaAxis0[`2`] * chromaAxis0[`2`]);
3135	const float ca1UNLengthSq = (chromaAxis1Unnormalized[`0`] * chromaAxis1Unnormalized[`0`] + chromaAxis1Unnormalized[`1`] * chromaAxis1Unnormalized[`1`] + chromaAxis1Unnormalized[`2`] * chromaAxis1Unnormalized[`2`]);
3136	const float lengthRatio = static_cast<float>(std::sqrt(ca0LengthSq / ca1UNLengthSq));
3137
3138	const float chromaAxis1[`3`] = { chromaAxis1Unnormalized[`0`] * lengthRatio, chromaAxis1Unnormalized[`1`] * lengthRatio, chromaAxis1Unnormalized[`2`] * lengthRatio };
3139
3140	for (int i = `0`; i < `3`; i++)
3141	{
3142	m_chromaSideAxis0[i] = chromaAxis0[i];
3143	m_chromaSideAxis1[i] = chromaAxis1[i];
3144	}
3145	}
3146
3147	#endif
3148

Browse the source code of Godot/thirdparty/cvtt/ConvectionKernels_ETC.cpp