ConvectionKernels_S3TC.cpp source code [Godot/thirdparty/cvtt/ConvectionKernels_S3TC.cpp]

1	/*
2	Convection Texture Tools
3	Copyright (c) 2018-2019 Eric Lasota
4
5	Permission is hereby granted, free of charge, to any person obtaining
6	a copy of this software and associated documentation files (the
7	"Software"), to deal in the Software without restriction, including
8	without limitation the rights to use, copy, modify, merge, publish,
9	distribute, sublicense, and/or sell copies of the Software, and to
10	permit persons to whom the Software is furnished to do so, subject
11	to the following conditions:
12
13	The above copyright notice and this permission notice shall be included
14	in all copies or substantial portions of the Software.
15
16	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24	-------------------------------------------------------------------------------------
25
26	Portions based on DirectX Texture Library (DirectXTex)
27
28	Copyright (c) Microsoft Corporation. All rights reserved.
29	Licensed under the MIT License.
30
31	http://go.microsoft.com/fwlink/?LinkId=248926
32	*/
33	#include "ConvectionKernels_Config.h"
34
35	#if !defined(CVTT_SINGLE_FILE) \|\| defined(CVTT_SINGLE_FILE_IMPL)
36
37	#include "ConvectionKernels_S3TC.h"
38
39	#include "ConvectionKernels_AggregatedError.h"
40	#include "ConvectionKernels_BCCommon.h"
41	#include "ConvectionKernels_EndpointRefiner.h"
42	#include "ConvectionKernels_EndpointSelector.h"
43	#include "ConvectionKernels_IndexSelector.h"
44	#include "ConvectionKernels_UnfinishedEndpoints.h"
45	#include "ConvectionKernels_S3TC_SingleColor.h"
46
47	void cvtt::Internal::S3TCComputer::Init(MFloat& error)
48	{
49	error = ParallelMath::MakeFloat(FLT_MAX);
50	}
51
52	void cvtt::Internal::S3TCComputer::QuantizeTo6Bits(MUInt15& v)
53	{
54	MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(`253`)) + ParallelMath::MakeUInt16(`512`), `10`));
55	v = (reduced << `2`) \| ParallelMath::RightShift(reduced, `4`);
56	}
57
58	void cvtt::Internal::S3TCComputer::QuantizeTo5Bits(MUInt15& v)
59	{
60	MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(`249`)) + ParallelMath::MakeUInt16(`1024`), `11`));
61	v = (reduced << `3`) \| ParallelMath::RightShift(reduced, `2`);
62	}
63
64	void cvtt::Internal::S3TCComputer::QuantizeTo565(MUInt15 endPoint[`3`])
65	{
66	QuantizeTo5Bits(endPoint[`0`]);
67	QuantizeTo6Bits(endPoint[`1`]);
68	QuantizeTo5Bits(endPoint[`2`]);
69	}
70
71	cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidFactorForSpan(const MSInt16& span)
72	{
73	return ParallelMath::Abs(ParallelMath::ToFloat(span)) * `0.03f`;
74	}
75
76	cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d)
77	{
78	MFloat absDiff = ParallelMath::Abs(ParallelMath::ToFloat(ParallelMath::LosslessCast<MSInt16>::Cast(a) - ParallelMath::LosslessCast<MSInt16>::Cast(b)));
79	absDiff = absDiff + d;
80	return absDiff * absDiff;
81	}
82
83	void cvtt::Internal::S3TCComputer::TestSingleColor(uint32_t flags, const MUInt15 pixels[`16`][`4`], const MFloat floatPixels[`16`][`4`], int range, const float* channelWeights,
84	MFloat &bestError, MUInt15 bestEndpoints[`2`][`3`], MUInt15 bestIndexes[`16`], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn)
85	{
86	float channelWeightsSq[`3`];
87
88	for (int ch = `0`; ch < `3`; ch++)
89	channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
90
91	MUInt15 totals[`3`] = { ParallelMath::MakeUInt15(`0`), ParallelMath::MakeUInt15(`0`), ParallelMath::MakeUInt15(`0`) };
92
93	for (int px = `0`; px < `16`; px++)
94	{
95	for (int ch = `0`; ch < `3`; ch++)
96	totals[ch] = totals[ch] + pixels[px][ch];
97	}
98
99	MUInt15 average[`3`];
100	for (int ch = `0`; ch < `3`; ch++)
101	average[ch] = ParallelMath::RightShift(totals[ch] + ParallelMath::MakeUInt15(`8`), `4`);
102
103	const Tables::S3TCSC::TableEntry* rbTable = NULL;
104	const Tables::S3TCSC::TableEntry* gTable = NULL;
105	if (flags & cvtt::Flags::S3TC_Paranoid)
106	{
107	if (range == `4`)
108	{
109	rbTable = Tables::S3TCSC::g_singleColor5_3_p;
110	gTable = Tables::S3TCSC::g_singleColor6_3_p;
111	}
112	else
113	{
114	assert(range == `3`);
115	rbTable = Tables::S3TCSC::g_singleColor5_2_p;
116	gTable = Tables::S3TCSC::g_singleColor6_2_p;
117	}
118	}
119	else
120	{
121	if (range == `4`)
122	{
123	rbTable = Tables::S3TCSC::g_singleColor5_3;
124	gTable = Tables::S3TCSC::g_singleColor6_3;
125	}
126	else
127	{
128	assert(range == `3`);
129	rbTable = Tables::S3TCSC::g_singleColor5_2;
130	gTable = Tables::S3TCSC::g_singleColor6_2;
131	}
132	}
133
134	MUInt15 interpolated[`3`];
135	MUInt15 eps[`2`][`3`];
136	MSInt16 spans[`3`];
137	for (int i = `0`; i < ParallelMath::ParallelSize; i++)
138	{
139	for (int ch = `0`; ch < `3`; ch++)
140	{
141	uint16_t avg = ParallelMath::Extract(average[ch], i);
142	const Tables::S3TCSC::TableEntry& tableEntry = ((ch == `1`) ? gTable[avg] : rbTable[avg]);
143	ParallelMath::PutUInt15(eps[`0`][ch], i, tableEntry.m_min);
144	ParallelMath::PutUInt15(eps[`1`][ch], i, tableEntry.m_max);
145	ParallelMath::PutUInt15(interpolated[ch], i, tableEntry.m_actualColor);
146	ParallelMath::PutSInt16(spans[ch], i, tableEntry.m_span);
147	}
148	}
149
150	MFloat error = ParallelMath::MakeFloatZero();
151	if (flags & cvtt::Flags::S3TC_Paranoid)
152	{
153	MFloat spanParanoidFactors[`3`];
154	for (int ch = `0`; ch < `3`; ch++)
155	spanParanoidFactors[ch] = ParanoidFactorForSpan(spans[ch]);
156
157	for (int px = `0`; px < `16`; px++)
158	{
159	for (int ch = `0`; ch < `3`; ch++)
160	error = error + ParanoidDiff(interpolated[ch], pixels[px][ch], spanParanoidFactors[ch]) * channelWeightsSq[ch];
161	}
162	}
163	else
164	{
165	for (int px = `0`; px < `16`; px++)
166	{
167	for (int ch = `0`; ch < `3`; ch++)
168	error = error + ParallelMath::ToFloat(ParallelMath::SqDiffUInt8(interpolated[ch], pixels[px][ch])) * channelWeightsSq[ch];
169	}
170	}
171
172	ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
173	ParallelMath::Int16CompFlag better16 = ParallelMath::FloatFlagToInt16(better);
174
175	if (ParallelMath::AnySet(better16))
176	{
177	bestError = ParallelMath::Min(bestError, error);
178	for (int epi = `0`; epi < `2`; epi++)
179	for (int ch = `0`; ch < `3`; ch++)
180	ParallelMath::ConditionalSet(bestEndpoints[epi][ch], better16, eps[epi][ch]);
181
182	MUInt15 vindexes = ParallelMath::MakeUInt15(`1`);
183	for (int px = `0`; px < `16`; px++)
184	ParallelMath::ConditionalSet(bestIndexes[px], better16, vindexes);
185
186	ParallelMath::ConditionalSet(bestRange, better16, ParallelMath::MakeUInt15(range));
187	}
188	}
189
190	void cvtt::Internal::S3TCComputer::TestEndpoints(uint32_t flags, const MUInt15 pixels[`16`][`4`], const MFloat floatPixels[`16`][`4`], const MFloat preWeightedPixels[`16`][`4`], const MUInt15 unquantizedEndPoints[`2`][`3`], int range, const float* channelWeights,
191	MFloat &bestError, MUInt15 bestEndpoints[`2`][`3`], MUInt15 bestIndexes[`16`], MUInt15 &bestRange, EndpointRefiner<`3`> refiner, const* ParallelMath::RoundTowardNearestForScope *rtn)
192	{
193	float channelWeightsSq[`3`];
194
195	for (int ch = `0`; ch < `3`; ch++)
196	channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
197
198	MUInt15 endPoints[`2`][`3`];
199
200	for (int ep = `0`; ep < `2`; ep++)
201	for (int ch = `0`; ch < `3`; ch++)
202	endPoints[ep][ch] = unquantizedEndPoints[ep][ch];
203
204	QuantizeTo565(endPoints[`0`]);
205	QuantizeTo565(endPoints[`1`]);
206
207	IndexSelector<`3`> selector;
208	selector.Init<false>(channelWeights, endPoints, range);
209
210	MUInt15 indexes[`16`];
211
212	MFloat paranoidFactors[`3`];
213	for (int ch = `0`; ch < `3`; ch++)
214	paranoidFactors[ch] = ParanoidFactorForSpan(ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[`0`][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[`1`][ch]));
215
216	MFloat error = ParallelMath::MakeFloatZero();
217	AggregatedError<`3`> aggError;
218	for (int px = `0`; px < `16`; px++)
219	{
220	MUInt15 index = selector.SelectIndexLDR(floatPixels[px], rtn);
221	indexes[px] = index;
222
223	if (refiner)
224	refiner->ContributeUnweightedPW(preWeightedPixels[px], index);
225
226	MUInt15 reconstructed[`3`];
227	selector.ReconstructLDRPrecise(index, reconstructed);
228
229	if (flags & Flags::S3TC_Paranoid)
230	{
231	for (int ch = `0`; ch < `3`; ch++)
232	error = error + ParanoidDiff(reconstructed[ch], pixels[px][ch], paranoidFactors[ch]) * channelWeightsSq[ch];
233	}
234	else
235	BCCommon::ComputeErrorLDR<`3`>(flags, reconstructed, pixels[px], aggError);
236	}
237
238	if (!(flags & Flags::S3TC_Paranoid))
239	error = aggError.Finalize(flags, channelWeightsSq);
240
241	ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
242
243	if (ParallelMath::AnySet(better))
244	{
245	ParallelMath::Int16CompFlag betterInt16 = ParallelMath::FloatFlagToInt16(better);
246
247	ParallelMath::ConditionalSet(bestError, better, error);
248
249	for (int ep = `0`; ep < `2`; ep++)
250	for (int ch = `0`; ch < `3`; ch++)
251	ParallelMath::ConditionalSet(bestEndpoints[ep][ch], betterInt16, endPoints[ep][ch]);
252
253	for (int px = `0`; px < `16`; px++)
254	ParallelMath::ConditionalSet(bestIndexes[px], betterInt16, indexes[px]);
255
256	ParallelMath::ConditionalSet(bestRange, betterInt16, ParallelMath::MakeUInt15(static_cast<uint16_t>(range)));
257	}
258	}
259
260	void cvtt::Internal::S3TCComputer::TestCounts(uint32_t flags, const int counts, int* nCounts, const MUInt15 &numElements, const MUInt15 pixels[`16`][`4`], const MFloat floatPixels[`16`][`4`], const MFloat preWeightedPixels[`16`][`4`], bool alphaTest,
261	const MFloat floatSortedInputs[`16`][`4`], const MFloat preWeightedFloatSortedInputs[`16`][`4`], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[`2`][`3`], MUInt15 bestIndexes[`16`], MUInt15 &bestRange,
262	const ParallelMath::RoundTowardNearestForScope* rtn)
263	{
264	UNREFERENCED_PARAMETER(alphaTest);
265	UNREFERENCED_PARAMETER(flags);
266
267	EndpointRefiner<`3`> refiner;
268
269	refiner.Init(nCounts, channelWeights);
270
271	bool escape = false;
272	int e = `0`;
273	for (int i = `0`; i < nCounts; i++)
274	{
275	for (int n = `0`; n < counts[i]; n++)
276	{
277	ParallelMath::Int16CompFlag valid = ParallelMath::Less(ParallelMath::MakeUInt15(static_cast<uint16_t>(n)), numElements);
278	if (!ParallelMath::AnySet(valid))
279	{
280	escape = true;
281	break;
282	}
283
284	if (ParallelMath::AllSet(valid))
285	refiner.ContributeUnweightedPW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));
286	else
287	{
288	MFloat weight = ParallelMath::Select(ParallelMath::Int16FlagToFloat(valid), ParallelMath::MakeFloat(`1.0f`), ParallelMath::MakeFloat(`0.0f`));
289	refiner.ContributePW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), weight);
290	}
291	}
292
293	if (escape)
294	break;
295	}
296
297	MUInt15 endPoints[`2`][`3`];
298	refiner.GetRefinedEndpointsLDR(endPoints, rtn);
299
300	TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, nCounts, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, NULL, rtn);
301	}
302
303	void cvtt::Internal::S3TCComputer::PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride)
304	{
305	UNREFERENCED_PARAMETER(flags);
306	ParallelMath::RoundTowardNearestForScope rtn;
307
308	float weights[`1`] = { `1.0f` };
309
310	MUInt15 pixels[`16`];
311	MFloat floatPixels[`16`];
312
313	for (int px = `0`; px < `16`; px++)
314	{
315	ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
316	floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
317	}
318
319	MUInt15 ep[`2`][`1`] = { { ParallelMath::MakeUInt15(`0`) },{ ParallelMath::MakeUInt15(`255`) } };
320
321	IndexSelector<`1`> selector;
322	selector.Init<false>(weights, ep, `16`);
323
324	MUInt15 indexes[`16`];
325
326	for (int px = `0`; px < `16`; px++)
327	indexes[px] = selector.SelectIndexLDR(&floatPixels[px], &rtn);
328
329	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
330	{
331	for (int px = `0`; px < `16`; px += `2`)
332	{
333	int index0 = ParallelMath::Extract(indexes[px], block);
334	int index1 = ParallelMath::Extract(indexes[px + `1`], block);
335
336	packedBlocks[px / `2`] = static_cast<uint8_t>(index0 \| (index1 << `4`));
337	}
338
339	packedBlocks += packedBlockStride;
340	}
341	}
342
343	void cvtt::Internal::S3TCComputer::PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds)
344	{
345	if (maxTweakRounds < `1`)
346	maxTweakRounds = `1`;
347
348	if (numRefineRounds < `1`)
349	numRefineRounds = `1`;
350
351	ParallelMath::RoundTowardNearestForScope rtn;
352
353	float oneWeight[`1`] = { `1.0f` };
354
355	MUInt15 pixels[`16`];
356	MFloat floatPixels[`16`];
357
358	MUInt15 highTerminal = isSigned ? ParallelMath::MakeUInt15(`254`) : ParallelMath::MakeUInt15(`255`);
359	MUInt15 highTerminalMinusOne = highTerminal - ParallelMath::MakeUInt15(`1`);
360
361	for (int px = `0`; px < `16`; px++)
362	{
363	ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
364
365	if (isSigned)
366	pixels[px] = ParallelMath::Min(pixels[px], highTerminal);
367
368	floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
369	}
370
371	MUInt15 sortedPixels[`16`];
372	for (int px = `0`; px < `16`; px++)
373	sortedPixels[px] = pixels[px];
374
375	for (int sortEnd = `15`; sortEnd > `0`; sortEnd--)
376	{
377	for (int sortOffset = `0`; sortOffset < sortEnd; sortOffset++)
378	{
379	MUInt15 a = sortedPixels[sortOffset];
380	MUInt15 b = sortedPixels[sortOffset + `1`];
381
382	sortedPixels[sortOffset] = ParallelMath::Min(a, b);
383	sortedPixels[sortOffset + `1`] = ParallelMath::Max(a, b);
384	}
385	}
386
387	MUInt15 zero = ParallelMath::MakeUInt15(`0`);
388	MUInt15 one = ParallelMath::MakeUInt15(`1`);
389
390	MUInt15 bestIsFullRange = zero;
391	MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
392	MUInt15 bestEP[`2`] = { zero, zero };
393	MUInt15 bestIndexes[`16`] = {
394	zero, zero, zero, zero,
395	zero, zero, zero, zero,
396	zero, zero, zero, zero,
397	zero, zero, zero, zero
398	};
399
400	// Full-precision
401	{
402	MUInt15 minEP = sortedPixels[`0`];
403	MUInt15 maxEP = sortedPixels[`15`];
404
405	MFloat base[`1`] = { ParallelMath::ToFloat(minEP) };
406	MFloat offset[`1`] = { ParallelMath::ToFloat(maxEP - minEP) };
407
408	UnfinishedEndpoints<`1`> ufep = UnfinishedEndpoints<`1`>(base, offset);
409
410	int numTweakRounds = BCCommon::TweakRoundsForRange(`8`);
411	if (numTweakRounds > maxTweakRounds)
412	numTweakRounds = maxTweakRounds;
413
414	for (int tweak = `0`; tweak < numTweakRounds; tweak++)
415	{
416	MUInt15 ep[`2`][`1`];
417
418	ufep.FinishLDR(tweak, `8`, ep[`0`], ep[`1`]);
419
420	for (int refinePass = `0`; refinePass < numRefineRounds; refinePass++)
421	{
422	EndpointRefiner<`1`> refiner;
423	refiner.Init(`8`, oneWeight);
424
425	if (isSigned)
426	for (int epi = `0`; epi < `2`; epi++)
427	ep[epi][`0`] = ParallelMath::Min(ep[epi][`0`], highTerminal);
428
429	IndexSelector<`1`> indexSelector;
430	indexSelector.Init<false>(oneWeight, ep, `8`);
431
432	MUInt15 indexes[`16`];
433
434	AggregatedError<`1`> aggError;
435	for (int px = `0`; px < `16`; px++)
436	{
437	MUInt15 index = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
438
439	MUInt15 reconstructedPixel;
440
441	indexSelector.ReconstructLDRPrecise(index, &reconstructedPixel);
442	BCCommon::ComputeErrorLDR<`1`>(flags, &reconstructedPixel, &pixels[px], aggError);
443
444	if (refinePass != numRefineRounds - `1`)
445	refiner.ContributeUnweightedPW(&floatPixels[px], index);
446
447	indexes[px] = index;
448	}
449	MFloat error = aggError.Finalize(flags \| Flags::Uniform, oneWeight);
450
451	ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
452	ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
453
454	if (ParallelMath::AnySet(errorBetter16))
455	{
456	bestError = ParallelMath::Min(error, bestError);
457	ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, one);
458	for (int px = `0`; px < `16`; px++)
459	ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
460
461	for (int epi = `0`; epi < `2`; epi++)
462	ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][`0`]);
463	}
464
465	if (refinePass != numRefineRounds - `1`)
466	refiner.GetRefinedEndpointsLDR(ep, &rtn);
467	}
468	}
469	}
470
471	// Reduced precision with special endpoints
472	{
473	MUInt15 bestHeuristicMin = sortedPixels[`0`];
474	MUInt15 bestHeuristicMax = sortedPixels[`15`];
475
476	ParallelMath::Int16CompFlag canTryClipping;
477
478	// In reduced precision, we want try putting endpoints at the reserved indexes at the ends.
479	// The heuristic we use is to assign indexes to the end as long as they aren't off by more than half of the index range.
480	// This will usually not find anything, but it's cheap to check.
481
482	{
483	MUInt15 largestPossibleRange = bestHeuristicMax - bestHeuristicMin; // Max: 255
484	MUInt15 lowestPossibleClearance = ParallelMath::Min(bestHeuristicMin, static_cast<MUInt15>(highTerminal - bestHeuristicMax));
485
486	MUInt15 lowestPossibleClearanceTimes10 = (lowestPossibleClearance << `2`) + (lowestPossibleClearance << `4`);
487	canTryClipping = ParallelMath::LessOrEqual(lowestPossibleClearanceTimes10, largestPossibleRange);
488	}
489
490	if (ParallelMath::AnySet(canTryClipping))
491	{
492	MUInt15 lowClearances[`16`];
493	MUInt15 highClearances[`16`];
494	MUInt15 bestSkipCount = ParallelMath::MakeUInt15(`0`);
495
496	lowClearances[`0`] = highClearances[`0`] = ParallelMath::MakeUInt15(`0`);
497
498	for (int px = `1`; px < `16`; px++)
499	{
500	lowClearances[px] = sortedPixels[px - `1`];
501	highClearances[px] = highTerminal - sortedPixels[`16` - px];
502	}
503
504	for (uint16_t firstIndex = `0`; firstIndex < `16`; firstIndex++)
505	{
506	uint16_t numSkippedLow = firstIndex;
507
508	MUInt15 lowClearance = lowClearances[firstIndex];
509
510	for (uint16_t lastIndex = firstIndex; lastIndex < `16`; lastIndex++)
511	{
512	uint16_t numSkippedHigh = `15` - lastIndex;
513	uint16_t numSkipped = numSkippedLow + numSkippedHigh;
514
515	MUInt15 numSkippedV = ParallelMath::MakeUInt15(numSkipped);
516
517	ParallelMath::Int16CompFlag areMoreSkipped = ParallelMath::Less(bestSkipCount, numSkippedV);
518
519	if (!ParallelMath::AnySet(areMoreSkipped))
520	continue;
521
522	MUInt15 clearance = ParallelMath::Max(highClearances[numSkippedHigh], lowClearance);
523	MUInt15 clearanceTimes10 = (clearance << `2`) + (clearance << `4`);
524
525	MUInt15 range = sortedPixels[lastIndex] - sortedPixels[firstIndex];
526
527	ParallelMath::Int16CompFlag isBetter = (areMoreSkipped & ParallelMath::LessOrEqual(clearanceTimes10, range));
528	ParallelMath::ConditionalSet(bestHeuristicMin, isBetter, sortedPixels[firstIndex]);
529	ParallelMath::ConditionalSet(bestHeuristicMax, isBetter, sortedPixels[lastIndex]);
530	}
531	}
532	}
533
534	MUInt15 bestSimpleMin = one;
535	MUInt15 bestSimpleMax = highTerminalMinusOne;
536
537	for (int px = `0`; px < `16`; px++)
538	{
539	ParallelMath::ConditionalSet(bestSimpleMin, ParallelMath::Less(zero, sortedPixels[`15` - px]), sortedPixels[`15` - px]);
540	ParallelMath::ConditionalSet(bestSimpleMax, ParallelMath::Less(sortedPixels[px], highTerminal), sortedPixels[px]);
541	}
542
543	MUInt15 minEPs[`2`] = { bestSimpleMin, bestHeuristicMin };
544	MUInt15 maxEPs[`2`] = { bestSimpleMax, bestHeuristicMax };
545
546	int minEPRange = `2`;
547	if (ParallelMath::AllSet(ParallelMath::Equal(minEPs[`0`], minEPs[`1`])))
548	minEPRange = `1`;
549
550	int maxEPRange = `2`;
551	if (ParallelMath::AllSet(ParallelMath::Equal(maxEPs[`0`], maxEPs[`1`])))
552	maxEPRange = `1`;
553
554	for (int minEPIndex = `0`; minEPIndex < minEPRange; minEPIndex++)
555	{
556	for (int maxEPIndex = `0`; maxEPIndex < maxEPRange; maxEPIndex++)
557	{
558	MFloat base[`1`] = { ParallelMath::ToFloat(minEPs[minEPIndex]) };
559	MFloat offset[`1`] = { ParallelMath::ToFloat(maxEPs[maxEPIndex] - minEPs[minEPIndex]) };
560
561	UnfinishedEndpoints<`1`> ufep = UnfinishedEndpoints<`1`>(base, offset);
562
563	int numTweakRounds = BCCommon::TweakRoundsForRange(`6`);
564	if (numTweakRounds > maxTweakRounds)
565	numTweakRounds = maxTweakRounds;
566
567	for (int tweak = `0`; tweak < numTweakRounds; tweak++)
568	{
569	MUInt15 ep[`2`][`1`];
570
571	ufep.FinishLDR(tweak, `8`, ep[`0`], ep[`1`]);
572
573	for (int refinePass = `0`; refinePass < numRefineRounds; refinePass++)
574	{
575	EndpointRefiner<`1`> refiner;
576	refiner.Init(`6`, oneWeight);
577
578	if (isSigned)
579	for (int epi = `0`; epi < `2`; epi++)
580	ep[epi][`0`] = ParallelMath::Min(ep[epi][`0`], highTerminal);
581
582	IndexSelector<`1`> indexSelector;
583	indexSelector.Init<false>(oneWeight, ep, `6`);
584
585	MUInt15 indexes[`16`];
586	MFloat error = ParallelMath::MakeFloatZero();
587
588	for (int px = `0`; px < `16`; px++)
589	{
590	MUInt15 selectedIndex = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
591
592	MUInt15 reconstructedPixel;
593
594	indexSelector.ReconstructLDRPrecise(selectedIndex, &reconstructedPixel);
595
596	MFloat zeroError = BCCommon::ComputeErrorLDRSimple<`1`>(flags \| Flags::Uniform, &zero, &pixels[px], `1`, oneWeight);
597	MFloat highTerminalError = BCCommon::ComputeErrorLDRSimple<`1`>(flags \| Flags::Uniform, &highTerminal, &pixels[px], `1`, oneWeight);
598	MFloat selectedIndexError = BCCommon::ComputeErrorLDRSimple<`1`>(flags \| Flags::Uniform, &reconstructedPixel, &pixels[px], `1`, oneWeight);
599
600	MFloat bestPixelError = zeroError;
601	MUInt15 index = ParallelMath::MakeUInt15(`6`);
602
603	ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(ParallelMath::Less(highTerminalError, bestPixelError)), ParallelMath::MakeUInt15(`7`));
604	bestPixelError = ParallelMath::Min(bestPixelError, highTerminalError);
605
606	ParallelMath::FloatCompFlag selectedIndexBetter = ParallelMath::Less(selectedIndexError, bestPixelError);
607
608	if (ParallelMath::AllSet(selectedIndexBetter))
609	{
610	if (refinePass != numRefineRounds - `1`)
611	refiner.ContributeUnweightedPW(&floatPixels[px], selectedIndex);
612	}
613	else
614	{
615	MFloat refineWeight = ParallelMath::Select(selectedIndexBetter, ParallelMath::MakeFloat(`1.0f`), ParallelMath::MakeFloatZero());
616
617	if (refinePass != numRefineRounds - `1`)
618	refiner.ContributePW(&floatPixels[px], selectedIndex, refineWeight);
619	}
620
621	ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(selectedIndexBetter), selectedIndex);
622	bestPixelError = ParallelMath::Min(bestPixelError, selectedIndexError);
623
624	error = error + bestPixelError;
625
626	indexes[px] = index;
627	}
628
629	ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
630	ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
631
632	if (ParallelMath::AnySet(errorBetter16))
633	{
634	bestError = ParallelMath::Min(error, bestError);
635	ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, zero);
636	for (int px = `0`; px < `16`; px++)
637	ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
638
639	for (int epi = `0`; epi < `2`; epi++)
640	ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][`0`]);
641	}
642
643	if (refinePass != numRefineRounds - `1`)
644	refiner.GetRefinedEndpointsLDR(ep, &rtn);
645	}
646	}
647	}
648	}
649	}
650
651	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
652	{
653	int ep0 = ParallelMath::Extract(bestEP[`0`], block);
654	int ep1 = ParallelMath::Extract(bestEP[`1`], block);
655	int isFullRange = ParallelMath::Extract(bestIsFullRange, block);
656
657	if (isSigned)
658	{
659	ep0 -= `127`;
660	ep1 -= `127`;
661
662	assert(ep0 >= -`127` && ep0 <= `127`);
663	assert(ep1 >= -`127` && ep1 <= `127`);
664	}
665
666
667	bool swapEndpoints = (isFullRange != `0`) != (ep0 > ep1);
668
669	if (swapEndpoints)
670	std::swap(ep0, ep1);
671
672	uint16_t dumpBits = `0`;
673	int dumpBitsOffset = `0`;
674	int dumpByteOffset = `2`;
675	packedBlocks[`0`] = static_cast<uint8_t>(ep0 & `0xff`);
676	packedBlocks[`1`] = static_cast<uint8_t>(ep1 & `0xff`);
677
678	int maxValue = (isFullRange != `0`) ? `7` : `5`;
679
680	for (int px = `0`; px < `16`; px++)
681	{
682	int index = ParallelMath::Extract(bestIndexes[px], block);
683
684	if (swapEndpoints && index <= maxValue)
685	index = maxValue - index;
686
687	if (index != `0`)
688	{
689	if (index == maxValue)
690	index = `1`;
691	else if (index < maxValue)
692	index++;
693	}
694
695	assert(index >= `0` && index < `8`);
696
697	dumpBits \|= static_cast<uint16_t>(index << dumpBitsOffset);
698	dumpBitsOffset += `3`;
699
700	if (dumpBitsOffset >= `8`)
701	{
702	assert(dumpByteOffset < `8`);
703	packedBlocks[dumpByteOffset] = static_cast<uint8_t>(dumpBits & `0xff`);
704	dumpBits >>= `8`;
705	dumpBitsOffset -= `8`;
706	dumpByteOffset++;
707	}
708	}
709
710	assert(dumpBitsOffset == `0`);
711	assert(dumpByteOffset == `8`);
712
713	packedBlocks += packedBlockStride;
714	}
715	}
716
717	void cvtt::Internal::S3TCComputer::PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[`4`], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds)
718	{
719	ParallelMath::RoundTowardNearestForScope rtn;
720
721	if (numRefineRounds < `1`)
722	numRefineRounds = `1`;
723
724	if (maxTweakRounds < `1`)
725	maxTweakRounds = `1`;
726
727	EndpointSelector<`3`, `8`> endpointSelector;
728
729	MUInt15 pixels[`16`][`4`];
730	MFloat floatPixels[`16`][`4`];
731
732	MFloat preWeightedPixels[`16`][`4`];
733
734	for (int px = `0`; px < `16`; px++)
735	{
736	for (int ch = `0`; ch < `4`; ch++)
737	ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
738	}
739
740	for (int px = `0`; px < `16`; px++)
741	{
742	for (int ch = `0`; ch < `4`; ch++)
743	floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
744	}
745
746	if (alphaTest)
747	{
748	MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(floor(alphaThreshold * `255.0f` + `0.5f`)));
749
750	for (int px = `0`; px < `16`; px++)
751	{
752	ParallelMath::Int16CompFlag belowThreshold = ParallelMath::Less(pixels[px][`3`], threshold);
753	pixels[px][`3`] = ParallelMath::Select(belowThreshold, ParallelMath::MakeUInt15(`0`), ParallelMath::MakeUInt15(`255`));
754	}
755	}
756
757	BCCommon::PreWeightPixelsLDR<`4`>(preWeightedPixels, pixels, channelWeights);
758
759	MUInt15 minAlpha = ParallelMath::MakeUInt15(`255`);
760
761	for (int px = `0`; px < `16`; px++)
762	minAlpha = ParallelMath::Min(minAlpha, pixels[px][`3`]);
763
764	MFloat pixelWeights[`16`];
765	for (int px = `0`; px < `16`; px++)
766	{
767	pixelWeights[px] = ParallelMath::MakeFloat(`1.0f`);
768	if (alphaTest)
769	{
770	ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][`3`], ParallelMath::MakeUInt15(`255`));
771
772	ParallelMath::ConditionalSet(pixelWeights[px], ParallelMath::Int16FlagToFloat(isTransparent), ParallelMath::MakeFloatZero());
773	}
774	}
775
776	for (int pass = `0`; pass < NumEndpointSelectorPasses; pass++)
777	{
778	for (int px = `0`; px < `16`; px++)
779	endpointSelector.ContributePass(preWeightedPixels[px], pass, pixelWeights[px]);
780
781	endpointSelector.FinishPass(pass);
782	}
783
784	UnfinishedEndpoints<`3`> ufep = endpointSelector.GetEndpoints(channelWeights);
785
786	MUInt15 bestEndpoints[`2`][`3`];
787	MUInt15 bestIndexes[`16`];
788	MUInt15 bestRange = ParallelMath::MakeUInt15(`0`);
789	MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
790
791	for (int px = `0`; px < `16`; px++)
792	bestIndexes[px] = ParallelMath::MakeUInt15(`0`);
793
794	for (int ep = `0`; ep < `2`; ep++)
795	for (int ch = `0`; ch < `3`; ch++)
796	bestEndpoints[ep][ch] = ParallelMath::MakeUInt15(`0`);
797
798	if (exhaustive)
799	{
800	MSInt16 sortBins[`16`];
801
802	{
803	// Compute an 11-bit index, change it to signed, stuff it in the high bits of the sort bins,
804	// and pack the original indexes into the low bits.
805
806	MUInt15 sortEP[`2`][`3`];
807	ufep.FinishLDR(`0`, `11`, sortEP[`0`], sortEP[`1`]);
808
809	IndexSelector<`3`> sortSelector;
810	sortSelector.Init<false>(channelWeights, sortEP, `1` << `11`);
811
812	for (int16_t px = `0`; px < `16`; px++)
813	{
814	MSInt16 sortBin = ParallelMath::LosslessCast<MSInt16>::Cast(sortSelector.SelectIndexLDR(floatPixels[px], &rtn) << `4`);
815
816	if (alphaTest)
817	{
818	ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][`3`], ParallelMath::MakeUInt15(`255`));
819
820	ParallelMath::ConditionalSet(sortBin, isTransparent, ParallelMath::MakeSInt16(-`16`)); // 0xfff0
821	}
822
823	sortBin = sortBin + ParallelMath::MakeSInt16(px);
824
825	sortBins[px] = sortBin;
826	}
827	}
828
829	// Sort bins
830	for (int sortEnd = `1`; sortEnd < `16`; sortEnd++)
831	{
832	for (int sortLoc = sortEnd; sortLoc > `0`; sortLoc--)
833	{
834	MSInt16 a = sortBins[sortLoc];
835	MSInt16 b = sortBins[sortLoc - `1`];
836
837	sortBins[sortLoc] = ParallelMath::Max(a, b);
838	sortBins[sortLoc - `1`] = ParallelMath::Min(a, b);
839	}
840	}
841
842	MUInt15 firstElement = ParallelMath::MakeUInt15(`0`);
843	for (uint16_t e = `0`; e < `16`; e++)
844	{
845	ParallelMath::Int16CompFlag isInvalid = ParallelMath::Less(sortBins[e], ParallelMath::MakeSInt16(`0`));
846	ParallelMath::ConditionalSet(firstElement, isInvalid, ParallelMath::MakeUInt15(e + `1`));
847	if (!ParallelMath::AnySet(isInvalid))
848	break;
849	}
850
851	MUInt15 numElements = ParallelMath::MakeUInt15(`16`) - firstElement;
852
853	MUInt15 sortedInputs[`16`][`4`];
854	MFloat floatSortedInputs[`16`][`4`];
855	MFloat pwFloatSortedInputs[`16`][`4`];
856
857	for (int e = `0`; e < `16`; e++)
858	{
859	for (int ch = `0`; ch < `4`; ch++)
860	sortedInputs[e][ch] = ParallelMath::MakeUInt15(`0`);
861	}
862
863	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
864	{
865	for (int e = ParallelMath::Extract(firstElement, block); e < `16`; e++)
866	{
867	ParallelMath::ScalarUInt16 sortBin = ParallelMath::Extract(sortBins[e], block);
868	int originalIndex = (sortBin & `15`);
869
870	for (int ch = `0`; ch < `4`; ch++)
871	ParallelMath::PutUInt15(sortedInputs[`15` - e][ch], block, ParallelMath::Extract(pixels[originalIndex][ch], block));
872	}
873	}
874
875	for (int e = `0`; e < `16`; e++)
876	{
877	for (int ch = `0`; ch < `4`; ch++)
878	{
879	MFloat f = ParallelMath::ToFloat(sortedInputs[e][ch]);
880	floatSortedInputs[e][ch] = f;
881	pwFloatSortedInputs[e][ch] = f * channelWeights[ch];
882	}
883	}
884
885	for (int n0 = `0`; n0 <= `15`; n0++)
886	{
887	int remainingFor1 = `16` - n0;
888	if (remainingFor1 == `16`)
889	remainingFor1 = `15`;
890
891	for (int n1 = `0`; n1 <= remainingFor1; n1++)
892	{
893	int remainingFor2 = `16` - n1 - n0;
894	if (remainingFor2 == `16`)
895	remainingFor2 = `15`;
896
897	for (int n2 = `0`; n2 <= remainingFor2; n2++)
898	{
899	int n3 = `16` - n2 - n1 - n0;
900
901	if (n3 == `16`)
902	continue;
903
904	int counts[`4`] = { n0, n1, n2, n3 };
905
906	TestCounts(flags, counts, `4`, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
907	}
908	}
909	}
910
911	TestSingleColor(flags, pixels, floatPixels, `4`, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
912
913	if (alphaTest)
914	{
915	for (int n0 = `0`; n0 <= `15`; n0++)
916	{
917	int remainingFor1 = `16` - n0;
918	if (remainingFor1 == `16`)
919	remainingFor1 = `15`;
920
921	for (int n1 = `0`; n1 <= remainingFor1; n1++)
922	{
923	int n2 = `16` - n1 - n0;
924
925	if (n2 == `16`)
926	continue;
927
928	int counts[`3`] = { n0, n1, n2 };
929
930	TestCounts(flags, counts, `3`, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
931	}
932	}
933
934	TestSingleColor(flags, pixels, floatPixels, `3`, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
935	}
936	}
937	else
938	{
939	int minRange = alphaTest ? `3` : `4`;
940
941	for (int range = minRange; range <= `4`; range++)
942	{
943	int tweakRounds = BCCommon::TweakRoundsForRange(range);
944	if (tweakRounds > maxTweakRounds)
945	tweakRounds = maxTweakRounds;
946
947	for (int tweak = `0`; tweak < tweakRounds; tweak++)
948	{
949	MUInt15 endPoints[`2`][`3`];
950
951	ufep.FinishLDR(tweak, range, endPoints[`0`], endPoints[`1`]);
952
953	for (int refine = `0`; refine < numRefineRounds; refine++)
954	{
955	EndpointRefiner<`3`> refiner;
956	refiner.Init(range, channelWeights);
957
958	TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, range, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &refiner, &rtn);
959
960	if (refine != numRefineRounds - `1`)
961	refiner.GetRefinedEndpointsLDR(endPoints, &rtn);
962	}
963	}
964	}
965	}
966
967	for (int block = `0`; block < ParallelMath::ParallelSize; block++)
968	{
969	ParallelMath::ScalarUInt16 range = ParallelMath::Extract(bestRange, block);
970	assert(range == `3` \|\| range == `4`);
971
972	ParallelMath::ScalarUInt16 compressedEP[`2`];
973	for (int ep = `0`; ep < `2`; ep++)
974	{
975	ParallelMath::ScalarUInt16 endPoint[`3`];
976	for (int ch = `0`; ch < `3`; ch++)
977	endPoint[ch] = ParallelMath::Extract(bestEndpoints[ep][ch], block);
978
979	int compressed = (endPoint[`0`] & `0xf8`) << `8`;
980	compressed \|= (endPoint[`1`] & `0xfc`) << `3`;
981	compressed \|= (endPoint[`2`] & `0xf8`) >> `3`;
982
983	compressedEP[ep] = static_cast<ParallelMath::ScalarUInt16>(compressed);
984	}
985
986	int indexOrder[`4`];
987
988	if (range == `4`)
989	{
990	if (compressedEP[`0`] == compressedEP[`1`])
991	{
992	indexOrder[`0`] = `0`;
993	indexOrder[`1`] = `0`;
994	indexOrder[`2`] = `0`;
995	indexOrder[`3`] = `0`;
996	}
997	else if (compressedEP[`0`] < compressedEP[`1`])
998	{
999	std::swap(compressedEP[`0`], compressedEP[`1`]);
1000	indexOrder[`0`] = `1`;
1001	indexOrder[`1`] = `3`;
1002	indexOrder[`2`] = `2`;
1003	indexOrder[`3`] = `0`;
1004	}
1005	else
1006	{
1007	indexOrder[`0`] = `0`;
1008	indexOrder[`1`] = `2`;
1009	indexOrder[`2`] = `3`;
1010	indexOrder[`3`] = `1`;
1011	}
1012	}
1013	else
1014	{
1015	assert(range == `3`);
1016
1017	if (compressedEP[`0`] > compressedEP[`1`])
1018	{
1019	std::swap(compressedEP[`0`], compressedEP[`1`]);
1020	indexOrder[`0`] = `1`;
1021	indexOrder[`1`] = `2`;
1022	indexOrder[`2`] = `0`;
1023	}
1024	else
1025	{
1026	indexOrder[`0`] = `0`;
1027	indexOrder[`1`] = `2`;
1028	indexOrder[`2`] = `1`;
1029	}
1030	indexOrder[`3`] = `3`;
1031	}
1032
1033	packedBlocks[`0`] = static_cast<uint8_t>(compressedEP[`0`] & `0xff`);
1034	packedBlocks[`1`] = static_cast<uint8_t>((compressedEP[`0`] >> `8`) & `0xff`);
1035	packedBlocks[`2`] = static_cast<uint8_t>(compressedEP[`1`] & `0xff`);
1036	packedBlocks[`3`] = static_cast<uint8_t>((compressedEP[`1`] >> `8`) & `0xff`);
1037
1038	for (int i = `0`; i < `16`; i += `4`)
1039	{
1040	int packedIndexes = `0`;
1041	for (int subi = `0`; subi < `4`; subi++)
1042	{
1043	ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[i + subi], block);
1044	packedIndexes \|= (indexOrder[index] << (subi * `2`));
1045	}
1046
1047	packedBlocks[`4` + i / `4`] = static_cast<uint8_t>(packedIndexes);
1048	}
1049
1050	packedBlocks += packedBlockStride;
1051	}
1052	}
1053
1054	#endif
1055

Browse the source code of Godot/thirdparty/cvtt/ConvectionKernels_S3TC.cpp