1 | /* |
2 | Convection Texture Tools |
3 | Copyright (c) 2018-2019 Eric Lasota |
4 | |
5 | Permission is hereby granted, free of charge, to any person obtaining |
6 | a copy of this software and associated documentation files (the |
7 | "Software"), to deal in the Software without restriction, including |
8 | without limitation the rights to use, copy, modify, merge, publish, |
9 | distribute, sublicense, and/or sell copies of the Software, and to |
10 | permit persons to whom the Software is furnished to do so, subject |
11 | to the following conditions: |
12 | |
13 | The above copyright notice and this permission notice shall be included |
14 | in all copies or substantial portions of the Software. |
15 | |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
17 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY |
20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
23 | |
24 | ------------------------------------------------------------------------------------- |
25 | |
26 | Portions based on DirectX Texture Library (DirectXTex) |
27 | |
28 | Copyright (c) Microsoft Corporation. All rights reserved. |
29 | Licensed under the MIT License. |
30 | |
31 | http://go.microsoft.com/fwlink/?LinkId=248926 |
32 | */ |
33 | #include "ConvectionKernels_Config.h" |
34 | |
35 | #if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL) |
36 | |
37 | #include "ConvectionKernels_S3TC.h" |
38 | |
39 | #include "ConvectionKernels_AggregatedError.h" |
40 | #include "ConvectionKernels_BCCommon.h" |
41 | #include "ConvectionKernels_EndpointRefiner.h" |
42 | #include "ConvectionKernels_EndpointSelector.h" |
43 | #include "ConvectionKernels_IndexSelector.h" |
44 | #include "ConvectionKernels_UnfinishedEndpoints.h" |
45 | #include "ConvectionKernels_S3TC_SingleColor.h" |
46 | |
47 | void cvtt::Internal::S3TCComputer::Init(MFloat& error) |
48 | { |
49 | error = ParallelMath::MakeFloat(FLT_MAX); |
50 | } |
51 | |
52 | void cvtt::Internal::S3TCComputer::QuantizeTo6Bits(MUInt15& v) |
53 | { |
54 | MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(253)) + ParallelMath::MakeUInt16(512), 10)); |
55 | v = (reduced << 2) | ParallelMath::RightShift(reduced, 4); |
56 | } |
57 | |
58 | void cvtt::Internal::S3TCComputer::QuantizeTo5Bits(MUInt15& v) |
59 | { |
60 | MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(249)) + ParallelMath::MakeUInt16(1024), 11)); |
61 | v = (reduced << 3) | ParallelMath::RightShift(reduced, 2); |
62 | } |
63 | |
64 | void cvtt::Internal::S3TCComputer::QuantizeTo565(MUInt15 endPoint[3]) |
65 | { |
66 | QuantizeTo5Bits(endPoint[0]); |
67 | QuantizeTo6Bits(endPoint[1]); |
68 | QuantizeTo5Bits(endPoint[2]); |
69 | } |
70 | |
71 | cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidFactorForSpan(const MSInt16& span) |
72 | { |
73 | return ParallelMath::Abs(ParallelMath::ToFloat(span)) * 0.03f; |
74 | } |
75 | |
76 | cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d) |
77 | { |
78 | MFloat absDiff = ParallelMath::Abs(ParallelMath::ToFloat(ParallelMath::LosslessCast<MSInt16>::Cast(a) - ParallelMath::LosslessCast<MSInt16>::Cast(b))); |
79 | absDiff = absDiff + d; |
80 | return absDiff * absDiff; |
81 | } |
82 | |
83 | void cvtt::Internal::S3TCComputer::TestSingleColor(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], int range, const float* channelWeights, |
84 | MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn) |
85 | { |
86 | float channelWeightsSq[3]; |
87 | |
88 | for (int ch = 0; ch < 3; ch++) |
89 | channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch]; |
90 | |
91 | MUInt15 totals[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) }; |
92 | |
93 | for (int px = 0; px < 16; px++) |
94 | { |
95 | for (int ch = 0; ch < 3; ch++) |
96 | totals[ch] = totals[ch] + pixels[px][ch]; |
97 | } |
98 | |
99 | MUInt15 average[3]; |
100 | for (int ch = 0; ch < 3; ch++) |
101 | average[ch] = ParallelMath::RightShift(totals[ch] + ParallelMath::MakeUInt15(8), 4); |
102 | |
103 | const Tables::S3TCSC::TableEntry* rbTable = NULL; |
104 | const Tables::S3TCSC::TableEntry* gTable = NULL; |
105 | if (flags & cvtt::Flags::S3TC_Paranoid) |
106 | { |
107 | if (range == 4) |
108 | { |
109 | rbTable = Tables::S3TCSC::g_singleColor5_3_p; |
110 | gTable = Tables::S3TCSC::g_singleColor6_3_p; |
111 | } |
112 | else |
113 | { |
114 | assert(range == 3); |
115 | rbTable = Tables::S3TCSC::g_singleColor5_2_p; |
116 | gTable = Tables::S3TCSC::g_singleColor6_2_p; |
117 | } |
118 | } |
119 | else |
120 | { |
121 | if (range == 4) |
122 | { |
123 | rbTable = Tables::S3TCSC::g_singleColor5_3; |
124 | gTable = Tables::S3TCSC::g_singleColor6_3; |
125 | } |
126 | else |
127 | { |
128 | assert(range == 3); |
129 | rbTable = Tables::S3TCSC::g_singleColor5_2; |
130 | gTable = Tables::S3TCSC::g_singleColor6_2; |
131 | } |
132 | } |
133 | |
134 | MUInt15 interpolated[3]; |
135 | MUInt15 eps[2][3]; |
136 | MSInt16 spans[3]; |
137 | for (int i = 0; i < ParallelMath::ParallelSize; i++) |
138 | { |
139 | for (int ch = 0; ch < 3; ch++) |
140 | { |
141 | uint16_t avg = ParallelMath::Extract(average[ch], i); |
142 | const Tables::S3TCSC::TableEntry& tableEntry = ((ch == 1) ? gTable[avg] : rbTable[avg]); |
143 | ParallelMath::PutUInt15(eps[0][ch], i, tableEntry.m_min); |
144 | ParallelMath::PutUInt15(eps[1][ch], i, tableEntry.m_max); |
145 | ParallelMath::PutUInt15(interpolated[ch], i, tableEntry.m_actualColor); |
146 | ParallelMath::PutSInt16(spans[ch], i, tableEntry.m_span); |
147 | } |
148 | } |
149 | |
150 | MFloat error = ParallelMath::MakeFloatZero(); |
151 | if (flags & cvtt::Flags::S3TC_Paranoid) |
152 | { |
153 | MFloat spanParanoidFactors[3]; |
154 | for (int ch = 0; ch < 3; ch++) |
155 | spanParanoidFactors[ch] = ParanoidFactorForSpan(spans[ch]); |
156 | |
157 | for (int px = 0; px < 16; px++) |
158 | { |
159 | for (int ch = 0; ch < 3; ch++) |
160 | error = error + ParanoidDiff(interpolated[ch], pixels[px][ch], spanParanoidFactors[ch]) * channelWeightsSq[ch]; |
161 | } |
162 | } |
163 | else |
164 | { |
165 | for (int px = 0; px < 16; px++) |
166 | { |
167 | for (int ch = 0; ch < 3; ch++) |
168 | error = error + ParallelMath::ToFloat(ParallelMath::SqDiffUInt8(interpolated[ch], pixels[px][ch])) * channelWeightsSq[ch]; |
169 | } |
170 | } |
171 | |
172 | ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError); |
173 | ParallelMath::Int16CompFlag better16 = ParallelMath::FloatFlagToInt16(better); |
174 | |
175 | if (ParallelMath::AnySet(better16)) |
176 | { |
177 | bestError = ParallelMath::Min(bestError, error); |
178 | for (int epi = 0; epi < 2; epi++) |
179 | for (int ch = 0; ch < 3; ch++) |
180 | ParallelMath::ConditionalSet(bestEndpoints[epi][ch], better16, eps[epi][ch]); |
181 | |
182 | MUInt15 vindexes = ParallelMath::MakeUInt15(1); |
183 | for (int px = 0; px < 16; px++) |
184 | ParallelMath::ConditionalSet(bestIndexes[px], better16, vindexes); |
185 | |
186 | ParallelMath::ConditionalSet(bestRange, better16, ParallelMath::MakeUInt15(range)); |
187 | } |
188 | } |
189 | |
190 | void cvtt::Internal::S3TCComputer::TestEndpoints(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], const MUInt15 unquantizedEndPoints[2][3], int range, const float* channelWeights, |
191 | MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, EndpointRefiner<3> *refiner, const ParallelMath::RoundTowardNearestForScope *rtn) |
192 | { |
193 | float channelWeightsSq[3]; |
194 | |
195 | for (int ch = 0; ch < 3; ch++) |
196 | channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch]; |
197 | |
198 | MUInt15 endPoints[2][3]; |
199 | |
200 | for (int ep = 0; ep < 2; ep++) |
201 | for (int ch = 0; ch < 3; ch++) |
202 | endPoints[ep][ch] = unquantizedEndPoints[ep][ch]; |
203 | |
204 | QuantizeTo565(endPoints[0]); |
205 | QuantizeTo565(endPoints[1]); |
206 | |
207 | IndexSelector<3> selector; |
208 | selector.Init<false>(channelWeights, endPoints, range); |
209 | |
210 | MUInt15 indexes[16]; |
211 | |
212 | MFloat paranoidFactors[3]; |
213 | for (int ch = 0; ch < 3; ch++) |
214 | paranoidFactors[ch] = ParanoidFactorForSpan(ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[0][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[1][ch])); |
215 | |
216 | MFloat error = ParallelMath::MakeFloatZero(); |
217 | AggregatedError<3> aggError; |
218 | for (int px = 0; px < 16; px++) |
219 | { |
220 | MUInt15 index = selector.SelectIndexLDR(floatPixels[px], rtn); |
221 | indexes[px] = index; |
222 | |
223 | if (refiner) |
224 | refiner->ContributeUnweightedPW(preWeightedPixels[px], index); |
225 | |
226 | MUInt15 reconstructed[3]; |
227 | selector.ReconstructLDRPrecise(index, reconstructed); |
228 | |
229 | if (flags & Flags::S3TC_Paranoid) |
230 | { |
231 | for (int ch = 0; ch < 3; ch++) |
232 | error = error + ParanoidDiff(reconstructed[ch], pixels[px][ch], paranoidFactors[ch]) * channelWeightsSq[ch]; |
233 | } |
234 | else |
235 | BCCommon::ComputeErrorLDR<3>(flags, reconstructed, pixels[px], aggError); |
236 | } |
237 | |
238 | if (!(flags & Flags::S3TC_Paranoid)) |
239 | error = aggError.Finalize(flags, channelWeightsSq); |
240 | |
241 | ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError); |
242 | |
243 | if (ParallelMath::AnySet(better)) |
244 | { |
245 | ParallelMath::Int16CompFlag betterInt16 = ParallelMath::FloatFlagToInt16(better); |
246 | |
247 | ParallelMath::ConditionalSet(bestError, better, error); |
248 | |
249 | for (int ep = 0; ep < 2; ep++) |
250 | for (int ch = 0; ch < 3; ch++) |
251 | ParallelMath::ConditionalSet(bestEndpoints[ep][ch], betterInt16, endPoints[ep][ch]); |
252 | |
253 | for (int px = 0; px < 16; px++) |
254 | ParallelMath::ConditionalSet(bestIndexes[px], betterInt16, indexes[px]); |
255 | |
256 | ParallelMath::ConditionalSet(bestRange, betterInt16, ParallelMath::MakeUInt15(static_cast<uint16_t>(range))); |
257 | } |
258 | } |
259 | |
260 | void cvtt::Internal::S3TCComputer::TestCounts(uint32_t flags, const int *counts, int nCounts, const MUInt15 &numElements, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], bool alphaTest, |
261 | const MFloat floatSortedInputs[16][4], const MFloat preWeightedFloatSortedInputs[16][4], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, |
262 | const ParallelMath::RoundTowardNearestForScope* rtn) |
263 | { |
264 | UNREFERENCED_PARAMETER(alphaTest); |
265 | UNREFERENCED_PARAMETER(flags); |
266 | |
267 | EndpointRefiner<3> refiner; |
268 | |
269 | refiner.Init(nCounts, channelWeights); |
270 | |
271 | bool escape = false; |
272 | int e = 0; |
273 | for (int i = 0; i < nCounts; i++) |
274 | { |
275 | for (int n = 0; n < counts[i]; n++) |
276 | { |
277 | ParallelMath::Int16CompFlag valid = ParallelMath::Less(ParallelMath::MakeUInt15(static_cast<uint16_t>(n)), numElements); |
278 | if (!ParallelMath::AnySet(valid)) |
279 | { |
280 | escape = true; |
281 | break; |
282 | } |
283 | |
284 | if (ParallelMath::AllSet(valid)) |
285 | refiner.ContributeUnweightedPW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i))); |
286 | else |
287 | { |
288 | MFloat weight = ParallelMath::Select(ParallelMath::Int16FlagToFloat(valid), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(0.0f)); |
289 | refiner.ContributePW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), weight); |
290 | } |
291 | } |
292 | |
293 | if (escape) |
294 | break; |
295 | } |
296 | |
297 | MUInt15 endPoints[2][3]; |
298 | refiner.GetRefinedEndpointsLDR(endPoints, rtn); |
299 | |
300 | TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, nCounts, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, NULL, rtn); |
301 | } |
302 | |
303 | void cvtt::Internal::S3TCComputer::PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride) |
304 | { |
305 | UNREFERENCED_PARAMETER(flags); |
306 | ParallelMath::RoundTowardNearestForScope rtn; |
307 | |
308 | float weights[1] = { 1.0f }; |
309 | |
310 | MUInt15 pixels[16]; |
311 | MFloat floatPixels[16]; |
312 | |
313 | for (int px = 0; px < 16; px++) |
314 | { |
315 | ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]); |
316 | floatPixels[px] = ParallelMath::ToFloat(pixels[px]); |
317 | } |
318 | |
319 | MUInt15 ep[2][1] = { { ParallelMath::MakeUInt15(0) },{ ParallelMath::MakeUInt15(255) } }; |
320 | |
321 | IndexSelector<1> selector; |
322 | selector.Init<false>(weights, ep, 16); |
323 | |
324 | MUInt15 indexes[16]; |
325 | |
326 | for (int px = 0; px < 16; px++) |
327 | indexes[px] = selector.SelectIndexLDR(&floatPixels[px], &rtn); |
328 | |
329 | for (int block = 0; block < ParallelMath::ParallelSize; block++) |
330 | { |
331 | for (int px = 0; px < 16; px += 2) |
332 | { |
333 | int index0 = ParallelMath::Extract(indexes[px], block); |
334 | int index1 = ParallelMath::Extract(indexes[px + 1], block); |
335 | |
336 | packedBlocks[px / 2] = static_cast<uint8_t>(index0 | (index1 << 4)); |
337 | } |
338 | |
339 | packedBlocks += packedBlockStride; |
340 | } |
341 | } |
342 | |
343 | void cvtt::Internal::S3TCComputer::PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds) |
344 | { |
345 | if (maxTweakRounds < 1) |
346 | maxTweakRounds = 1; |
347 | |
348 | if (numRefineRounds < 1) |
349 | numRefineRounds = 1; |
350 | |
351 | ParallelMath::RoundTowardNearestForScope rtn; |
352 | |
353 | float oneWeight[1] = { 1.0f }; |
354 | |
355 | MUInt15 pixels[16]; |
356 | MFloat floatPixels[16]; |
357 | |
358 | MUInt15 highTerminal = isSigned ? ParallelMath::MakeUInt15(254) : ParallelMath::MakeUInt15(255); |
359 | MUInt15 highTerminalMinusOne = highTerminal - ParallelMath::MakeUInt15(1); |
360 | |
361 | for (int px = 0; px < 16; px++) |
362 | { |
363 | ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]); |
364 | |
365 | if (isSigned) |
366 | pixels[px] = ParallelMath::Min(pixels[px], highTerminal); |
367 | |
368 | floatPixels[px] = ParallelMath::ToFloat(pixels[px]); |
369 | } |
370 | |
371 | MUInt15 sortedPixels[16]; |
372 | for (int px = 0; px < 16; px++) |
373 | sortedPixels[px] = pixels[px]; |
374 | |
375 | for (int sortEnd = 15; sortEnd > 0; sortEnd--) |
376 | { |
377 | for (int sortOffset = 0; sortOffset < sortEnd; sortOffset++) |
378 | { |
379 | MUInt15 a = sortedPixels[sortOffset]; |
380 | MUInt15 b = sortedPixels[sortOffset + 1]; |
381 | |
382 | sortedPixels[sortOffset] = ParallelMath::Min(a, b); |
383 | sortedPixels[sortOffset + 1] = ParallelMath::Max(a, b); |
384 | } |
385 | } |
386 | |
387 | MUInt15 zero = ParallelMath::MakeUInt15(0); |
388 | MUInt15 one = ParallelMath::MakeUInt15(1); |
389 | |
390 | MUInt15 bestIsFullRange = zero; |
391 | MFloat bestError = ParallelMath::MakeFloat(FLT_MAX); |
392 | MUInt15 bestEP[2] = { zero, zero }; |
393 | MUInt15 bestIndexes[16] = { |
394 | zero, zero, zero, zero, |
395 | zero, zero, zero, zero, |
396 | zero, zero, zero, zero, |
397 | zero, zero, zero, zero |
398 | }; |
399 | |
400 | // Full-precision |
401 | { |
402 | MUInt15 minEP = sortedPixels[0]; |
403 | MUInt15 maxEP = sortedPixels[15]; |
404 | |
405 | MFloat base[1] = { ParallelMath::ToFloat(minEP) }; |
406 | MFloat offset[1] = { ParallelMath::ToFloat(maxEP - minEP) }; |
407 | |
408 | UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset); |
409 | |
410 | int numTweakRounds = BCCommon::TweakRoundsForRange(8); |
411 | if (numTweakRounds > maxTweakRounds) |
412 | numTweakRounds = maxTweakRounds; |
413 | |
414 | for (int tweak = 0; tweak < numTweakRounds; tweak++) |
415 | { |
416 | MUInt15 ep[2][1]; |
417 | |
418 | ufep.FinishLDR(tweak, 8, ep[0], ep[1]); |
419 | |
420 | for (int refinePass = 0; refinePass < numRefineRounds; refinePass++) |
421 | { |
422 | EndpointRefiner<1> refiner; |
423 | refiner.Init(8, oneWeight); |
424 | |
425 | if (isSigned) |
426 | for (int epi = 0; epi < 2; epi++) |
427 | ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal); |
428 | |
429 | IndexSelector<1> indexSelector; |
430 | indexSelector.Init<false>(oneWeight, ep, 8); |
431 | |
432 | MUInt15 indexes[16]; |
433 | |
434 | AggregatedError<1> aggError; |
435 | for (int px = 0; px < 16; px++) |
436 | { |
437 | MUInt15 index = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn); |
438 | |
439 | MUInt15 reconstructedPixel; |
440 | |
441 | indexSelector.ReconstructLDRPrecise(index, &reconstructedPixel); |
442 | BCCommon::ComputeErrorLDR<1>(flags, &reconstructedPixel, &pixels[px], aggError); |
443 | |
444 | if (refinePass != numRefineRounds - 1) |
445 | refiner.ContributeUnweightedPW(&floatPixels[px], index); |
446 | |
447 | indexes[px] = index; |
448 | } |
449 | MFloat error = aggError.Finalize(flags | Flags::Uniform, oneWeight); |
450 | |
451 | ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError); |
452 | ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter); |
453 | |
454 | if (ParallelMath::AnySet(errorBetter16)) |
455 | { |
456 | bestError = ParallelMath::Min(error, bestError); |
457 | ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, one); |
458 | for (int px = 0; px < 16; px++) |
459 | ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]); |
460 | |
461 | for (int epi = 0; epi < 2; epi++) |
462 | ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]); |
463 | } |
464 | |
465 | if (refinePass != numRefineRounds - 1) |
466 | refiner.GetRefinedEndpointsLDR(ep, &rtn); |
467 | } |
468 | } |
469 | } |
470 | |
471 | // Reduced precision with special endpoints |
472 | { |
473 | MUInt15 bestHeuristicMin = sortedPixels[0]; |
474 | MUInt15 bestHeuristicMax = sortedPixels[15]; |
475 | |
476 | ParallelMath::Int16CompFlag canTryClipping; |
477 | |
478 | // In reduced precision, we want try putting endpoints at the reserved indexes at the ends. |
479 | // The heuristic we use is to assign indexes to the end as long as they aren't off by more than half of the index range. |
480 | // This will usually not find anything, but it's cheap to check. |
481 | |
482 | { |
483 | MUInt15 largestPossibleRange = bestHeuristicMax - bestHeuristicMin; // Max: 255 |
484 | MUInt15 lowestPossibleClearance = ParallelMath::Min(bestHeuristicMin, static_cast<MUInt15>(highTerminal - bestHeuristicMax)); |
485 | |
486 | MUInt15 lowestPossibleClearanceTimes10 = (lowestPossibleClearance << 2) + (lowestPossibleClearance << 4); |
487 | canTryClipping = ParallelMath::LessOrEqual(lowestPossibleClearanceTimes10, largestPossibleRange); |
488 | } |
489 | |
490 | if (ParallelMath::AnySet(canTryClipping)) |
491 | { |
492 | MUInt15 lowClearances[16]; |
493 | MUInt15 highClearances[16]; |
494 | MUInt15 bestSkipCount = ParallelMath::MakeUInt15(0); |
495 | |
496 | lowClearances[0] = highClearances[0] = ParallelMath::MakeUInt15(0); |
497 | |
498 | for (int px = 1; px < 16; px++) |
499 | { |
500 | lowClearances[px] = sortedPixels[px - 1]; |
501 | highClearances[px] = highTerminal - sortedPixels[16 - px]; |
502 | } |
503 | |
504 | for (uint16_t firstIndex = 0; firstIndex < 16; firstIndex++) |
505 | { |
506 | uint16_t numSkippedLow = firstIndex; |
507 | |
508 | MUInt15 lowClearance = lowClearances[firstIndex]; |
509 | |
510 | for (uint16_t lastIndex = firstIndex; lastIndex < 16; lastIndex++) |
511 | { |
512 | uint16_t numSkippedHigh = 15 - lastIndex; |
513 | uint16_t numSkipped = numSkippedLow + numSkippedHigh; |
514 | |
515 | MUInt15 numSkippedV = ParallelMath::MakeUInt15(numSkipped); |
516 | |
517 | ParallelMath::Int16CompFlag areMoreSkipped = ParallelMath::Less(bestSkipCount, numSkippedV); |
518 | |
519 | if (!ParallelMath::AnySet(areMoreSkipped)) |
520 | continue; |
521 | |
522 | MUInt15 clearance = ParallelMath::Max(highClearances[numSkippedHigh], lowClearance); |
523 | MUInt15 clearanceTimes10 = (clearance << 2) + (clearance << 4); |
524 | |
525 | MUInt15 range = sortedPixels[lastIndex] - sortedPixels[firstIndex]; |
526 | |
527 | ParallelMath::Int16CompFlag isBetter = (areMoreSkipped & ParallelMath::LessOrEqual(clearanceTimes10, range)); |
528 | ParallelMath::ConditionalSet(bestHeuristicMin, isBetter, sortedPixels[firstIndex]); |
529 | ParallelMath::ConditionalSet(bestHeuristicMax, isBetter, sortedPixels[lastIndex]); |
530 | } |
531 | } |
532 | } |
533 | |
534 | MUInt15 bestSimpleMin = one; |
535 | MUInt15 bestSimpleMax = highTerminalMinusOne; |
536 | |
537 | for (int px = 0; px < 16; px++) |
538 | { |
539 | ParallelMath::ConditionalSet(bestSimpleMin, ParallelMath::Less(zero, sortedPixels[15 - px]), sortedPixels[15 - px]); |
540 | ParallelMath::ConditionalSet(bestSimpleMax, ParallelMath::Less(sortedPixels[px], highTerminal), sortedPixels[px]); |
541 | } |
542 | |
543 | MUInt15 minEPs[2] = { bestSimpleMin, bestHeuristicMin }; |
544 | MUInt15 maxEPs[2] = { bestSimpleMax, bestHeuristicMax }; |
545 | |
546 | int minEPRange = 2; |
547 | if (ParallelMath::AllSet(ParallelMath::Equal(minEPs[0], minEPs[1]))) |
548 | minEPRange = 1; |
549 | |
550 | int maxEPRange = 2; |
551 | if (ParallelMath::AllSet(ParallelMath::Equal(maxEPs[0], maxEPs[1]))) |
552 | maxEPRange = 1; |
553 | |
554 | for (int minEPIndex = 0; minEPIndex < minEPRange; minEPIndex++) |
555 | { |
556 | for (int maxEPIndex = 0; maxEPIndex < maxEPRange; maxEPIndex++) |
557 | { |
558 | MFloat base[1] = { ParallelMath::ToFloat(minEPs[minEPIndex]) }; |
559 | MFloat offset[1] = { ParallelMath::ToFloat(maxEPs[maxEPIndex] - minEPs[minEPIndex]) }; |
560 | |
561 | UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset); |
562 | |
563 | int numTweakRounds = BCCommon::TweakRoundsForRange(6); |
564 | if (numTweakRounds > maxTweakRounds) |
565 | numTweakRounds = maxTweakRounds; |
566 | |
567 | for (int tweak = 0; tweak < numTweakRounds; tweak++) |
568 | { |
569 | MUInt15 ep[2][1]; |
570 | |
571 | ufep.FinishLDR(tweak, 8, ep[0], ep[1]); |
572 | |
573 | for (int refinePass = 0; refinePass < numRefineRounds; refinePass++) |
574 | { |
575 | EndpointRefiner<1> refiner; |
576 | refiner.Init(6, oneWeight); |
577 | |
578 | if (isSigned) |
579 | for (int epi = 0; epi < 2; epi++) |
580 | ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal); |
581 | |
582 | IndexSelector<1> indexSelector; |
583 | indexSelector.Init<false>(oneWeight, ep, 6); |
584 | |
585 | MUInt15 indexes[16]; |
586 | MFloat error = ParallelMath::MakeFloatZero(); |
587 | |
588 | for (int px = 0; px < 16; px++) |
589 | { |
590 | MUInt15 selectedIndex = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn); |
591 | |
592 | MUInt15 reconstructedPixel; |
593 | |
594 | indexSelector.ReconstructLDRPrecise(selectedIndex, &reconstructedPixel); |
595 | |
596 | MFloat zeroError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &zero, &pixels[px], 1, oneWeight); |
597 | MFloat highTerminalError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &highTerminal, &pixels[px], 1, oneWeight); |
598 | MFloat selectedIndexError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &reconstructedPixel, &pixels[px], 1, oneWeight); |
599 | |
600 | MFloat bestPixelError = zeroError; |
601 | MUInt15 index = ParallelMath::MakeUInt15(6); |
602 | |
603 | ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(ParallelMath::Less(highTerminalError, bestPixelError)), ParallelMath::MakeUInt15(7)); |
604 | bestPixelError = ParallelMath::Min(bestPixelError, highTerminalError); |
605 | |
606 | ParallelMath::FloatCompFlag selectedIndexBetter = ParallelMath::Less(selectedIndexError, bestPixelError); |
607 | |
608 | if (ParallelMath::AllSet(selectedIndexBetter)) |
609 | { |
610 | if (refinePass != numRefineRounds - 1) |
611 | refiner.ContributeUnweightedPW(&floatPixels[px], selectedIndex); |
612 | } |
613 | else |
614 | { |
615 | MFloat refineWeight = ParallelMath::Select(selectedIndexBetter, ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloatZero()); |
616 | |
617 | if (refinePass != numRefineRounds - 1) |
618 | refiner.ContributePW(&floatPixels[px], selectedIndex, refineWeight); |
619 | } |
620 | |
621 | ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(selectedIndexBetter), selectedIndex); |
622 | bestPixelError = ParallelMath::Min(bestPixelError, selectedIndexError); |
623 | |
624 | error = error + bestPixelError; |
625 | |
626 | indexes[px] = index; |
627 | } |
628 | |
629 | ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError); |
630 | ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter); |
631 | |
632 | if (ParallelMath::AnySet(errorBetter16)) |
633 | { |
634 | bestError = ParallelMath::Min(error, bestError); |
635 | ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, zero); |
636 | for (int px = 0; px < 16; px++) |
637 | ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]); |
638 | |
639 | for (int epi = 0; epi < 2; epi++) |
640 | ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]); |
641 | } |
642 | |
643 | if (refinePass != numRefineRounds - 1) |
644 | refiner.GetRefinedEndpointsLDR(ep, &rtn); |
645 | } |
646 | } |
647 | } |
648 | } |
649 | } |
650 | |
651 | for (int block = 0; block < ParallelMath::ParallelSize; block++) |
652 | { |
653 | int ep0 = ParallelMath::Extract(bestEP[0], block); |
654 | int ep1 = ParallelMath::Extract(bestEP[1], block); |
655 | int isFullRange = ParallelMath::Extract(bestIsFullRange, block); |
656 | |
657 | if (isSigned) |
658 | { |
659 | ep0 -= 127; |
660 | ep1 -= 127; |
661 | |
662 | assert(ep0 >= -127 && ep0 <= 127); |
663 | assert(ep1 >= -127 && ep1 <= 127); |
664 | } |
665 | |
666 | |
667 | bool swapEndpoints = (isFullRange != 0) != (ep0 > ep1); |
668 | |
669 | if (swapEndpoints) |
670 | std::swap(ep0, ep1); |
671 | |
672 | uint16_t dumpBits = 0; |
673 | int dumpBitsOffset = 0; |
674 | int dumpByteOffset = 2; |
675 | packedBlocks[0] = static_cast<uint8_t>(ep0 & 0xff); |
676 | packedBlocks[1] = static_cast<uint8_t>(ep1 & 0xff); |
677 | |
678 | int maxValue = (isFullRange != 0) ? 7 : 5; |
679 | |
680 | for (int px = 0; px < 16; px++) |
681 | { |
682 | int index = ParallelMath::Extract(bestIndexes[px], block); |
683 | |
684 | if (swapEndpoints && index <= maxValue) |
685 | index = maxValue - index; |
686 | |
687 | if (index != 0) |
688 | { |
689 | if (index == maxValue) |
690 | index = 1; |
691 | else if (index < maxValue) |
692 | index++; |
693 | } |
694 | |
695 | assert(index >= 0 && index < 8); |
696 | |
697 | dumpBits |= static_cast<uint16_t>(index << dumpBitsOffset); |
698 | dumpBitsOffset += 3; |
699 | |
700 | if (dumpBitsOffset >= 8) |
701 | { |
702 | assert(dumpByteOffset < 8); |
703 | packedBlocks[dumpByteOffset] = static_cast<uint8_t>(dumpBits & 0xff); |
704 | dumpBits >>= 8; |
705 | dumpBitsOffset -= 8; |
706 | dumpByteOffset++; |
707 | } |
708 | } |
709 | |
710 | assert(dumpBitsOffset == 0); |
711 | assert(dumpByteOffset == 8); |
712 | |
713 | packedBlocks += packedBlockStride; |
714 | } |
715 | } |
716 | |
717 | void cvtt::Internal::S3TCComputer::PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds) |
718 | { |
719 | ParallelMath::RoundTowardNearestForScope rtn; |
720 | |
721 | if (numRefineRounds < 1) |
722 | numRefineRounds = 1; |
723 | |
724 | if (maxTweakRounds < 1) |
725 | maxTweakRounds = 1; |
726 | |
727 | EndpointSelector<3, 8> endpointSelector; |
728 | |
729 | MUInt15 pixels[16][4]; |
730 | MFloat floatPixels[16][4]; |
731 | |
732 | MFloat preWeightedPixels[16][4]; |
733 | |
734 | for (int px = 0; px < 16; px++) |
735 | { |
736 | for (int ch = 0; ch < 4; ch++) |
737 | ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]); |
738 | } |
739 | |
740 | for (int px = 0; px < 16; px++) |
741 | { |
742 | for (int ch = 0; ch < 4; ch++) |
743 | floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]); |
744 | } |
745 | |
746 | if (alphaTest) |
747 | { |
748 | MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(floor(alphaThreshold * 255.0f + 0.5f))); |
749 | |
750 | for (int px = 0; px < 16; px++) |
751 | { |
752 | ParallelMath::Int16CompFlag belowThreshold = ParallelMath::Less(pixels[px][3], threshold); |
753 | pixels[px][3] = ParallelMath::Select(belowThreshold, ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(255)); |
754 | } |
755 | } |
756 | |
757 | BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights); |
758 | |
759 | MUInt15 minAlpha = ParallelMath::MakeUInt15(255); |
760 | |
761 | for (int px = 0; px < 16; px++) |
762 | minAlpha = ParallelMath::Min(minAlpha, pixels[px][3]); |
763 | |
764 | MFloat pixelWeights[16]; |
765 | for (int px = 0; px < 16; px++) |
766 | { |
767 | pixelWeights[px] = ParallelMath::MakeFloat(1.0f); |
768 | if (alphaTest) |
769 | { |
770 | ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255)); |
771 | |
772 | ParallelMath::ConditionalSet(pixelWeights[px], ParallelMath::Int16FlagToFloat(isTransparent), ParallelMath::MakeFloatZero()); |
773 | } |
774 | } |
775 | |
776 | for (int pass = 0; pass < NumEndpointSelectorPasses; pass++) |
777 | { |
778 | for (int px = 0; px < 16; px++) |
779 | endpointSelector.ContributePass(preWeightedPixels[px], pass, pixelWeights[px]); |
780 | |
781 | endpointSelector.FinishPass(pass); |
782 | } |
783 | |
784 | UnfinishedEndpoints<3> ufep = endpointSelector.GetEndpoints(channelWeights); |
785 | |
786 | MUInt15 bestEndpoints[2][3]; |
787 | MUInt15 bestIndexes[16]; |
788 | MUInt15 bestRange = ParallelMath::MakeUInt15(0); |
789 | MFloat bestError = ParallelMath::MakeFloat(FLT_MAX); |
790 | |
791 | for (int px = 0; px < 16; px++) |
792 | bestIndexes[px] = ParallelMath::MakeUInt15(0); |
793 | |
794 | for (int ep = 0; ep < 2; ep++) |
795 | for (int ch = 0; ch < 3; ch++) |
796 | bestEndpoints[ep][ch] = ParallelMath::MakeUInt15(0); |
797 | |
798 | if (exhaustive) |
799 | { |
800 | MSInt16 sortBins[16]; |
801 | |
802 | { |
803 | // Compute an 11-bit index, change it to signed, stuff it in the high bits of the sort bins, |
804 | // and pack the original indexes into the low bits. |
805 | |
806 | MUInt15 sortEP[2][3]; |
807 | ufep.FinishLDR(0, 11, sortEP[0], sortEP[1]); |
808 | |
809 | IndexSelector<3> sortSelector; |
810 | sortSelector.Init<false>(channelWeights, sortEP, 1 << 11); |
811 | |
812 | for (int16_t px = 0; px < 16; px++) |
813 | { |
814 | MSInt16 sortBin = ParallelMath::LosslessCast<MSInt16>::Cast(sortSelector.SelectIndexLDR(floatPixels[px], &rtn) << 4); |
815 | |
816 | if (alphaTest) |
817 | { |
818 | ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255)); |
819 | |
820 | ParallelMath::ConditionalSet(sortBin, isTransparent, ParallelMath::MakeSInt16(-16)); // 0xfff0 |
821 | } |
822 | |
823 | sortBin = sortBin + ParallelMath::MakeSInt16(px); |
824 | |
825 | sortBins[px] = sortBin; |
826 | } |
827 | } |
828 | |
829 | // Sort bins |
830 | for (int sortEnd = 1; sortEnd < 16; sortEnd++) |
831 | { |
832 | for (int sortLoc = sortEnd; sortLoc > 0; sortLoc--) |
833 | { |
834 | MSInt16 a = sortBins[sortLoc]; |
835 | MSInt16 b = sortBins[sortLoc - 1]; |
836 | |
837 | sortBins[sortLoc] = ParallelMath::Max(a, b); |
838 | sortBins[sortLoc - 1] = ParallelMath::Min(a, b); |
839 | } |
840 | } |
841 | |
842 | MUInt15 firstElement = ParallelMath::MakeUInt15(0); |
843 | for (uint16_t e = 0; e < 16; e++) |
844 | { |
845 | ParallelMath::Int16CompFlag isInvalid = ParallelMath::Less(sortBins[e], ParallelMath::MakeSInt16(0)); |
846 | ParallelMath::ConditionalSet(firstElement, isInvalid, ParallelMath::MakeUInt15(e + 1)); |
847 | if (!ParallelMath::AnySet(isInvalid)) |
848 | break; |
849 | } |
850 | |
851 | MUInt15 numElements = ParallelMath::MakeUInt15(16) - firstElement; |
852 | |
853 | MUInt15 sortedInputs[16][4]; |
854 | MFloat floatSortedInputs[16][4]; |
855 | MFloat pwFloatSortedInputs[16][4]; |
856 | |
857 | for (int e = 0; e < 16; e++) |
858 | { |
859 | for (int ch = 0; ch < 4; ch++) |
860 | sortedInputs[e][ch] = ParallelMath::MakeUInt15(0); |
861 | } |
862 | |
863 | for (int block = 0; block < ParallelMath::ParallelSize; block++) |
864 | { |
865 | for (int e = ParallelMath::Extract(firstElement, block); e < 16; e++) |
866 | { |
867 | ParallelMath::ScalarUInt16 sortBin = ParallelMath::Extract(sortBins[e], block); |
868 | int originalIndex = (sortBin & 15); |
869 | |
870 | for (int ch = 0; ch < 4; ch++) |
871 | ParallelMath::PutUInt15(sortedInputs[15 - e][ch], block, ParallelMath::Extract(pixels[originalIndex][ch], block)); |
872 | } |
873 | } |
874 | |
875 | for (int e = 0; e < 16; e++) |
876 | { |
877 | for (int ch = 0; ch < 4; ch++) |
878 | { |
879 | MFloat f = ParallelMath::ToFloat(sortedInputs[e][ch]); |
880 | floatSortedInputs[e][ch] = f; |
881 | pwFloatSortedInputs[e][ch] = f * channelWeights[ch]; |
882 | } |
883 | } |
884 | |
885 | for (int n0 = 0; n0 <= 15; n0++) |
886 | { |
887 | int remainingFor1 = 16 - n0; |
888 | if (remainingFor1 == 16) |
889 | remainingFor1 = 15; |
890 | |
891 | for (int n1 = 0; n1 <= remainingFor1; n1++) |
892 | { |
893 | int remainingFor2 = 16 - n1 - n0; |
894 | if (remainingFor2 == 16) |
895 | remainingFor2 = 15; |
896 | |
897 | for (int n2 = 0; n2 <= remainingFor2; n2++) |
898 | { |
899 | int n3 = 16 - n2 - n1 - n0; |
900 | |
901 | if (n3 == 16) |
902 | continue; |
903 | |
904 | int counts[4] = { n0, n1, n2, n3 }; |
905 | |
906 | TestCounts(flags, counts, 4, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn); |
907 | } |
908 | } |
909 | } |
910 | |
911 | TestSingleColor(flags, pixels, floatPixels, 4, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn); |
912 | |
913 | if (alphaTest) |
914 | { |
915 | for (int n0 = 0; n0 <= 15; n0++) |
916 | { |
917 | int remainingFor1 = 16 - n0; |
918 | if (remainingFor1 == 16) |
919 | remainingFor1 = 15; |
920 | |
921 | for (int n1 = 0; n1 <= remainingFor1; n1++) |
922 | { |
923 | int n2 = 16 - n1 - n0; |
924 | |
925 | if (n2 == 16) |
926 | continue; |
927 | |
928 | int counts[3] = { n0, n1, n2 }; |
929 | |
930 | TestCounts(flags, counts, 3, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn); |
931 | } |
932 | } |
933 | |
934 | TestSingleColor(flags, pixels, floatPixels, 3, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn); |
935 | } |
936 | } |
937 | else |
938 | { |
939 | int minRange = alphaTest ? 3 : 4; |
940 | |
941 | for (int range = minRange; range <= 4; range++) |
942 | { |
943 | int tweakRounds = BCCommon::TweakRoundsForRange(range); |
944 | if (tweakRounds > maxTweakRounds) |
945 | tweakRounds = maxTweakRounds; |
946 | |
947 | for (int tweak = 0; tweak < tweakRounds; tweak++) |
948 | { |
949 | MUInt15 endPoints[2][3]; |
950 | |
951 | ufep.FinishLDR(tweak, range, endPoints[0], endPoints[1]); |
952 | |
953 | for (int refine = 0; refine < numRefineRounds; refine++) |
954 | { |
955 | EndpointRefiner<3> refiner; |
956 | refiner.Init(range, channelWeights); |
957 | |
958 | TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, range, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &refiner, &rtn); |
959 | |
960 | if (refine != numRefineRounds - 1) |
961 | refiner.GetRefinedEndpointsLDR(endPoints, &rtn); |
962 | } |
963 | } |
964 | } |
965 | } |
966 | |
967 | for (int block = 0; block < ParallelMath::ParallelSize; block++) |
968 | { |
969 | ParallelMath::ScalarUInt16 range = ParallelMath::Extract(bestRange, block); |
970 | assert(range == 3 || range == 4); |
971 | |
972 | ParallelMath::ScalarUInt16 compressedEP[2]; |
973 | for (int ep = 0; ep < 2; ep++) |
974 | { |
975 | ParallelMath::ScalarUInt16 endPoint[3]; |
976 | for (int ch = 0; ch < 3; ch++) |
977 | endPoint[ch] = ParallelMath::Extract(bestEndpoints[ep][ch], block); |
978 | |
979 | int compressed = (endPoint[0] & 0xf8) << 8; |
980 | compressed |= (endPoint[1] & 0xfc) << 3; |
981 | compressed |= (endPoint[2] & 0xf8) >> 3; |
982 | |
983 | compressedEP[ep] = static_cast<ParallelMath::ScalarUInt16>(compressed); |
984 | } |
985 | |
986 | int indexOrder[4]; |
987 | |
988 | if (range == 4) |
989 | { |
990 | if (compressedEP[0] == compressedEP[1]) |
991 | { |
992 | indexOrder[0] = 0; |
993 | indexOrder[1] = 0; |
994 | indexOrder[2] = 0; |
995 | indexOrder[3] = 0; |
996 | } |
997 | else if (compressedEP[0] < compressedEP[1]) |
998 | { |
999 | std::swap(compressedEP[0], compressedEP[1]); |
1000 | indexOrder[0] = 1; |
1001 | indexOrder[1] = 3; |
1002 | indexOrder[2] = 2; |
1003 | indexOrder[3] = 0; |
1004 | } |
1005 | else |
1006 | { |
1007 | indexOrder[0] = 0; |
1008 | indexOrder[1] = 2; |
1009 | indexOrder[2] = 3; |
1010 | indexOrder[3] = 1; |
1011 | } |
1012 | } |
1013 | else |
1014 | { |
1015 | assert(range == 3); |
1016 | |
1017 | if (compressedEP[0] > compressedEP[1]) |
1018 | { |
1019 | std::swap(compressedEP[0], compressedEP[1]); |
1020 | indexOrder[0] = 1; |
1021 | indexOrder[1] = 2; |
1022 | indexOrder[2] = 0; |
1023 | } |
1024 | else |
1025 | { |
1026 | indexOrder[0] = 0; |
1027 | indexOrder[1] = 2; |
1028 | indexOrder[2] = 1; |
1029 | } |
1030 | indexOrder[3] = 3; |
1031 | } |
1032 | |
1033 | packedBlocks[0] = static_cast<uint8_t>(compressedEP[0] & 0xff); |
1034 | packedBlocks[1] = static_cast<uint8_t>((compressedEP[0] >> 8) & 0xff); |
1035 | packedBlocks[2] = static_cast<uint8_t>(compressedEP[1] & 0xff); |
1036 | packedBlocks[3] = static_cast<uint8_t>((compressedEP[1] >> 8) & 0xff); |
1037 | |
1038 | for (int i = 0; i < 16; i += 4) |
1039 | { |
1040 | int packedIndexes = 0; |
1041 | for (int subi = 0; subi < 4; subi++) |
1042 | { |
1043 | ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[i + subi], block); |
1044 | packedIndexes |= (indexOrder[index] << (subi * 2)); |
1045 | } |
1046 | |
1047 | packedBlocks[4 + i / 4] = static_cast<uint8_t>(packedIndexes); |
1048 | } |
1049 | |
1050 | packedBlocks += packedBlockStride; |
1051 | } |
1052 | } |
1053 | |
1054 | #endif |
1055 | |