| 1 | /* |
| 2 | Convection Texture Tools |
| 3 | Copyright (c) 2018-2019 Eric Lasota |
| 4 | |
| 5 | Permission is hereby granted, free of charge, to any person obtaining |
| 6 | a copy of this software and associated documentation files (the |
| 7 | "Software"), to deal in the Software without restriction, including |
| 8 | without limitation the rights to use, copy, modify, merge, publish, |
| 9 | distribute, sublicense, and/or sell copies of the Software, and to |
| 10 | permit persons to whom the Software is furnished to do so, subject |
| 11 | to the following conditions: |
| 12 | |
| 13 | The above copyright notice and this permission notice shall be included |
| 14 | in all copies or substantial portions of the Software. |
| 15 | |
| 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
| 17 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
| 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
| 19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY |
| 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
| 21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
| 22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| 23 | |
| 24 | ------------------------------------------------------------------------------------- |
| 25 | |
| 26 | Portions based on DirectX Texture Library (DirectXTex) |
| 27 | |
| 28 | Copyright (c) Microsoft Corporation. All rights reserved. |
| 29 | Licensed under the MIT License. |
| 30 | |
| 31 | http://go.microsoft.com/fwlink/?LinkId=248926 |
| 32 | */ |
| 33 | #include "ConvectionKernels_Config.h" |
| 34 | |
| 35 | #if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL) |
| 36 | |
| 37 | #include "ConvectionKernels_S3TC.h" |
| 38 | |
| 39 | #include "ConvectionKernels_AggregatedError.h" |
| 40 | #include "ConvectionKernels_BCCommon.h" |
| 41 | #include "ConvectionKernels_EndpointRefiner.h" |
| 42 | #include "ConvectionKernels_EndpointSelector.h" |
| 43 | #include "ConvectionKernels_IndexSelector.h" |
| 44 | #include "ConvectionKernels_UnfinishedEndpoints.h" |
| 45 | #include "ConvectionKernels_S3TC_SingleColor.h" |
| 46 | |
| 47 | void cvtt::Internal::S3TCComputer::Init(MFloat& error) |
| 48 | { |
| 49 | error = ParallelMath::MakeFloat(FLT_MAX); |
| 50 | } |
| 51 | |
| 52 | void cvtt::Internal::S3TCComputer::QuantizeTo6Bits(MUInt15& v) |
| 53 | { |
| 54 | MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(253)) + ParallelMath::MakeUInt16(512), 10)); |
| 55 | v = (reduced << 2) | ParallelMath::RightShift(reduced, 4); |
| 56 | } |
| 57 | |
| 58 | void cvtt::Internal::S3TCComputer::QuantizeTo5Bits(MUInt15& v) |
| 59 | { |
| 60 | MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(249)) + ParallelMath::MakeUInt16(1024), 11)); |
| 61 | v = (reduced << 3) | ParallelMath::RightShift(reduced, 2); |
| 62 | } |
| 63 | |
| 64 | void cvtt::Internal::S3TCComputer::QuantizeTo565(MUInt15 endPoint[3]) |
| 65 | { |
| 66 | QuantizeTo5Bits(endPoint[0]); |
| 67 | QuantizeTo6Bits(endPoint[1]); |
| 68 | QuantizeTo5Bits(endPoint[2]); |
| 69 | } |
| 70 | |
| 71 | cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidFactorForSpan(const MSInt16& span) |
| 72 | { |
| 73 | return ParallelMath::Abs(ParallelMath::ToFloat(span)) * 0.03f; |
| 74 | } |
| 75 | |
| 76 | cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d) |
| 77 | { |
| 78 | MFloat absDiff = ParallelMath::Abs(ParallelMath::ToFloat(ParallelMath::LosslessCast<MSInt16>::Cast(a) - ParallelMath::LosslessCast<MSInt16>::Cast(b))); |
| 79 | absDiff = absDiff + d; |
| 80 | return absDiff * absDiff; |
| 81 | } |
| 82 | |
| 83 | void cvtt::Internal::S3TCComputer::TestSingleColor(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], int range, const float* channelWeights, |
| 84 | MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn) |
| 85 | { |
| 86 | float channelWeightsSq[3]; |
| 87 | |
| 88 | for (int ch = 0; ch < 3; ch++) |
| 89 | channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch]; |
| 90 | |
| 91 | MUInt15 totals[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) }; |
| 92 | |
| 93 | for (int px = 0; px < 16; px++) |
| 94 | { |
| 95 | for (int ch = 0; ch < 3; ch++) |
| 96 | totals[ch] = totals[ch] + pixels[px][ch]; |
| 97 | } |
| 98 | |
| 99 | MUInt15 average[3]; |
| 100 | for (int ch = 0; ch < 3; ch++) |
| 101 | average[ch] = ParallelMath::RightShift(totals[ch] + ParallelMath::MakeUInt15(8), 4); |
| 102 | |
| 103 | const Tables::S3TCSC::TableEntry* rbTable = NULL; |
| 104 | const Tables::S3TCSC::TableEntry* gTable = NULL; |
| 105 | if (flags & cvtt::Flags::S3TC_Paranoid) |
| 106 | { |
| 107 | if (range == 4) |
| 108 | { |
| 109 | rbTable = Tables::S3TCSC::g_singleColor5_3_p; |
| 110 | gTable = Tables::S3TCSC::g_singleColor6_3_p; |
| 111 | } |
| 112 | else |
| 113 | { |
| 114 | assert(range == 3); |
| 115 | rbTable = Tables::S3TCSC::g_singleColor5_2_p; |
| 116 | gTable = Tables::S3TCSC::g_singleColor6_2_p; |
| 117 | } |
| 118 | } |
| 119 | else |
| 120 | { |
| 121 | if (range == 4) |
| 122 | { |
| 123 | rbTable = Tables::S3TCSC::g_singleColor5_3; |
| 124 | gTable = Tables::S3TCSC::g_singleColor6_3; |
| 125 | } |
| 126 | else |
| 127 | { |
| 128 | assert(range == 3); |
| 129 | rbTable = Tables::S3TCSC::g_singleColor5_2; |
| 130 | gTable = Tables::S3TCSC::g_singleColor6_2; |
| 131 | } |
| 132 | } |
| 133 | |
| 134 | MUInt15 interpolated[3]; |
| 135 | MUInt15 eps[2][3]; |
| 136 | MSInt16 spans[3]; |
| 137 | for (int i = 0; i < ParallelMath::ParallelSize; i++) |
| 138 | { |
| 139 | for (int ch = 0; ch < 3; ch++) |
| 140 | { |
| 141 | uint16_t avg = ParallelMath::Extract(average[ch], i); |
| 142 | const Tables::S3TCSC::TableEntry& tableEntry = ((ch == 1) ? gTable[avg] : rbTable[avg]); |
| 143 | ParallelMath::PutUInt15(eps[0][ch], i, tableEntry.m_min); |
| 144 | ParallelMath::PutUInt15(eps[1][ch], i, tableEntry.m_max); |
| 145 | ParallelMath::PutUInt15(interpolated[ch], i, tableEntry.m_actualColor); |
| 146 | ParallelMath::PutSInt16(spans[ch], i, tableEntry.m_span); |
| 147 | } |
| 148 | } |
| 149 | |
| 150 | MFloat error = ParallelMath::MakeFloatZero(); |
| 151 | if (flags & cvtt::Flags::S3TC_Paranoid) |
| 152 | { |
| 153 | MFloat spanParanoidFactors[3]; |
| 154 | for (int ch = 0; ch < 3; ch++) |
| 155 | spanParanoidFactors[ch] = ParanoidFactorForSpan(spans[ch]); |
| 156 | |
| 157 | for (int px = 0; px < 16; px++) |
| 158 | { |
| 159 | for (int ch = 0; ch < 3; ch++) |
| 160 | error = error + ParanoidDiff(interpolated[ch], pixels[px][ch], spanParanoidFactors[ch]) * channelWeightsSq[ch]; |
| 161 | } |
| 162 | } |
| 163 | else |
| 164 | { |
| 165 | for (int px = 0; px < 16; px++) |
| 166 | { |
| 167 | for (int ch = 0; ch < 3; ch++) |
| 168 | error = error + ParallelMath::ToFloat(ParallelMath::SqDiffUInt8(interpolated[ch], pixels[px][ch])) * channelWeightsSq[ch]; |
| 169 | } |
| 170 | } |
| 171 | |
| 172 | ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError); |
| 173 | ParallelMath::Int16CompFlag better16 = ParallelMath::FloatFlagToInt16(better); |
| 174 | |
| 175 | if (ParallelMath::AnySet(better16)) |
| 176 | { |
| 177 | bestError = ParallelMath::Min(bestError, error); |
| 178 | for (int epi = 0; epi < 2; epi++) |
| 179 | for (int ch = 0; ch < 3; ch++) |
| 180 | ParallelMath::ConditionalSet(bestEndpoints[epi][ch], better16, eps[epi][ch]); |
| 181 | |
| 182 | MUInt15 vindexes = ParallelMath::MakeUInt15(1); |
| 183 | for (int px = 0; px < 16; px++) |
| 184 | ParallelMath::ConditionalSet(bestIndexes[px], better16, vindexes); |
| 185 | |
| 186 | ParallelMath::ConditionalSet(bestRange, better16, ParallelMath::MakeUInt15(range)); |
| 187 | } |
| 188 | } |
| 189 | |
| 190 | void cvtt::Internal::S3TCComputer::TestEndpoints(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], const MUInt15 unquantizedEndPoints[2][3], int range, const float* channelWeights, |
| 191 | MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, EndpointRefiner<3> *refiner, const ParallelMath::RoundTowardNearestForScope *rtn) |
| 192 | { |
| 193 | float channelWeightsSq[3]; |
| 194 | |
| 195 | for (int ch = 0; ch < 3; ch++) |
| 196 | channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch]; |
| 197 | |
| 198 | MUInt15 endPoints[2][3]; |
| 199 | |
| 200 | for (int ep = 0; ep < 2; ep++) |
| 201 | for (int ch = 0; ch < 3; ch++) |
| 202 | endPoints[ep][ch] = unquantizedEndPoints[ep][ch]; |
| 203 | |
| 204 | QuantizeTo565(endPoints[0]); |
| 205 | QuantizeTo565(endPoints[1]); |
| 206 | |
| 207 | IndexSelector<3> selector; |
| 208 | selector.Init<false>(channelWeights, endPoints, range); |
| 209 | |
| 210 | MUInt15 indexes[16]; |
| 211 | |
| 212 | MFloat paranoidFactors[3]; |
| 213 | for (int ch = 0; ch < 3; ch++) |
| 214 | paranoidFactors[ch] = ParanoidFactorForSpan(ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[0][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[1][ch])); |
| 215 | |
| 216 | MFloat error = ParallelMath::MakeFloatZero(); |
| 217 | AggregatedError<3> aggError; |
| 218 | for (int px = 0; px < 16; px++) |
| 219 | { |
| 220 | MUInt15 index = selector.SelectIndexLDR(floatPixels[px], rtn); |
| 221 | indexes[px] = index; |
| 222 | |
| 223 | if (refiner) |
| 224 | refiner->ContributeUnweightedPW(preWeightedPixels[px], index); |
| 225 | |
| 226 | MUInt15 reconstructed[3]; |
| 227 | selector.ReconstructLDRPrecise(index, reconstructed); |
| 228 | |
| 229 | if (flags & Flags::S3TC_Paranoid) |
| 230 | { |
| 231 | for (int ch = 0; ch < 3; ch++) |
| 232 | error = error + ParanoidDiff(reconstructed[ch], pixels[px][ch], paranoidFactors[ch]) * channelWeightsSq[ch]; |
| 233 | } |
| 234 | else |
| 235 | BCCommon::ComputeErrorLDR<3>(flags, reconstructed, pixels[px], aggError); |
| 236 | } |
| 237 | |
| 238 | if (!(flags & Flags::S3TC_Paranoid)) |
| 239 | error = aggError.Finalize(flags, channelWeightsSq); |
| 240 | |
| 241 | ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError); |
| 242 | |
| 243 | if (ParallelMath::AnySet(better)) |
| 244 | { |
| 245 | ParallelMath::Int16CompFlag betterInt16 = ParallelMath::FloatFlagToInt16(better); |
| 246 | |
| 247 | ParallelMath::ConditionalSet(bestError, better, error); |
| 248 | |
| 249 | for (int ep = 0; ep < 2; ep++) |
| 250 | for (int ch = 0; ch < 3; ch++) |
| 251 | ParallelMath::ConditionalSet(bestEndpoints[ep][ch], betterInt16, endPoints[ep][ch]); |
| 252 | |
| 253 | for (int px = 0; px < 16; px++) |
| 254 | ParallelMath::ConditionalSet(bestIndexes[px], betterInt16, indexes[px]); |
| 255 | |
| 256 | ParallelMath::ConditionalSet(bestRange, betterInt16, ParallelMath::MakeUInt15(static_cast<uint16_t>(range))); |
| 257 | } |
| 258 | } |
| 259 | |
| 260 | void cvtt::Internal::S3TCComputer::TestCounts(uint32_t flags, const int *counts, int nCounts, const MUInt15 &numElements, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], bool alphaTest, |
| 261 | const MFloat floatSortedInputs[16][4], const MFloat preWeightedFloatSortedInputs[16][4], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, |
| 262 | const ParallelMath::RoundTowardNearestForScope* rtn) |
| 263 | { |
| 264 | UNREFERENCED_PARAMETER(alphaTest); |
| 265 | UNREFERENCED_PARAMETER(flags); |
| 266 | |
| 267 | EndpointRefiner<3> refiner; |
| 268 | |
| 269 | refiner.Init(nCounts, channelWeights); |
| 270 | |
| 271 | bool escape = false; |
| 272 | int e = 0; |
| 273 | for (int i = 0; i < nCounts; i++) |
| 274 | { |
| 275 | for (int n = 0; n < counts[i]; n++) |
| 276 | { |
| 277 | ParallelMath::Int16CompFlag valid = ParallelMath::Less(ParallelMath::MakeUInt15(static_cast<uint16_t>(n)), numElements); |
| 278 | if (!ParallelMath::AnySet(valid)) |
| 279 | { |
| 280 | escape = true; |
| 281 | break; |
| 282 | } |
| 283 | |
| 284 | if (ParallelMath::AllSet(valid)) |
| 285 | refiner.ContributeUnweightedPW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i))); |
| 286 | else |
| 287 | { |
| 288 | MFloat weight = ParallelMath::Select(ParallelMath::Int16FlagToFloat(valid), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(0.0f)); |
| 289 | refiner.ContributePW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), weight); |
| 290 | } |
| 291 | } |
| 292 | |
| 293 | if (escape) |
| 294 | break; |
| 295 | } |
| 296 | |
| 297 | MUInt15 endPoints[2][3]; |
| 298 | refiner.GetRefinedEndpointsLDR(endPoints, rtn); |
| 299 | |
| 300 | TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, nCounts, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, NULL, rtn); |
| 301 | } |
| 302 | |
| 303 | void cvtt::Internal::S3TCComputer::PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride) |
| 304 | { |
| 305 | UNREFERENCED_PARAMETER(flags); |
| 306 | ParallelMath::RoundTowardNearestForScope rtn; |
| 307 | |
| 308 | float weights[1] = { 1.0f }; |
| 309 | |
| 310 | MUInt15 pixels[16]; |
| 311 | MFloat floatPixels[16]; |
| 312 | |
| 313 | for (int px = 0; px < 16; px++) |
| 314 | { |
| 315 | ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]); |
| 316 | floatPixels[px] = ParallelMath::ToFloat(pixels[px]); |
| 317 | } |
| 318 | |
| 319 | MUInt15 ep[2][1] = { { ParallelMath::MakeUInt15(0) },{ ParallelMath::MakeUInt15(255) } }; |
| 320 | |
| 321 | IndexSelector<1> selector; |
| 322 | selector.Init<false>(weights, ep, 16); |
| 323 | |
| 324 | MUInt15 indexes[16]; |
| 325 | |
| 326 | for (int px = 0; px < 16; px++) |
| 327 | indexes[px] = selector.SelectIndexLDR(&floatPixels[px], &rtn); |
| 328 | |
| 329 | for (int block = 0; block < ParallelMath::ParallelSize; block++) |
| 330 | { |
| 331 | for (int px = 0; px < 16; px += 2) |
| 332 | { |
| 333 | int index0 = ParallelMath::Extract(indexes[px], block); |
| 334 | int index1 = ParallelMath::Extract(indexes[px + 1], block); |
| 335 | |
| 336 | packedBlocks[px / 2] = static_cast<uint8_t>(index0 | (index1 << 4)); |
| 337 | } |
| 338 | |
| 339 | packedBlocks += packedBlockStride; |
| 340 | } |
| 341 | } |
| 342 | |
| 343 | void cvtt::Internal::S3TCComputer::PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds) |
| 344 | { |
| 345 | if (maxTweakRounds < 1) |
| 346 | maxTweakRounds = 1; |
| 347 | |
| 348 | if (numRefineRounds < 1) |
| 349 | numRefineRounds = 1; |
| 350 | |
| 351 | ParallelMath::RoundTowardNearestForScope rtn; |
| 352 | |
| 353 | float oneWeight[1] = { 1.0f }; |
| 354 | |
| 355 | MUInt15 pixels[16]; |
| 356 | MFloat floatPixels[16]; |
| 357 | |
| 358 | MUInt15 highTerminal = isSigned ? ParallelMath::MakeUInt15(254) : ParallelMath::MakeUInt15(255); |
| 359 | MUInt15 highTerminalMinusOne = highTerminal - ParallelMath::MakeUInt15(1); |
| 360 | |
| 361 | for (int px = 0; px < 16; px++) |
| 362 | { |
| 363 | ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]); |
| 364 | |
| 365 | if (isSigned) |
| 366 | pixels[px] = ParallelMath::Min(pixels[px], highTerminal); |
| 367 | |
| 368 | floatPixels[px] = ParallelMath::ToFloat(pixels[px]); |
| 369 | } |
| 370 | |
| 371 | MUInt15 sortedPixels[16]; |
| 372 | for (int px = 0; px < 16; px++) |
| 373 | sortedPixels[px] = pixels[px]; |
| 374 | |
| 375 | for (int sortEnd = 15; sortEnd > 0; sortEnd--) |
| 376 | { |
| 377 | for (int sortOffset = 0; sortOffset < sortEnd; sortOffset++) |
| 378 | { |
| 379 | MUInt15 a = sortedPixels[sortOffset]; |
| 380 | MUInt15 b = sortedPixels[sortOffset + 1]; |
| 381 | |
| 382 | sortedPixels[sortOffset] = ParallelMath::Min(a, b); |
| 383 | sortedPixels[sortOffset + 1] = ParallelMath::Max(a, b); |
| 384 | } |
| 385 | } |
| 386 | |
| 387 | MUInt15 zero = ParallelMath::MakeUInt15(0); |
| 388 | MUInt15 one = ParallelMath::MakeUInt15(1); |
| 389 | |
| 390 | MUInt15 bestIsFullRange = zero; |
| 391 | MFloat bestError = ParallelMath::MakeFloat(FLT_MAX); |
| 392 | MUInt15 bestEP[2] = { zero, zero }; |
| 393 | MUInt15 bestIndexes[16] = { |
| 394 | zero, zero, zero, zero, |
| 395 | zero, zero, zero, zero, |
| 396 | zero, zero, zero, zero, |
| 397 | zero, zero, zero, zero |
| 398 | }; |
| 399 | |
| 400 | // Full-precision |
| 401 | { |
| 402 | MUInt15 minEP = sortedPixels[0]; |
| 403 | MUInt15 maxEP = sortedPixels[15]; |
| 404 | |
| 405 | MFloat base[1] = { ParallelMath::ToFloat(minEP) }; |
| 406 | MFloat offset[1] = { ParallelMath::ToFloat(maxEP - minEP) }; |
| 407 | |
| 408 | UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset); |
| 409 | |
| 410 | int numTweakRounds = BCCommon::TweakRoundsForRange(8); |
| 411 | if (numTweakRounds > maxTweakRounds) |
| 412 | numTweakRounds = maxTweakRounds; |
| 413 | |
| 414 | for (int tweak = 0; tweak < numTweakRounds; tweak++) |
| 415 | { |
| 416 | MUInt15 ep[2][1]; |
| 417 | |
| 418 | ufep.FinishLDR(tweak, 8, ep[0], ep[1]); |
| 419 | |
| 420 | for (int refinePass = 0; refinePass < numRefineRounds; refinePass++) |
| 421 | { |
| 422 | EndpointRefiner<1> refiner; |
| 423 | refiner.Init(8, oneWeight); |
| 424 | |
| 425 | if (isSigned) |
| 426 | for (int epi = 0; epi < 2; epi++) |
| 427 | ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal); |
| 428 | |
| 429 | IndexSelector<1> indexSelector; |
| 430 | indexSelector.Init<false>(oneWeight, ep, 8); |
| 431 | |
| 432 | MUInt15 indexes[16]; |
| 433 | |
| 434 | AggregatedError<1> aggError; |
| 435 | for (int px = 0; px < 16; px++) |
| 436 | { |
| 437 | MUInt15 index = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn); |
| 438 | |
| 439 | MUInt15 reconstructedPixel; |
| 440 | |
| 441 | indexSelector.ReconstructLDRPrecise(index, &reconstructedPixel); |
| 442 | BCCommon::ComputeErrorLDR<1>(flags, &reconstructedPixel, &pixels[px], aggError); |
| 443 | |
| 444 | if (refinePass != numRefineRounds - 1) |
| 445 | refiner.ContributeUnweightedPW(&floatPixels[px], index); |
| 446 | |
| 447 | indexes[px] = index; |
| 448 | } |
| 449 | MFloat error = aggError.Finalize(flags | Flags::Uniform, oneWeight); |
| 450 | |
| 451 | ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError); |
| 452 | ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter); |
| 453 | |
| 454 | if (ParallelMath::AnySet(errorBetter16)) |
| 455 | { |
| 456 | bestError = ParallelMath::Min(error, bestError); |
| 457 | ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, one); |
| 458 | for (int px = 0; px < 16; px++) |
| 459 | ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]); |
| 460 | |
| 461 | for (int epi = 0; epi < 2; epi++) |
| 462 | ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]); |
| 463 | } |
| 464 | |
| 465 | if (refinePass != numRefineRounds - 1) |
| 466 | refiner.GetRefinedEndpointsLDR(ep, &rtn); |
| 467 | } |
| 468 | } |
| 469 | } |
| 470 | |
| 471 | // Reduced precision with special endpoints |
| 472 | { |
| 473 | MUInt15 bestHeuristicMin = sortedPixels[0]; |
| 474 | MUInt15 bestHeuristicMax = sortedPixels[15]; |
| 475 | |
| 476 | ParallelMath::Int16CompFlag canTryClipping; |
| 477 | |
| 478 | // In reduced precision, we want try putting endpoints at the reserved indexes at the ends. |
| 479 | // The heuristic we use is to assign indexes to the end as long as they aren't off by more than half of the index range. |
| 480 | // This will usually not find anything, but it's cheap to check. |
| 481 | |
| 482 | { |
| 483 | MUInt15 largestPossibleRange = bestHeuristicMax - bestHeuristicMin; // Max: 255 |
| 484 | MUInt15 lowestPossibleClearance = ParallelMath::Min(bestHeuristicMin, static_cast<MUInt15>(highTerminal - bestHeuristicMax)); |
| 485 | |
| 486 | MUInt15 lowestPossibleClearanceTimes10 = (lowestPossibleClearance << 2) + (lowestPossibleClearance << 4); |
| 487 | canTryClipping = ParallelMath::LessOrEqual(lowestPossibleClearanceTimes10, largestPossibleRange); |
| 488 | } |
| 489 | |
| 490 | if (ParallelMath::AnySet(canTryClipping)) |
| 491 | { |
| 492 | MUInt15 lowClearances[16]; |
| 493 | MUInt15 highClearances[16]; |
| 494 | MUInt15 bestSkipCount = ParallelMath::MakeUInt15(0); |
| 495 | |
| 496 | lowClearances[0] = highClearances[0] = ParallelMath::MakeUInt15(0); |
| 497 | |
| 498 | for (int px = 1; px < 16; px++) |
| 499 | { |
| 500 | lowClearances[px] = sortedPixels[px - 1]; |
| 501 | highClearances[px] = highTerminal - sortedPixels[16 - px]; |
| 502 | } |
| 503 | |
| 504 | for (uint16_t firstIndex = 0; firstIndex < 16; firstIndex++) |
| 505 | { |
| 506 | uint16_t numSkippedLow = firstIndex; |
| 507 | |
| 508 | MUInt15 lowClearance = lowClearances[firstIndex]; |
| 509 | |
| 510 | for (uint16_t lastIndex = firstIndex; lastIndex < 16; lastIndex++) |
| 511 | { |
| 512 | uint16_t numSkippedHigh = 15 - lastIndex; |
| 513 | uint16_t numSkipped = numSkippedLow + numSkippedHigh; |
| 514 | |
| 515 | MUInt15 numSkippedV = ParallelMath::MakeUInt15(numSkipped); |
| 516 | |
| 517 | ParallelMath::Int16CompFlag areMoreSkipped = ParallelMath::Less(bestSkipCount, numSkippedV); |
| 518 | |
| 519 | if (!ParallelMath::AnySet(areMoreSkipped)) |
| 520 | continue; |
| 521 | |
| 522 | MUInt15 clearance = ParallelMath::Max(highClearances[numSkippedHigh], lowClearance); |
| 523 | MUInt15 clearanceTimes10 = (clearance << 2) + (clearance << 4); |
| 524 | |
| 525 | MUInt15 range = sortedPixels[lastIndex] - sortedPixels[firstIndex]; |
| 526 | |
| 527 | ParallelMath::Int16CompFlag isBetter = (areMoreSkipped & ParallelMath::LessOrEqual(clearanceTimes10, range)); |
| 528 | ParallelMath::ConditionalSet(bestHeuristicMin, isBetter, sortedPixels[firstIndex]); |
| 529 | ParallelMath::ConditionalSet(bestHeuristicMax, isBetter, sortedPixels[lastIndex]); |
| 530 | } |
| 531 | } |
| 532 | } |
| 533 | |
| 534 | MUInt15 bestSimpleMin = one; |
| 535 | MUInt15 bestSimpleMax = highTerminalMinusOne; |
| 536 | |
| 537 | for (int px = 0; px < 16; px++) |
| 538 | { |
| 539 | ParallelMath::ConditionalSet(bestSimpleMin, ParallelMath::Less(zero, sortedPixels[15 - px]), sortedPixels[15 - px]); |
| 540 | ParallelMath::ConditionalSet(bestSimpleMax, ParallelMath::Less(sortedPixels[px], highTerminal), sortedPixels[px]); |
| 541 | } |
| 542 | |
| 543 | MUInt15 minEPs[2] = { bestSimpleMin, bestHeuristicMin }; |
| 544 | MUInt15 maxEPs[2] = { bestSimpleMax, bestHeuristicMax }; |
| 545 | |
| 546 | int minEPRange = 2; |
| 547 | if (ParallelMath::AllSet(ParallelMath::Equal(minEPs[0], minEPs[1]))) |
| 548 | minEPRange = 1; |
| 549 | |
| 550 | int maxEPRange = 2; |
| 551 | if (ParallelMath::AllSet(ParallelMath::Equal(maxEPs[0], maxEPs[1]))) |
| 552 | maxEPRange = 1; |
| 553 | |
| 554 | for (int minEPIndex = 0; minEPIndex < minEPRange; minEPIndex++) |
| 555 | { |
| 556 | for (int maxEPIndex = 0; maxEPIndex < maxEPRange; maxEPIndex++) |
| 557 | { |
| 558 | MFloat base[1] = { ParallelMath::ToFloat(minEPs[minEPIndex]) }; |
| 559 | MFloat offset[1] = { ParallelMath::ToFloat(maxEPs[maxEPIndex] - minEPs[minEPIndex]) }; |
| 560 | |
| 561 | UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset); |
| 562 | |
| 563 | int numTweakRounds = BCCommon::TweakRoundsForRange(6); |
| 564 | if (numTweakRounds > maxTweakRounds) |
| 565 | numTweakRounds = maxTweakRounds; |
| 566 | |
| 567 | for (int tweak = 0; tweak < numTweakRounds; tweak++) |
| 568 | { |
| 569 | MUInt15 ep[2][1]; |
| 570 | |
| 571 | ufep.FinishLDR(tweak, 8, ep[0], ep[1]); |
| 572 | |
| 573 | for (int refinePass = 0; refinePass < numRefineRounds; refinePass++) |
| 574 | { |
| 575 | EndpointRefiner<1> refiner; |
| 576 | refiner.Init(6, oneWeight); |
| 577 | |
| 578 | if (isSigned) |
| 579 | for (int epi = 0; epi < 2; epi++) |
| 580 | ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal); |
| 581 | |
| 582 | IndexSelector<1> indexSelector; |
| 583 | indexSelector.Init<false>(oneWeight, ep, 6); |
| 584 | |
| 585 | MUInt15 indexes[16]; |
| 586 | MFloat error = ParallelMath::MakeFloatZero(); |
| 587 | |
| 588 | for (int px = 0; px < 16; px++) |
| 589 | { |
| 590 | MUInt15 selectedIndex = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn); |
| 591 | |
| 592 | MUInt15 reconstructedPixel; |
| 593 | |
| 594 | indexSelector.ReconstructLDRPrecise(selectedIndex, &reconstructedPixel); |
| 595 | |
| 596 | MFloat zeroError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &zero, &pixels[px], 1, oneWeight); |
| 597 | MFloat highTerminalError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &highTerminal, &pixels[px], 1, oneWeight); |
| 598 | MFloat selectedIndexError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &reconstructedPixel, &pixels[px], 1, oneWeight); |
| 599 | |
| 600 | MFloat bestPixelError = zeroError; |
| 601 | MUInt15 index = ParallelMath::MakeUInt15(6); |
| 602 | |
| 603 | ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(ParallelMath::Less(highTerminalError, bestPixelError)), ParallelMath::MakeUInt15(7)); |
| 604 | bestPixelError = ParallelMath::Min(bestPixelError, highTerminalError); |
| 605 | |
| 606 | ParallelMath::FloatCompFlag selectedIndexBetter = ParallelMath::Less(selectedIndexError, bestPixelError); |
| 607 | |
| 608 | if (ParallelMath::AllSet(selectedIndexBetter)) |
| 609 | { |
| 610 | if (refinePass != numRefineRounds - 1) |
| 611 | refiner.ContributeUnweightedPW(&floatPixels[px], selectedIndex); |
| 612 | } |
| 613 | else |
| 614 | { |
| 615 | MFloat refineWeight = ParallelMath::Select(selectedIndexBetter, ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloatZero()); |
| 616 | |
| 617 | if (refinePass != numRefineRounds - 1) |
| 618 | refiner.ContributePW(&floatPixels[px], selectedIndex, refineWeight); |
| 619 | } |
| 620 | |
| 621 | ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(selectedIndexBetter), selectedIndex); |
| 622 | bestPixelError = ParallelMath::Min(bestPixelError, selectedIndexError); |
| 623 | |
| 624 | error = error + bestPixelError; |
| 625 | |
| 626 | indexes[px] = index; |
| 627 | } |
| 628 | |
| 629 | ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError); |
| 630 | ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter); |
| 631 | |
| 632 | if (ParallelMath::AnySet(errorBetter16)) |
| 633 | { |
| 634 | bestError = ParallelMath::Min(error, bestError); |
| 635 | ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, zero); |
| 636 | for (int px = 0; px < 16; px++) |
| 637 | ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]); |
| 638 | |
| 639 | for (int epi = 0; epi < 2; epi++) |
| 640 | ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]); |
| 641 | } |
| 642 | |
| 643 | if (refinePass != numRefineRounds - 1) |
| 644 | refiner.GetRefinedEndpointsLDR(ep, &rtn); |
| 645 | } |
| 646 | } |
| 647 | } |
| 648 | } |
| 649 | } |
| 650 | |
| 651 | for (int block = 0; block < ParallelMath::ParallelSize; block++) |
| 652 | { |
| 653 | int ep0 = ParallelMath::Extract(bestEP[0], block); |
| 654 | int ep1 = ParallelMath::Extract(bestEP[1], block); |
| 655 | int isFullRange = ParallelMath::Extract(bestIsFullRange, block); |
| 656 | |
| 657 | if (isSigned) |
| 658 | { |
| 659 | ep0 -= 127; |
| 660 | ep1 -= 127; |
| 661 | |
| 662 | assert(ep0 >= -127 && ep0 <= 127); |
| 663 | assert(ep1 >= -127 && ep1 <= 127); |
| 664 | } |
| 665 | |
| 666 | |
| 667 | bool swapEndpoints = (isFullRange != 0) != (ep0 > ep1); |
| 668 | |
| 669 | if (swapEndpoints) |
| 670 | std::swap(ep0, ep1); |
| 671 | |
| 672 | uint16_t dumpBits = 0; |
| 673 | int dumpBitsOffset = 0; |
| 674 | int dumpByteOffset = 2; |
| 675 | packedBlocks[0] = static_cast<uint8_t>(ep0 & 0xff); |
| 676 | packedBlocks[1] = static_cast<uint8_t>(ep1 & 0xff); |
| 677 | |
| 678 | int maxValue = (isFullRange != 0) ? 7 : 5; |
| 679 | |
| 680 | for (int px = 0; px < 16; px++) |
| 681 | { |
| 682 | int index = ParallelMath::Extract(bestIndexes[px], block); |
| 683 | |
| 684 | if (swapEndpoints && index <= maxValue) |
| 685 | index = maxValue - index; |
| 686 | |
| 687 | if (index != 0) |
| 688 | { |
| 689 | if (index == maxValue) |
| 690 | index = 1; |
| 691 | else if (index < maxValue) |
| 692 | index++; |
| 693 | } |
| 694 | |
| 695 | assert(index >= 0 && index < 8); |
| 696 | |
| 697 | dumpBits |= static_cast<uint16_t>(index << dumpBitsOffset); |
| 698 | dumpBitsOffset += 3; |
| 699 | |
| 700 | if (dumpBitsOffset >= 8) |
| 701 | { |
| 702 | assert(dumpByteOffset < 8); |
| 703 | packedBlocks[dumpByteOffset] = static_cast<uint8_t>(dumpBits & 0xff); |
| 704 | dumpBits >>= 8; |
| 705 | dumpBitsOffset -= 8; |
| 706 | dumpByteOffset++; |
| 707 | } |
| 708 | } |
| 709 | |
| 710 | assert(dumpBitsOffset == 0); |
| 711 | assert(dumpByteOffset == 8); |
| 712 | |
| 713 | packedBlocks += packedBlockStride; |
| 714 | } |
| 715 | } |
| 716 | |
| 717 | void cvtt::Internal::S3TCComputer::PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds) |
| 718 | { |
| 719 | ParallelMath::RoundTowardNearestForScope rtn; |
| 720 | |
| 721 | if (numRefineRounds < 1) |
| 722 | numRefineRounds = 1; |
| 723 | |
| 724 | if (maxTweakRounds < 1) |
| 725 | maxTweakRounds = 1; |
| 726 | |
| 727 | EndpointSelector<3, 8> endpointSelector; |
| 728 | |
| 729 | MUInt15 pixels[16][4]; |
| 730 | MFloat floatPixels[16][4]; |
| 731 | |
| 732 | MFloat preWeightedPixels[16][4]; |
| 733 | |
| 734 | for (int px = 0; px < 16; px++) |
| 735 | { |
| 736 | for (int ch = 0; ch < 4; ch++) |
| 737 | ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]); |
| 738 | } |
| 739 | |
| 740 | for (int px = 0; px < 16; px++) |
| 741 | { |
| 742 | for (int ch = 0; ch < 4; ch++) |
| 743 | floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]); |
| 744 | } |
| 745 | |
| 746 | if (alphaTest) |
| 747 | { |
| 748 | MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(floor(alphaThreshold * 255.0f + 0.5f))); |
| 749 | |
| 750 | for (int px = 0; px < 16; px++) |
| 751 | { |
| 752 | ParallelMath::Int16CompFlag belowThreshold = ParallelMath::Less(pixels[px][3], threshold); |
| 753 | pixels[px][3] = ParallelMath::Select(belowThreshold, ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(255)); |
| 754 | } |
| 755 | } |
| 756 | |
| 757 | BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights); |
| 758 | |
| 759 | MUInt15 minAlpha = ParallelMath::MakeUInt15(255); |
| 760 | |
| 761 | for (int px = 0; px < 16; px++) |
| 762 | minAlpha = ParallelMath::Min(minAlpha, pixels[px][3]); |
| 763 | |
| 764 | MFloat pixelWeights[16]; |
| 765 | for (int px = 0; px < 16; px++) |
| 766 | { |
| 767 | pixelWeights[px] = ParallelMath::MakeFloat(1.0f); |
| 768 | if (alphaTest) |
| 769 | { |
| 770 | ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255)); |
| 771 | |
| 772 | ParallelMath::ConditionalSet(pixelWeights[px], ParallelMath::Int16FlagToFloat(isTransparent), ParallelMath::MakeFloatZero()); |
| 773 | } |
| 774 | } |
| 775 | |
| 776 | for (int pass = 0; pass < NumEndpointSelectorPasses; pass++) |
| 777 | { |
| 778 | for (int px = 0; px < 16; px++) |
| 779 | endpointSelector.ContributePass(preWeightedPixels[px], pass, pixelWeights[px]); |
| 780 | |
| 781 | endpointSelector.FinishPass(pass); |
| 782 | } |
| 783 | |
| 784 | UnfinishedEndpoints<3> ufep = endpointSelector.GetEndpoints(channelWeights); |
| 785 | |
| 786 | MUInt15 bestEndpoints[2][3]; |
| 787 | MUInt15 bestIndexes[16]; |
| 788 | MUInt15 bestRange = ParallelMath::MakeUInt15(0); |
| 789 | MFloat bestError = ParallelMath::MakeFloat(FLT_MAX); |
| 790 | |
| 791 | for (int px = 0; px < 16; px++) |
| 792 | bestIndexes[px] = ParallelMath::MakeUInt15(0); |
| 793 | |
| 794 | for (int ep = 0; ep < 2; ep++) |
| 795 | for (int ch = 0; ch < 3; ch++) |
| 796 | bestEndpoints[ep][ch] = ParallelMath::MakeUInt15(0); |
| 797 | |
| 798 | if (exhaustive) |
| 799 | { |
| 800 | MSInt16 sortBins[16]; |
| 801 | |
| 802 | { |
| 803 | // Compute an 11-bit index, change it to signed, stuff it in the high bits of the sort bins, |
| 804 | // and pack the original indexes into the low bits. |
| 805 | |
| 806 | MUInt15 sortEP[2][3]; |
| 807 | ufep.FinishLDR(0, 11, sortEP[0], sortEP[1]); |
| 808 | |
| 809 | IndexSelector<3> sortSelector; |
| 810 | sortSelector.Init<false>(channelWeights, sortEP, 1 << 11); |
| 811 | |
| 812 | for (int16_t px = 0; px < 16; px++) |
| 813 | { |
| 814 | MSInt16 sortBin = ParallelMath::LosslessCast<MSInt16>::Cast(sortSelector.SelectIndexLDR(floatPixels[px], &rtn) << 4); |
| 815 | |
| 816 | if (alphaTest) |
| 817 | { |
| 818 | ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255)); |
| 819 | |
| 820 | ParallelMath::ConditionalSet(sortBin, isTransparent, ParallelMath::MakeSInt16(-16)); // 0xfff0 |
| 821 | } |
| 822 | |
| 823 | sortBin = sortBin + ParallelMath::MakeSInt16(px); |
| 824 | |
| 825 | sortBins[px] = sortBin; |
| 826 | } |
| 827 | } |
| 828 | |
| 829 | // Sort bins |
| 830 | for (int sortEnd = 1; sortEnd < 16; sortEnd++) |
| 831 | { |
| 832 | for (int sortLoc = sortEnd; sortLoc > 0; sortLoc--) |
| 833 | { |
| 834 | MSInt16 a = sortBins[sortLoc]; |
| 835 | MSInt16 b = sortBins[sortLoc - 1]; |
| 836 | |
| 837 | sortBins[sortLoc] = ParallelMath::Max(a, b); |
| 838 | sortBins[sortLoc - 1] = ParallelMath::Min(a, b); |
| 839 | } |
| 840 | } |
| 841 | |
| 842 | MUInt15 firstElement = ParallelMath::MakeUInt15(0); |
| 843 | for (uint16_t e = 0; e < 16; e++) |
| 844 | { |
| 845 | ParallelMath::Int16CompFlag isInvalid = ParallelMath::Less(sortBins[e], ParallelMath::MakeSInt16(0)); |
| 846 | ParallelMath::ConditionalSet(firstElement, isInvalid, ParallelMath::MakeUInt15(e + 1)); |
| 847 | if (!ParallelMath::AnySet(isInvalid)) |
| 848 | break; |
| 849 | } |
| 850 | |
| 851 | MUInt15 numElements = ParallelMath::MakeUInt15(16) - firstElement; |
| 852 | |
| 853 | MUInt15 sortedInputs[16][4]; |
| 854 | MFloat floatSortedInputs[16][4]; |
| 855 | MFloat pwFloatSortedInputs[16][4]; |
| 856 | |
| 857 | for (int e = 0; e < 16; e++) |
| 858 | { |
| 859 | for (int ch = 0; ch < 4; ch++) |
| 860 | sortedInputs[e][ch] = ParallelMath::MakeUInt15(0); |
| 861 | } |
| 862 | |
| 863 | for (int block = 0; block < ParallelMath::ParallelSize; block++) |
| 864 | { |
| 865 | for (int e = ParallelMath::Extract(firstElement, block); e < 16; e++) |
| 866 | { |
| 867 | ParallelMath::ScalarUInt16 sortBin = ParallelMath::Extract(sortBins[e], block); |
| 868 | int originalIndex = (sortBin & 15); |
| 869 | |
| 870 | for (int ch = 0; ch < 4; ch++) |
| 871 | ParallelMath::PutUInt15(sortedInputs[15 - e][ch], block, ParallelMath::Extract(pixels[originalIndex][ch], block)); |
| 872 | } |
| 873 | } |
| 874 | |
| 875 | for (int e = 0; e < 16; e++) |
| 876 | { |
| 877 | for (int ch = 0; ch < 4; ch++) |
| 878 | { |
| 879 | MFloat f = ParallelMath::ToFloat(sortedInputs[e][ch]); |
| 880 | floatSortedInputs[e][ch] = f; |
| 881 | pwFloatSortedInputs[e][ch] = f * channelWeights[ch]; |
| 882 | } |
| 883 | } |
| 884 | |
| 885 | for (int n0 = 0; n0 <= 15; n0++) |
| 886 | { |
| 887 | int remainingFor1 = 16 - n0; |
| 888 | if (remainingFor1 == 16) |
| 889 | remainingFor1 = 15; |
| 890 | |
| 891 | for (int n1 = 0; n1 <= remainingFor1; n1++) |
| 892 | { |
| 893 | int remainingFor2 = 16 - n1 - n0; |
| 894 | if (remainingFor2 == 16) |
| 895 | remainingFor2 = 15; |
| 896 | |
| 897 | for (int n2 = 0; n2 <= remainingFor2; n2++) |
| 898 | { |
| 899 | int n3 = 16 - n2 - n1 - n0; |
| 900 | |
| 901 | if (n3 == 16) |
| 902 | continue; |
| 903 | |
| 904 | int counts[4] = { n0, n1, n2, n3 }; |
| 905 | |
| 906 | TestCounts(flags, counts, 4, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn); |
| 907 | } |
| 908 | } |
| 909 | } |
| 910 | |
| 911 | TestSingleColor(flags, pixels, floatPixels, 4, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn); |
| 912 | |
| 913 | if (alphaTest) |
| 914 | { |
| 915 | for (int n0 = 0; n0 <= 15; n0++) |
| 916 | { |
| 917 | int remainingFor1 = 16 - n0; |
| 918 | if (remainingFor1 == 16) |
| 919 | remainingFor1 = 15; |
| 920 | |
| 921 | for (int n1 = 0; n1 <= remainingFor1; n1++) |
| 922 | { |
| 923 | int n2 = 16 - n1 - n0; |
| 924 | |
| 925 | if (n2 == 16) |
| 926 | continue; |
| 927 | |
| 928 | int counts[3] = { n0, n1, n2 }; |
| 929 | |
| 930 | TestCounts(flags, counts, 3, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn); |
| 931 | } |
| 932 | } |
| 933 | |
| 934 | TestSingleColor(flags, pixels, floatPixels, 3, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn); |
| 935 | } |
| 936 | } |
| 937 | else |
| 938 | { |
| 939 | int minRange = alphaTest ? 3 : 4; |
| 940 | |
| 941 | for (int range = minRange; range <= 4; range++) |
| 942 | { |
| 943 | int tweakRounds = BCCommon::TweakRoundsForRange(range); |
| 944 | if (tweakRounds > maxTweakRounds) |
| 945 | tweakRounds = maxTweakRounds; |
| 946 | |
| 947 | for (int tweak = 0; tweak < tweakRounds; tweak++) |
| 948 | { |
| 949 | MUInt15 endPoints[2][3]; |
| 950 | |
| 951 | ufep.FinishLDR(tweak, range, endPoints[0], endPoints[1]); |
| 952 | |
| 953 | for (int refine = 0; refine < numRefineRounds; refine++) |
| 954 | { |
| 955 | EndpointRefiner<3> refiner; |
| 956 | refiner.Init(range, channelWeights); |
| 957 | |
| 958 | TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, range, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &refiner, &rtn); |
| 959 | |
| 960 | if (refine != numRefineRounds - 1) |
| 961 | refiner.GetRefinedEndpointsLDR(endPoints, &rtn); |
| 962 | } |
| 963 | } |
| 964 | } |
| 965 | } |
| 966 | |
| 967 | for (int block = 0; block < ParallelMath::ParallelSize; block++) |
| 968 | { |
| 969 | ParallelMath::ScalarUInt16 range = ParallelMath::Extract(bestRange, block); |
| 970 | assert(range == 3 || range == 4); |
| 971 | |
| 972 | ParallelMath::ScalarUInt16 compressedEP[2]; |
| 973 | for (int ep = 0; ep < 2; ep++) |
| 974 | { |
| 975 | ParallelMath::ScalarUInt16 endPoint[3]; |
| 976 | for (int ch = 0; ch < 3; ch++) |
| 977 | endPoint[ch] = ParallelMath::Extract(bestEndpoints[ep][ch], block); |
| 978 | |
| 979 | int compressed = (endPoint[0] & 0xf8) << 8; |
| 980 | compressed |= (endPoint[1] & 0xfc) << 3; |
| 981 | compressed |= (endPoint[2] & 0xf8) >> 3; |
| 982 | |
| 983 | compressedEP[ep] = static_cast<ParallelMath::ScalarUInt16>(compressed); |
| 984 | } |
| 985 | |
| 986 | int indexOrder[4]; |
| 987 | |
| 988 | if (range == 4) |
| 989 | { |
| 990 | if (compressedEP[0] == compressedEP[1]) |
| 991 | { |
| 992 | indexOrder[0] = 0; |
| 993 | indexOrder[1] = 0; |
| 994 | indexOrder[2] = 0; |
| 995 | indexOrder[3] = 0; |
| 996 | } |
| 997 | else if (compressedEP[0] < compressedEP[1]) |
| 998 | { |
| 999 | std::swap(compressedEP[0], compressedEP[1]); |
| 1000 | indexOrder[0] = 1; |
| 1001 | indexOrder[1] = 3; |
| 1002 | indexOrder[2] = 2; |
| 1003 | indexOrder[3] = 0; |
| 1004 | } |
| 1005 | else |
| 1006 | { |
| 1007 | indexOrder[0] = 0; |
| 1008 | indexOrder[1] = 2; |
| 1009 | indexOrder[2] = 3; |
| 1010 | indexOrder[3] = 1; |
| 1011 | } |
| 1012 | } |
| 1013 | else |
| 1014 | { |
| 1015 | assert(range == 3); |
| 1016 | |
| 1017 | if (compressedEP[0] > compressedEP[1]) |
| 1018 | { |
| 1019 | std::swap(compressedEP[0], compressedEP[1]); |
| 1020 | indexOrder[0] = 1; |
| 1021 | indexOrder[1] = 2; |
| 1022 | indexOrder[2] = 0; |
| 1023 | } |
| 1024 | else |
| 1025 | { |
| 1026 | indexOrder[0] = 0; |
| 1027 | indexOrder[1] = 2; |
| 1028 | indexOrder[2] = 1; |
| 1029 | } |
| 1030 | indexOrder[3] = 3; |
| 1031 | } |
| 1032 | |
| 1033 | packedBlocks[0] = static_cast<uint8_t>(compressedEP[0] & 0xff); |
| 1034 | packedBlocks[1] = static_cast<uint8_t>((compressedEP[0] >> 8) & 0xff); |
| 1035 | packedBlocks[2] = static_cast<uint8_t>(compressedEP[1] & 0xff); |
| 1036 | packedBlocks[3] = static_cast<uint8_t>((compressedEP[1] >> 8) & 0xff); |
| 1037 | |
| 1038 | for (int i = 0; i < 16; i += 4) |
| 1039 | { |
| 1040 | int packedIndexes = 0; |
| 1041 | for (int subi = 0; subi < 4; subi++) |
| 1042 | { |
| 1043 | ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[i + subi], block); |
| 1044 | packedIndexes |= (indexOrder[index] << (subi * 2)); |
| 1045 | } |
| 1046 | |
| 1047 | packedBlocks[4 + i / 4] = static_cast<uint8_t>(packedIndexes); |
| 1048 | } |
| 1049 | |
| 1050 | packedBlocks += packedBlockStride; |
| 1051 | } |
| 1052 | } |
| 1053 | |
| 1054 | #endif |
| 1055 | |