| 1 | #include <array> |
| 2 | #include <string.h> |
| 3 | #include <limits> |
| 4 | #ifdef __ARM_NEON |
| 5 | # include <arm_neon.h> |
| 6 | #endif |
| 7 | |
| 8 | #include "Dither.hpp" |
| 9 | #include "ForceInline.hpp" |
| 10 | #include "Math.hpp" |
| 11 | #include "ProcessCommon.hpp" |
| 12 | #include "ProcessRGB.hpp" |
| 13 | #include "Tables.hpp" |
| 14 | #include "Vector.hpp" |
| 15 | #if defined __SSE4_1__ || defined __AVX2__ || defined _MSC_VER |
| 16 | # ifdef _MSC_VER |
| 17 | # include <intrin.h> |
| 18 | # include <Windows.h> |
| 19 | # define _bswap(x) _byteswap_ulong(x) |
| 20 | # define _bswap64(x) _byteswap_uint64(x) |
| 21 | # else |
| 22 | # include <x86intrin.h> |
| 23 | # endif |
| 24 | #endif |
| 25 | |
| 26 | #ifndef _bswap |
| 27 | # define _bswap(x) __builtin_bswap32(x) |
| 28 | # define _bswap64(x) __builtin_bswap64(x) |
| 29 | #endif |
| 30 | |
| 31 | static const uint32_t MaxError = 1065369600; // ((38+76+14) * 255)^2 |
| 32 | // common T-/H-mode table |
| 33 | static uint8_t tableTH[8] = { 3, 6, 11, 16, 23, 32, 41, 64 }; |
| 34 | |
| 35 | // thresholds for the early compression-mode decision scheme |
| 36 | // default: 0.03, 0.09, and 0.38 |
| 37 | float ecmd_threshold[3] = { 0.03f, 0.09f, 0.38f }; |
| 38 | |
| 39 | static const uint8_t ModeUndecided = 0; |
| 40 | static const uint8_t ModePlanar = 0x1; |
| 41 | static const uint8_t ModeTH = 0x2; |
| 42 | |
| 43 | const unsigned int R = 2; |
| 44 | const unsigned int G = 1; |
| 45 | const unsigned int B = 0; |
| 46 | |
| 47 | struct Luma |
| 48 | { |
| 49 | #ifdef __AVX2__ |
| 50 | float max, min; |
| 51 | uint8_t minIdx = 255, maxIdx = 255; |
| 52 | __m128i luma8; |
| 53 | #elif defined __ARM_NEON && defined __aarch64__ |
| 54 | float max, min; |
| 55 | uint8_t minIdx = 255, maxIdx = 255; |
| 56 | uint8x16_t luma8; |
| 57 | #else |
| 58 | uint8_t max = 0, min = 255, maxIdx = 0, minIdx = 0; |
| 59 | uint8_t val[16]; |
| 60 | #endif |
| 61 | }; |
| 62 | |
| 63 | #ifdef __AVX2__ |
| 64 | struct Plane |
| 65 | { |
| 66 | uint64_t plane; |
| 67 | uint64_t error; |
| 68 | __m256i sum4; |
| 69 | }; |
| 70 | #endif |
| 71 | |
| 72 | #if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__) |
| 73 | struct Channels |
| 74 | { |
| 75 | #ifdef __AVX2__ |
| 76 | __m128i r8, g8, b8; |
| 77 | #elif defined __ARM_NEON && defined __aarch64__ |
| 78 | uint8x16x2_t r, g, b; |
| 79 | #endif |
| 80 | }; |
| 81 | #endif |
| 82 | |
| 83 | namespace |
| 84 | { |
| 85 | static etcpak_force_inline uint8_t clamp( uint8_t min, int16_t val, uint8_t max ) |
| 86 | { |
| 87 | return val < min ? min : ( val > max ? max : val ); |
| 88 | } |
| 89 | |
| 90 | static etcpak_force_inline uint8_t clampMin( uint8_t min, int16_t val ) |
| 91 | { |
| 92 | return val < min ? min : val; |
| 93 | } |
| 94 | |
| 95 | static etcpak_force_inline uint8_t clampMax( int16_t val, uint8_t max ) |
| 96 | { |
| 97 | return val > max ? max : val; |
| 98 | } |
| 99 | |
| 100 | // slightly faster than std::sort |
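// arr2 is permuted in lockstep with arr1; each newly inserted element additionally records
// its original index in arr2, so the sorted values keep track of where they came from.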
| 101 | static void insertionSort( uint8_t* arr1, uint8_t* arr2 ) |
| 102 | { |
| 103 | for( uint8_t i = 1; i < 16; ++i ) |
| 104 | { |
| 105 | uint8_t value = arr1[i]; |
| 106 | uint8_t hole = i; |
| 107 | |
| 108 | for( ; hole > 0 && value < arr1[hole - 1]; --hole ) |
| 109 | { |
| 110 | arr1[hole] = arr1[hole - 1]; |
| 111 | arr2[hole] = arr2[hole - 1]; |
| 112 | } |
| 113 | arr1[hole] = value; |
| 114 | arr2[hole] = i; |
| 115 | } |
| 116 | } |
| 117 | |
| 118 | //converts indices from |a0|a1|e0|e1|i0|i1|m0|m1|b0|b1|f0|f1|j0|j1|n0|n1|c0|c1|g0|g1|k0|k1|o0|o1|d0|d1|h0|h1|l0|l1|p0|p1| previously used by T- and H-modes |
| 119 | // into |p0|o0|n0|m0|l0|k0|j0|i0|h0|g0|f0|e0|d0|c0|b0|a0|p1|o1|n1|m1|l1|k1|j1|i1|h1|g1|f1|e1|d1|c1|b1|a1| which should be used for all modes. |
| 120 | // NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved. |
| 121 | static etcpak_force_inline int indexConversion( int pixelIndices ) |
| 122 | { |
| 123 | int correctIndices = 0; |
| 124 | int LSB[4][4]; |
| 125 | int MSB[4][4]; |
| 126 | int shift = 0; |
| 127 | for( int y = 3; y >= 0; y-- ) |
| 128 | { |
| 129 | for( int x = 3; x >= 0; x-- ) |
| 130 | { |
| 131 | LSB[x][y] = ( pixelIndices >> shift ) & 1; |
| 132 | shift++; |
| 133 | MSB[x][y] = ( pixelIndices >> shift ) & 1; |
| 134 | shift++; |
| 135 | } |
| 136 | } |
| 137 | shift = 0; |
| 138 | for( int x = 0; x < 4; x++ ) |
| 139 | { |
| 140 | for( int y = 0; y < 4; y++ ) |
| 141 | { |
| 142 | correctIndices |= ( LSB[x][y] << shift ); |
| 143 | correctIndices |= ( MSB[x][y] << ( 16 + shift ) ); |
| 144 | shift++; |
| 145 | } |
| 146 | } |
| 147 | return correctIndices; |
| 148 | } |
| 149 | |
| 150 | // Swapping two RGB-colors |
| 151 | // NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved. |
| 152 | static etcpak_force_inline void swapColors( uint8_t( colors )[2][3] ) |
| 153 | { |
| 154 | uint8_t temp = colors[0][R]; |
| 155 | colors[0][R] = colors[1][R]; |
| 156 | colors[1][R] = temp; |
| 157 | |
| 158 | temp = colors[0][G]; |
| 159 | colors[0][G] = colors[1][G]; |
| 160 | colors[1][G] = temp; |
| 161 | |
| 162 | temp = colors[0][B]; |
| 163 | colors[0][B] = colors[1][B]; |
| 164 | colors[1][B] = temp; |
| 165 | } |
| 166 | |
| 167 | |
| 168 | // calculates quantized colors for T or H modes |
| 169 | void compressColor( uint8_t( currColor )[2][3], uint8_t( quantColor )[2][3], bool t_mode ) |
| 170 | { |
| 171 | if( t_mode ) |
| 172 | { |
| 173 | quantColor[0][R] = clampMax( 15 * ( currColor[0][R] + 8 ) / 255, 15 ); |
| 174 | quantColor[0][G] = clampMax( 15 * ( currColor[0][G] + 8 ) / 255, 15 ); |
| 175 | quantColor[0][B] = clampMax( 15 * ( currColor[0][B] + 8 ) / 255, 15 ); |
| 176 | } |
| 177 | else // clamped to [1,14] to get a wider range |
| 178 | { |
| 179 | quantColor[0][R] = clamp( 1, 15 * ( currColor[0][R] + 8 ) / 255, 14 ); |
| 180 | quantColor[0][G] = clamp( 1, 15 * ( currColor[0][G] + 8 ) / 255, 14 ); |
| 181 | quantColor[0][B] = clamp( 1, 15 * ( currColor[0][B] + 8 ) / 255, 14 ); |
| 182 | } |
| 183 | |
| 184 | // clamped to [1,14] to get a wider range |
| 185 | quantColor[1][R] = clamp( 1, 15 * ( currColor[1][R] + 8 ) / 255, 14 ); |
| 186 | quantColor[1][G] = clamp( 1, 15 * ( currColor[1][G] + 8 ) / 255, 14 ); |
| 187 | quantColor[1][B] = clamp( 1, 15 * ( currColor[1][B] + 8 ) / 255, 14 ); |
| 188 | } |
| 189 | |
// The following three decoding functions come from ETCPACK v2.74 and are slightly changed.
| 191 | static etcpak_force_inline void decompressColor( uint8_t( colorsRGB444 )[2][3], uint8_t( colors )[2][3] ) |
| 192 | { |
| 193 | // The color should be retrieved as: |
| 194 | // |
// c = round(255/(2^r_bits - 1)) * comp_color
| 196 | // |
| 197 | // This is similar to bit replication |
| 198 | // |
// Note -- this code only works for bit replication from 4 bits and up --- 3 bits needs
| 200 | // two copy operations. |
| 201 | colors[0][R] = ( colorsRGB444[0][R] << 4 ) | colorsRGB444[0][R]; |
| 202 | colors[0][G] = ( colorsRGB444[0][G] << 4 ) | colorsRGB444[0][G]; |
| 203 | colors[0][B] = ( colorsRGB444[0][B] << 4 ) | colorsRGB444[0][B]; |
| 204 | colors[1][R] = ( colorsRGB444[1][R] << 4 ) | colorsRGB444[1][R]; |
| 205 | colors[1][G] = ( colorsRGB444[1][G] << 4 ) | colorsRGB444[1][G]; |
| 206 | colors[1][B] = ( colorsRGB444[1][B] << 4 ) | colorsRGB444[1][B]; |
| 207 | } |
| 208 | |
| 209 | // calculates the paint colors from the block colors |
| 210 | // using a distance d and one of the H- or T-patterns. |
| 211 | static void calculatePaintColors59T( uint8_t d, uint8_t( colors )[2][3], uint8_t( pColors )[4][3] ) |
| 212 | { |
| 213 | ////////////////////////////////////////////// |
| 214 | // |
| 215 | // C3 C1 C4----C1---C2 |
| 216 | // | | | |
| 217 | // | | | |
| 218 | // |-------| | |
| 219 | // | | | |
| 220 | // | | | |
| 221 | // C4 C2 C3 |
| 222 | // |
| 223 | ////////////////////////////////////////////// |
| 224 | |
| 225 | // C4 |
| 226 | pColors[3][R] = clampMin( 0, colors[1][R] - tableTH[d] ); |
| 227 | pColors[3][G] = clampMin( 0, colors[1][G] - tableTH[d] ); |
| 228 | pColors[3][B] = clampMin( 0, colors[1][B] - tableTH[d] ); |
| 229 | |
| 230 | // C3 |
| 231 | pColors[0][R] = colors[0][R]; |
| 232 | pColors[0][G] = colors[0][G]; |
| 233 | pColors[0][B] = colors[0][B]; |
| 234 | // C2 |
| 235 | pColors[1][R] = clampMax( colors[1][R] + tableTH[d], 255 ); |
| 236 | pColors[1][G] = clampMax( colors[1][G] + tableTH[d], 255 ); |
| 237 | pColors[1][B] = clampMax( colors[1][B] + tableTH[d], 255 ); |
| 238 | // C1 |
| 239 | pColors[2][R] = colors[1][R]; |
| 240 | pColors[2][G] = colors[1][G]; |
| 241 | pColors[2][B] = colors[1][B]; |
| 242 | } |
| 243 | |
| 244 | static void calculatePaintColors58H( uint8_t d, uint8_t( colors )[2][3], uint8_t( pColors )[4][3] ) |
| 245 | { |
| 246 | pColors[3][R] = clampMin( 0, colors[1][R] - tableTH[d] ); |
| 247 | pColors[3][G] = clampMin( 0, colors[1][G] - tableTH[d] ); |
| 248 | pColors[3][B] = clampMin( 0, colors[1][B] - tableTH[d] ); |
| 249 | |
| 250 | // C1 |
| 251 | pColors[0][R] = clampMax( colors[0][R] + tableTH[d], 255 ); |
| 252 | pColors[0][G] = clampMax( colors[0][G] + tableTH[d], 255 ); |
| 253 | pColors[0][B] = clampMax( colors[0][B] + tableTH[d], 255 ); |
| 254 | // C2 |
| 255 | pColors[1][R] = clampMin( 0, colors[0][R] - tableTH[d] ); |
| 256 | pColors[1][G] = clampMin( 0, colors[0][G] - tableTH[d] ); |
| 257 | pColors[1][B] = clampMin( 0, colors[0][B] - tableTH[d] ); |
| 258 | // C3 |
| 259 | pColors[2][R] = clampMax( colors[1][R] + tableTH[d], 255 ); |
| 260 | pColors[2][G] = clampMax( colors[1][G] + tableTH[d], 255 ); |
| 261 | pColors[2][B] = clampMax( colors[1][B] + tableTH[d], 255 ); |
| 262 | } |
| 263 | |
| 264 | #if defined _MSC_VER && !defined __clang__ |
| 265 | static etcpak_force_inline unsigned long _bit_scan_forward( unsigned long mask ) |
| 266 | { |
| 267 | unsigned long ret; |
| 268 | _BitScanForward( &ret, mask ); |
| 269 | return ret; |
| 270 | } |
| 271 | #endif |
| 272 | |
| 273 | typedef std::array<uint16_t, 4> v4i; |
| 274 | |
| 275 | #ifdef __AVX2__ |
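// Sums the RGB channels of each 2x2 quadrant of the block and combines the quadrant sums into
// the four half-block sums (see the quadrant annotations below); each sum occupies one 64-bit
// quarter of the result as 16-bit lanes, with the masked-out alpha contributing zero.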
| 276 | static etcpak_force_inline __m256i Sum4_AVX2( const uint8_t* data) noexcept |
| 277 | { |
| 278 | __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0); |
| 279 | __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1); |
| 280 | __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2); |
| 281 | __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3); |
| 282 | |
| 283 | __m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF)); |
| 284 | __m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF)); |
| 285 | __m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF)); |
| 286 | __m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF)); |
| 287 | |
| 288 | __m256i t0 = _mm256_cvtepu8_epi16(dm0); |
| 289 | __m256i t1 = _mm256_cvtepu8_epi16(dm1); |
| 290 | __m256i t2 = _mm256_cvtepu8_epi16(dm2); |
| 291 | __m256i t3 = _mm256_cvtepu8_epi16(dm3); |
| 292 | |
| 293 | __m256i sum0 = _mm256_add_epi16(t0, t1); |
| 294 | __m256i sum1 = _mm256_add_epi16(t2, t3); |
| 295 | |
| 296 | __m256i s0 = _mm256_permute2x128_si256(sum0, sum1, (0) | (3 << 4)); // 0, 0, 3, 3 |
| 297 | __m256i s1 = _mm256_permute2x128_si256(sum0, sum1, (1) | (2 << 4)); // 1, 1, 2, 2 |
| 298 | |
| 299 | __m256i s2 = _mm256_permute4x64_epi64(s0, _MM_SHUFFLE(1, 3, 0, 2)); |
| 300 | __m256i s3 = _mm256_permute4x64_epi64(s0, _MM_SHUFFLE(0, 2, 1, 3)); |
| 301 | __m256i s4 = _mm256_permute4x64_epi64(s1, _MM_SHUFFLE(3, 1, 0, 2)); |
| 302 | __m256i s5 = _mm256_permute4x64_epi64(s1, _MM_SHUFFLE(2, 0, 1, 3)); |
| 303 | |
| 304 | __m256i sum5 = _mm256_add_epi16(s2, s3); // 3, 0, 3, 0 |
| 305 | __m256i sum6 = _mm256_add_epi16(s4, s5); // 2, 1, 1, 2 |
    return _mm256_add_epi16(sum5, sum6); // 3+2, 0+1, 3+1, 0+2
| 307 | } |
| 308 | |
| 309 | static etcpak_force_inline __m256i Average_AVX2( const __m256i data) noexcept |
| 310 | { |
| 311 | __m256i a = _mm256_add_epi16(data, _mm256_set1_epi16(4)); |
| 312 | |
| 313 | return _mm256_srli_epi16(a, 3); |
| 314 | } |
| 315 | |
| 316 | static etcpak_force_inline __m128i CalcErrorBlock_AVX2( const __m256i data, const v4i a[8]) noexcept |
| 317 | { |
| 318 | // |
| 319 | __m256i a0 = _mm256_load_si256((__m256i*)a[0].data()); |
| 320 | __m256i a1 = _mm256_load_si256((__m256i*)a[4].data()); |
| 321 | |
| 322 | // err = 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) ); |
| 323 | __m256i a4 = _mm256_madd_epi16(a0, a0); |
| 324 | __m256i a5 = _mm256_madd_epi16(a1, a1); |
| 325 | |
| 326 | __m256i a6 = _mm256_hadd_epi32(a4, a5); |
| 327 | __m256i a7 = _mm256_slli_epi32(a6, 3); |
| 328 | |
| 329 | __m256i a8 = _mm256_add_epi32(a7, _mm256_set1_epi32(0x3FFFFFFF)); // Big value to prevent negative values, but small enough to prevent overflow |
| 330 | |
| 331 | // average is not swapped |
| 332 | // err -= block[0] * 2 * average[0]; |
| 333 | // err -= block[1] * 2 * average[1]; |
| 334 | // err -= block[2] * 2 * average[2]; |
| 335 | __m256i a2 = _mm256_slli_epi16(a0, 1); |
| 336 | __m256i a3 = _mm256_slli_epi16(a1, 1); |
| 337 | __m256i b0 = _mm256_madd_epi16(a2, data); |
| 338 | __m256i b1 = _mm256_madd_epi16(a3, data); |
| 339 | |
| 340 | __m256i b2 = _mm256_hadd_epi32(b0, b1); |
| 341 | __m256i b3 = _mm256_sub_epi32(a8, b2); |
| 342 | __m256i b4 = _mm256_hadd_epi32(b3, b3); |
| 343 | |
| 344 | __m256i b5 = _mm256_permutevar8x32_epi32(b4, _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0)); |
| 345 | |
| 346 | return _mm256_castsi256_si128(b5); |
| 347 | } |
| 348 | |
| 349 | static etcpak_force_inline void ProcessAverages_AVX2(const __m256i d, v4i a[8] ) noexcept |
| 350 | { |
| 351 | __m256i t = _mm256_add_epi16(_mm256_mullo_epi16(d, _mm256_set1_epi16(31)), _mm256_set1_epi16(128)); |
| 352 | |
| 353 | __m256i c = _mm256_srli_epi16(_mm256_add_epi16(t, _mm256_srli_epi16(t, 8)), 8); |
| 354 | |
| 355 | __m256i c1 = _mm256_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2)); |
| 356 | __m256i diff = _mm256_sub_epi16(c, c1); |
| 357 | diff = _mm256_max_epi16(diff, _mm256_set1_epi16(-4)); |
| 358 | diff = _mm256_min_epi16(diff, _mm256_set1_epi16(3)); |
| 359 | |
| 360 | __m256i co = _mm256_add_epi16(c1, diff); |
| 361 | |
| 362 | c = _mm256_blend_epi16(co, c, 0xF0); |
| 363 | |
| 364 | __m256i a0 = _mm256_or_si256(_mm256_slli_epi16(c, 3), _mm256_srli_epi16(c, 2)); |
| 365 | |
| 366 | _mm256_store_si256((__m256i*)a[4].data(), a0); |
| 367 | |
| 368 | __m256i t0 = _mm256_add_epi16(_mm256_mullo_epi16(d, _mm256_set1_epi16(15)), _mm256_set1_epi16(128)); |
| 369 | __m256i t1 = _mm256_srli_epi16(_mm256_add_epi16(t0, _mm256_srli_epi16(t0, 8)), 8); |
| 370 | |
| 371 | __m256i t2 = _mm256_or_si256(t1, _mm256_slli_epi16(t1, 4)); |
| 372 | |
| 373 | _mm256_store_si256((__m256i*)a[0].data(), t2); |
| 374 | } |
| 375 | |
| 376 | static etcpak_force_inline uint64_t EncodeAverages_AVX2( const v4i a[8], size_t idx ) noexcept |
| 377 | { |
| 378 | uint64_t d = ( idx << 24 ); |
| 379 | size_t base = idx << 1; |
| 380 | |
| 381 | __m128i a0 = _mm_load_si128((const __m128i*)a[base].data()); |
| 382 | |
| 383 | __m128i r0, r1; |
| 384 | |
| 385 | if( ( idx & 0x2 ) == 0 ) |
| 386 | { |
| 387 | r0 = _mm_srli_epi16(a0, 4); |
| 388 | |
| 389 | __m128i a1 = _mm_unpackhi_epi64(r0, r0); |
| 390 | r1 = _mm_slli_epi16(a1, 4); |
| 391 | } |
| 392 | else |
| 393 | { |
| 394 | __m128i a1 = _mm_and_si128(a0, _mm_set1_epi16(-8)); |
| 395 | |
| 396 | r0 = _mm_unpackhi_epi64(a1, a1); |
| 397 | __m128i a2 = _mm_sub_epi16(a1, r0); |
| 398 | __m128i a3 = _mm_srai_epi16(a2, 3); |
| 399 | r1 = _mm_and_si128(a3, _mm_set1_epi16(0x07)); |
| 400 | } |
| 401 | |
| 402 | __m128i r2 = _mm_or_si128(r0, r1); |
| 403 | // do missing swap for average values |
| 404 | __m128i r3 = _mm_shufflelo_epi16(r2, _MM_SHUFFLE(3, 0, 1, 2)); |
| 405 | __m128i r4 = _mm_packus_epi16(r3, _mm_setzero_si128()); |
| 406 | d |= _mm_cvtsi128_si32(r4); |
| 407 | |
| 408 | return d; |
| 409 | } |
| 410 | |
| 411 | static etcpak_force_inline uint64_t CheckSolid_AVX2( const uint8_t* src ) noexcept |
| 412 | { |
| 413 | __m256i d0 = _mm256_loadu_si256(((__m256i*)src) + 0); |
| 414 | __m256i d1 = _mm256_loadu_si256(((__m256i*)src) + 1); |
| 415 | |
| 416 | __m256i c = _mm256_broadcastd_epi32(_mm256_castsi256_si128(d0)); |
| 417 | |
| 418 | __m256i c0 = _mm256_cmpeq_epi8(d0, c); |
| 419 | __m256i c1 = _mm256_cmpeq_epi8(d1, c); |
| 420 | |
| 421 | __m256i m = _mm256_and_si256(c0, c1); |
| 422 | |
| 423 | if (!_mm256_testc_si256(m, _mm256_set1_epi32(-1))) |
| 424 | { |
| 425 | return 0; |
| 426 | } |
| 427 | |
| 428 | return 0x02000000 | |
| 429 | ( (unsigned int)( src[0] & 0xF8 ) << 16 ) | |
| 430 | ( (unsigned int)( src[1] & 0xF8 ) << 8 ) | |
| 431 | ( (unsigned int)( src[2] & 0xF8 ) ); |
| 432 | } |
| 433 | |
| 434 | static etcpak_force_inline __m128i PrepareAverages_AVX2( v4i a[8], const uint8_t* src) noexcept |
| 435 | { |
| 436 | __m256i sum4 = Sum4_AVX2( src ); |
| 437 | |
| 438 | ProcessAverages_AVX2(Average_AVX2( sum4 ), a ); |
| 439 | |
| 440 | return CalcErrorBlock_AVX2( sum4, a); |
| 441 | } |
| 442 | |
| 443 | static etcpak_force_inline __m128i PrepareAverages_AVX2( v4i a[8], const __m256i sum4) noexcept |
| 444 | { |
| 445 | ProcessAverages_AVX2(Average_AVX2( sum4 ), a ); |
| 446 | |
| 447 | return CalcErrorBlock_AVX2( sum4, a); |
| 448 | } |
| 449 | |
| 450 | static etcpak_force_inline void FindBestFit_4x2_AVX2( uint32_t terr[2][8], uint32_t tsel[8], v4i a[8], const uint32_t offset, const uint8_t* data) noexcept |
| 451 | { |
| 452 | __m256i sel0 = _mm256_setzero_si256(); |
| 453 | __m256i sel1 = _mm256_setzero_si256(); |
| 454 | |
| 455 | for (unsigned int j = 0; j < 2; ++j) |
| 456 | { |
| 457 | unsigned int bid = offset + 1 - j; |
| 458 | |
| 459 | __m256i squareErrorSum = _mm256_setzero_si256(); |
| 460 | |
| 461 | __m128i a0 = _mm_loadl_epi64((const __m128i*)a[bid].data()); |
| 462 | __m256i a1 = _mm256_broadcastq_epi64(a0); |
| 463 | |
| 464 | // Processing one full row each iteration |
| 465 | for (size_t i = 0; i < 8; i += 4) |
| 466 | { |
| 467 | __m128i rgb = _mm_loadu_si128((const __m128i*)(data + i * 4)); |
| 468 | |
| 469 | __m256i rgb16 = _mm256_cvtepu8_epi16(rgb); |
| 470 | __m256i d = _mm256_sub_epi16(a1, rgb16); |
| 471 | |
| 472 | // The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16 |
            // This produces slightly different results, but is significantly faster
| 474 | __m256i pixel0 = _mm256_madd_epi16(d, _mm256_set_epi16(0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14)); |
| 475 | __m256i pixel1 = _mm256_packs_epi32(pixel0, pixel0); |
| 476 | __m256i pixel2 = _mm256_hadd_epi16(pixel1, pixel1); |
| 477 | __m128i pixel3 = _mm256_castsi256_si128(pixel2); |
| 478 | |
| 479 | __m128i pix0 = _mm_broadcastw_epi16(pixel3); |
| 480 | __m128i pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16)); |
| 481 | __m256i pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); |
| 482 | |
| 483 | // Processing first two pixels of the row |
| 484 | { |
| 485 | __m256i pix = _mm256_abs_epi16(pixel); |
| 486 | |
| 487 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
| 488 | // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. |
| 489 | __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0]))); |
| 490 | __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1]))); |
| 491 | |
| 492 | __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1)); |
| 493 | __m256i minError = _mm256_min_epi16(error0, error1); |
| 494 | |
                // Exploiting the symmetry of the selector table and using the sign bit
                // This produces slightly different results, but is significantly faster
| 497 | __m256i minIndex1 = _mm256_srli_epi16(pixel, 15); |
| 498 | |
| 499 | // Interleaving values so madd instruction can be used |
| 500 | __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0)); |
| 501 | __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2)); |
| 502 | |
| 503 | __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi); |
| 504 | // Squaring the minimum error to produce correct values when adding |
| 505 | __m256i squareError = _mm256_madd_epi16(minError2, minError2); |
| 506 | |
| 507 | squareErrorSum = _mm256_add_epi32(squareErrorSum, squareError); |
| 508 | |
| 509 | // Packing selector bits |
| 510 | __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i + j * 8)); |
| 511 | __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i + j * 8)); |
| 512 | |
| 513 | sel0 = _mm256_or_si256(sel0, minIndexLo2); |
| 514 | sel1 = _mm256_or_si256(sel1, minIndexHi2); |
| 515 | } |
| 516 | |
| 517 | pixel3 = _mm256_extracti128_si256(pixel2, 1); |
| 518 | pix0 = _mm_broadcastw_epi16(pixel3); |
| 519 | pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16)); |
| 520 | pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); |
| 521 | |
| 522 | // Processing second two pixels of the row |
| 523 | { |
| 524 | __m256i pix = _mm256_abs_epi16(pixel); |
| 525 | |
| 526 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
| 527 | // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. |
| 528 | __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0]))); |
| 529 | __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1]))); |
| 530 | |
| 531 | __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1)); |
| 532 | __m256i minError = _mm256_min_epi16(error0, error1); |
| 533 | |
                // Exploiting the symmetry of the selector table and using the sign bit
| 535 | __m256i minIndex1 = _mm256_srli_epi16(pixel, 15); |
| 536 | |
| 537 | // Interleaving values so madd instruction can be used |
| 538 | __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0)); |
| 539 | __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2)); |
| 540 | |
| 541 | __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi); |
| 542 | // Squaring the minimum error to produce correct values when adding |
| 543 | __m256i squareError = _mm256_madd_epi16(minError2, minError2); |
| 544 | |
| 545 | squareErrorSum = _mm256_add_epi32(squareErrorSum, squareError); |
| 546 | |
| 547 | // Packing selector bits |
| 548 | __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i + j * 8)); |
| 549 | __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i + j * 8)); |
| 550 | __m256i minIndexLo3 = _mm256_slli_epi16(minIndexLo2, 2); |
| 551 | __m256i minIndexHi3 = _mm256_slli_epi16(minIndexHi2, 2); |
| 552 | |
| 553 | sel0 = _mm256_or_si256(sel0, minIndexLo3); |
| 554 | sel1 = _mm256_or_si256(sel1, minIndexHi3); |
| 555 | } |
| 556 | } |
| 557 | |
| 558 | data += 8 * 4; |
| 559 | |
| 560 | _mm256_store_si256((__m256i*)terr[1 - j], squareErrorSum); |
| 561 | } |
| 562 | |
| 563 | // Interleave selector bits |
| 564 | __m256i minIndexLo0 = _mm256_unpacklo_epi16(sel0, sel1); |
| 565 | __m256i minIndexHi0 = _mm256_unpackhi_epi16(sel0, sel1); |
| 566 | |
| 567 | __m256i minIndexLo1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (0) | (2 << 4)); |
| 568 | __m256i minIndexHi1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (1) | (3 << 4)); |
| 569 | |
| 570 | __m256i minIndexHi2 = _mm256_slli_epi32(minIndexHi1, 1); |
| 571 | |
| 572 | __m256i sel = _mm256_or_si256(minIndexLo1, minIndexHi2); |
| 573 | |
| 574 | _mm256_store_si256((__m256i*)tsel, sel); |
| 575 | } |
| 576 | |
| 577 | static etcpak_force_inline void FindBestFit_2x4_AVX2( uint32_t terr[2][8], uint32_t tsel[8], v4i a[8], const uint32_t offset, const uint8_t* data) noexcept |
| 578 | { |
| 579 | __m256i sel0 = _mm256_setzero_si256(); |
| 580 | __m256i sel1 = _mm256_setzero_si256(); |
| 581 | |
| 582 | __m256i squareErrorSum0 = _mm256_setzero_si256(); |
| 583 | __m256i squareErrorSum1 = _mm256_setzero_si256(); |
| 584 | |
| 585 | __m128i a0 = _mm_loadl_epi64((const __m128i*)a[offset + 1].data()); |
| 586 | __m128i a1 = _mm_loadl_epi64((const __m128i*)a[offset + 0].data()); |
| 587 | |
| 588 | __m128i a2 = _mm_broadcastq_epi64(a0); |
| 589 | __m128i a3 = _mm_broadcastq_epi64(a1); |
| 590 | __m256i a4 = _mm256_insertf128_si256(_mm256_castsi128_si256(a2), a3, 1); |
| 591 | |
| 592 | // Processing one full row each iteration |
| 593 | for (size_t i = 0; i < 16; i += 4) |
| 594 | { |
| 595 | __m128i rgb = _mm_loadu_si128((const __m128i*)(data + i * 4)); |
| 596 | |
| 597 | __m256i rgb16 = _mm256_cvtepu8_epi16(rgb); |
| 598 | __m256i d = _mm256_sub_epi16(a4, rgb16); |
| 599 | |
| 600 | // The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16 |
        // This produces slightly different results, but is significantly faster
| 602 | __m256i pixel0 = _mm256_madd_epi16(d, _mm256_set_epi16(0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14)); |
| 603 | __m256i pixel1 = _mm256_packs_epi32(pixel0, pixel0); |
| 604 | __m256i pixel2 = _mm256_hadd_epi16(pixel1, pixel1); |
| 605 | __m128i pixel3 = _mm256_castsi256_si128(pixel2); |
| 606 | |
| 607 | __m128i pix0 = _mm_broadcastw_epi16(pixel3); |
| 608 | __m128i pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16)); |
| 609 | __m256i pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); |
| 610 | |
| 611 | // Processing first two pixels of the row |
| 612 | { |
| 613 | __m256i pix = _mm256_abs_epi16(pixel); |
| 614 | |
| 615 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
| 616 | // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. |
| 617 | __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0]))); |
| 618 | __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1]))); |
| 619 | |
| 620 | __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1)); |
| 621 | __m256i minError = _mm256_min_epi16(error0, error1); |
| 622 | |
            // Exploiting the symmetry of the selector table and using the sign bit
| 624 | __m256i minIndex1 = _mm256_srli_epi16(pixel, 15); |
| 625 | |
| 626 | // Interleaving values so madd instruction can be used |
| 627 | __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0)); |
| 628 | __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2)); |
| 629 | |
| 630 | __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi); |
| 631 | // Squaring the minimum error to produce correct values when adding |
| 632 | __m256i squareError = _mm256_madd_epi16(minError2, minError2); |
| 633 | |
| 634 | squareErrorSum0 = _mm256_add_epi32(squareErrorSum0, squareError); |
| 635 | |
| 636 | // Packing selector bits |
| 637 | __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i)); |
| 638 | __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i)); |
| 639 | |
| 640 | sel0 = _mm256_or_si256(sel0, minIndexLo2); |
| 641 | sel1 = _mm256_or_si256(sel1, minIndexHi2); |
| 642 | } |
| 643 | |
| 644 | pixel3 = _mm256_extracti128_si256(pixel2, 1); |
| 645 | pix0 = _mm_broadcastw_epi16(pixel3); |
| 646 | pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16)); |
| 647 | pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); |
| 648 | |
| 649 | // Processing second two pixels of the row |
| 650 | { |
| 651 | __m256i pix = _mm256_abs_epi16(pixel); |
| 652 | |
| 653 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
| 654 | // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. |
| 655 | __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0]))); |
| 656 | __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1]))); |
| 657 | |
| 658 | __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1)); |
| 659 | __m256i minError = _mm256_min_epi16(error0, error1); |
| 660 | |
            // Exploiting the symmetry of the selector table and using the sign bit
| 662 | __m256i minIndex1 = _mm256_srli_epi16(pixel, 15); |
| 663 | |
| 664 | // Interleaving values so madd instruction can be used |
| 665 | __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0)); |
| 666 | __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2)); |
| 667 | |
| 668 | __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi); |
| 669 | // Squaring the minimum error to produce correct values when adding |
| 670 | __m256i squareError = _mm256_madd_epi16(minError2, minError2); |
| 671 | |
| 672 | squareErrorSum1 = _mm256_add_epi32(squareErrorSum1, squareError); |
| 673 | |
| 674 | // Packing selector bits |
| 675 | __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i)); |
| 676 | __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i)); |
| 677 | __m256i minIndexLo3 = _mm256_slli_epi16(minIndexLo2, 2); |
| 678 | __m256i minIndexHi3 = _mm256_slli_epi16(minIndexHi2, 2); |
| 679 | |
| 680 | sel0 = _mm256_or_si256(sel0, minIndexLo3); |
| 681 | sel1 = _mm256_or_si256(sel1, minIndexHi3); |
| 682 | } |
| 683 | } |
| 684 | |
| 685 | _mm256_store_si256((__m256i*)terr[1], squareErrorSum0); |
| 686 | _mm256_store_si256((__m256i*)terr[0], squareErrorSum1); |
| 687 | |
| 688 | // Interleave selector bits |
| 689 | __m256i minIndexLo0 = _mm256_unpacklo_epi16(sel0, sel1); |
| 690 | __m256i minIndexHi0 = _mm256_unpackhi_epi16(sel0, sel1); |
| 691 | |
| 692 | __m256i minIndexLo1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (0) | (2 << 4)); |
| 693 | __m256i minIndexHi1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (1) | (3 << 4)); |
| 694 | |
| 695 | __m256i minIndexHi2 = _mm256_slli_epi32(minIndexHi1, 1); |
| 696 | |
| 697 | __m256i sel = _mm256_or_si256(minIndexLo1, minIndexHi2); |
| 698 | |
| 699 | _mm256_store_si256((__m256i*)tsel, sel); |
| 700 | } |
| 701 | |
| 702 | static etcpak_force_inline uint64_t EncodeSelectors_AVX2( uint64_t d, const uint32_t terr[2][8], const uint32_t tsel[8], const bool rotate) noexcept |
| 703 | { |
| 704 | size_t tidx[2]; |
| 705 | |
| 706 | // Get index of minimum error (terr[0] and terr[1]) |
| 707 | __m256i err0 = _mm256_load_si256((const __m256i*)terr[0]); |
| 708 | __m256i err1 = _mm256_load_si256((const __m256i*)terr[1]); |
| 709 | |
| 710 | __m256i errLo = _mm256_permute2x128_si256(err0, err1, (0) | (2 << 4)); |
| 711 | __m256i errHi = _mm256_permute2x128_si256(err0, err1, (1) | (3 << 4)); |
| 712 | |
| 713 | __m256i errMin0 = _mm256_min_epu32(errLo, errHi); |
| 714 | |
| 715 | __m256i errMin1 = _mm256_shuffle_epi32(errMin0, _MM_SHUFFLE(2, 3, 0, 1)); |
| 716 | __m256i errMin2 = _mm256_min_epu32(errMin0, errMin1); |
| 717 | |
| 718 | __m256i errMin3 = _mm256_shuffle_epi32(errMin2, _MM_SHUFFLE(1, 0, 3, 2)); |
| 719 | __m256i errMin4 = _mm256_min_epu32(errMin3, errMin2); |
| 720 | |
| 721 | __m256i errMin5 = _mm256_permute2x128_si256(errMin4, errMin4, (0) | (0 << 4)); |
| 722 | __m256i errMin6 = _mm256_permute2x128_si256(errMin4, errMin4, (1) | (1 << 4)); |
| 723 | |
| 724 | __m256i errMask0 = _mm256_cmpeq_epi32(errMin5, err0); |
| 725 | __m256i errMask1 = _mm256_cmpeq_epi32(errMin6, err1); |
| 726 | |
| 727 | uint32_t mask0 = _mm256_movemask_epi8(errMask0); |
| 728 | uint32_t mask1 = _mm256_movemask_epi8(errMask1); |
| 729 | |
| 730 | tidx[0] = _bit_scan_forward(mask0) >> 2; |
| 731 | tidx[1] = _bit_scan_forward(mask1) >> 2; |
| 732 | |
| 733 | d |= tidx[0] << 26; |
| 734 | d |= tidx[1] << 29; |
| 735 | |
| 736 | unsigned int t0 = tsel[tidx[0]]; |
| 737 | unsigned int t1 = tsel[tidx[1]]; |
| 738 | |
| 739 | if (!rotate) |
| 740 | { |
| 741 | t0 &= 0xFF00FF00; |
| 742 | t1 &= 0x00FF00FF; |
| 743 | } |
| 744 | else |
| 745 | { |
| 746 | t0 &= 0xCCCCCCCC; |
| 747 | t1 &= 0x33333333; |
| 748 | } |
| 749 | |
| 750 | // Flip selectors from sign bit |
| 751 | unsigned int t2 = (t0 | t1) ^ 0xFFFF0000; |
| 752 | |
| 753 | return d | static_cast<uint64_t>(_bswap(t2)) << 32; |
| 754 | } |
| 755 | |
| 756 | static etcpak_force_inline __m128i r6g7b6_AVX2(__m128 cof, __m128 chf, __m128 cvf) noexcept |
| 757 | { |
| 758 | __m128i co = _mm_cvttps_epi32(cof); |
| 759 | __m128i ch = _mm_cvttps_epi32(chf); |
| 760 | __m128i cv = _mm_cvttps_epi32(cvf); |
| 761 | |
| 762 | __m128i coh = _mm_packus_epi32(co, ch); |
| 763 | __m128i cv0 = _mm_packus_epi32(cv, _mm_setzero_si128()); |
| 764 | |
| 765 | __m256i cohv0 = _mm256_inserti128_si256(_mm256_castsi128_si256(coh), cv0, 1); |
| 766 | __m256i cohv1 = _mm256_min_epu16(cohv0, _mm256_set1_epi16(1023)); |
| 767 | |
| 768 | __m256i cohv2 = _mm256_sub_epi16(cohv1, _mm256_set1_epi16(15)); |
| 769 | __m256i cohv3 = _mm256_srai_epi16(cohv2, 1); |
| 770 | |
| 771 | __m256i cohvrb0 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(11)); |
| 772 | __m256i cohvrb1 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(4)); |
| 773 | __m256i cohvg0 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(9)); |
| 774 | __m256i cohvg1 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(6)); |
| 775 | |
| 776 | __m256i cohvrb2 = _mm256_srai_epi16(cohvrb0, 7); |
| 777 | __m256i cohvrb3 = _mm256_srai_epi16(cohvrb1, 7); |
| 778 | __m256i cohvg2 = _mm256_srai_epi16(cohvg0, 8); |
| 779 | __m256i cohvg3 = _mm256_srai_epi16(cohvg1, 8); |
| 780 | |
| 781 | __m256i cohvrb4 = _mm256_sub_epi16(cohvrb0, cohvrb2); |
| 782 | __m256i cohvrb5 = _mm256_sub_epi16(cohvrb4, cohvrb3); |
| 783 | __m256i cohvg4 = _mm256_sub_epi16(cohvg0, cohvg2); |
| 784 | __m256i cohvg5 = _mm256_sub_epi16(cohvg4, cohvg3); |
| 785 | |
| 786 | __m256i cohvrb6 = _mm256_srai_epi16(cohvrb5, 3); |
| 787 | __m256i cohvg6 = _mm256_srai_epi16(cohvg5, 2); |
| 788 | |
| 789 | __m256i cohv4 = _mm256_blend_epi16(cohvg6, cohvrb6, 0x55); |
| 790 | |
| 791 | __m128i cohv5 = _mm_packus_epi16(_mm256_castsi256_si128(cohv4), _mm256_extracti128_si256(cohv4, 1)); |
| 792 | return _mm_shuffle_epi8(cohv5, _mm_setr_epi8(6, 5, 4, -1, 2, 1, 0, -1, 10, 9, 8, -1, -1, -1, -1, -1)); |
| 793 | } |
| 794 | |
| 795 | static etcpak_force_inline Plane Planar_AVX2( const Channels& ch, uint8_t& mode, bool useHeuristics ) |
| 796 | { |
| 797 | __m128i t0 = _mm_sad_epu8( ch.r8, _mm_setzero_si128() ); |
| 798 | __m128i t1 = _mm_sad_epu8( ch.g8, _mm_setzero_si128() ); |
| 799 | __m128i t2 = _mm_sad_epu8( ch.b8, _mm_setzero_si128() ); |
| 800 | |
| 801 | __m128i r8s = _mm_shuffle_epi8( ch.r8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) ); |
| 802 | __m128i g8s = _mm_shuffle_epi8( ch.g8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) ); |
| 803 | __m128i b8s = _mm_shuffle_epi8( ch.b8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) ); |
| 804 | |
| 805 | __m128i s0 = _mm_sad_epu8( r8s, _mm_setzero_si128() ); |
| 806 | __m128i s1 = _mm_sad_epu8( g8s, _mm_setzero_si128() ); |
| 807 | __m128i s2 = _mm_sad_epu8( b8s, _mm_setzero_si128() ); |
| 808 | |
| 809 | __m256i sr0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t0 ), s0, 1 ); |
| 810 | __m256i sg0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t1 ), s1, 1 ); |
| 811 | __m256i sb0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t2 ), s2, 1 ); |
| 812 | |
| 813 | __m256i sr1 = _mm256_slli_epi64( sr0, 32 ); |
| 814 | __m256i sg1 = _mm256_slli_epi64( sg0, 16 ); |
| 815 | |
| 816 | __m256i srb = _mm256_or_si256( sr1, sb0 ); |
| 817 | __m256i srgb = _mm256_or_si256( srb, sg1 ); |
| 818 | |
| 819 | if( mode != ModePlanar && useHeuristics ) |
| 820 | { |
| 821 | Plane plane; |
| 822 | plane.sum4 = _mm256_permute4x64_epi64( srgb, _MM_SHUFFLE( 2, 3, 0, 1 ) ); |
| 823 | return plane; |
| 824 | } |
| 825 | |
| 826 | __m128i t3 = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( t0 ), _mm_castsi128_ps( t1 ), _MM_SHUFFLE( 2, 0, 2, 0 ) ) ); |
| 827 | __m128i t4 = _mm_shuffle_epi32( t2, _MM_SHUFFLE( 3, 1, 2, 0 ) ); |
| 828 | __m128i t5 = _mm_hadd_epi32( t3, t4 ); |
| 829 | __m128i t6 = _mm_shuffle_epi32( t5, _MM_SHUFFLE( 1, 1, 1, 1 ) ); |
| 830 | __m128i t7 = _mm_shuffle_epi32( t5, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
| 831 | |
| 832 | __m256i sr = _mm256_broadcastw_epi16( t5 ); |
| 833 | __m256i sg = _mm256_broadcastw_epi16( t6 ); |
| 834 | __m256i sb = _mm256_broadcastw_epi16( t7 ); |
| 835 | |
| 836 | __m256i r08 = _mm256_cvtepu8_epi16( ch.r8 ); |
| 837 | __m256i g08 = _mm256_cvtepu8_epi16( ch.g8 ); |
| 838 | __m256i b08 = _mm256_cvtepu8_epi16( ch.b8 ); |
| 839 | |
| 840 | __m256i r16 = _mm256_slli_epi16( r08, 4 ); |
| 841 | __m256i g16 = _mm256_slli_epi16( g08, 4 ); |
| 842 | __m256i b16 = _mm256_slli_epi16( b08, 4 ); |
| 843 | |
| 844 | __m256i difR0 = _mm256_sub_epi16( r16, sr ); |
| 845 | __m256i difG0 = _mm256_sub_epi16( g16, sg ); |
| 846 | __m256i difB0 = _mm256_sub_epi16( b16, sb ); |
| 847 | |
| 848 | __m256i difRyz = _mm256_madd_epi16( difR0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) ); |
| 849 | __m256i difGyz = _mm256_madd_epi16( difG0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) ); |
| 850 | __m256i difByz = _mm256_madd_epi16( difB0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) ); |
| 851 | |
| 852 | __m256i difRxz = _mm256_madd_epi16( difR0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) ); |
| 853 | __m256i difGxz = _mm256_madd_epi16( difG0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) ); |
| 854 | __m256i difBxz = _mm256_madd_epi16( difB0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) ); |
| 855 | |
| 856 | __m256i difRGyz = _mm256_hadd_epi32( difRyz, difGyz ); |
| 857 | __m256i difByzxz = _mm256_hadd_epi32( difByz, difBxz ); |
| 858 | |
| 859 | __m256i difRGxz = _mm256_hadd_epi32( difRxz, difGxz ); |
| 860 | |
| 861 | __m128i sumRGyz = _mm_add_epi32( _mm256_castsi256_si128( difRGyz ), _mm256_extracti128_si256( difRGyz, 1 ) ); |
| 862 | __m128i sumByzxz = _mm_add_epi32( _mm256_castsi256_si128( difByzxz ), _mm256_extracti128_si256( difByzxz, 1 ) ); |
| 863 | __m128i sumRGxz = _mm_add_epi32( _mm256_castsi256_si128( difRGxz ), _mm256_extracti128_si256( difRGxz, 1 ) ); |
| 864 | |
| 865 | __m128i sumRGByz = _mm_hadd_epi32( sumRGyz, sumByzxz ); |
| 866 | __m128i sumRGByzxz = _mm_hadd_epi32( sumRGxz, sumByzxz ); |
| 867 | |
| 868 | __m128i sumRGBxz = _mm_shuffle_epi32( sumRGByzxz, _MM_SHUFFLE( 2, 3, 1, 0 ) ); |
| 869 | |
| 870 | __m128 sumRGByzf = _mm_cvtepi32_ps( sumRGByz ); |
| 871 | __m128 sumRGBxzf = _mm_cvtepi32_ps( sumRGBxz ); |
| 872 | |
| 873 | const float value = ( 255 * 255 * 8.0f + 85 * 85 * 8.0f ) * 16.0f; |
| 874 | |
| 875 | __m128 scale = _mm_set1_ps( -4.0f / value ); |
| 876 | |
| 877 | __m128 af = _mm_mul_ps( sumRGBxzf, scale ); |
| 878 | __m128 bf = _mm_mul_ps( sumRGByzf, scale ); |
| 879 | |
| 880 | __m128 df = _mm_mul_ps( _mm_cvtepi32_ps( t5 ), _mm_set1_ps( 4.0f / 16.0f ) ); |
| 881 | |
| 882 | // calculating the three colors RGBO, RGBH, and RGBV. RGB = df - af * x - bf * y; |
| 883 | __m128 cof0 = _mm_fnmadd_ps( af, _mm_set1_ps( -255.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( -255.0f ), df ) ); |
| 884 | __m128 chf0 = _mm_fnmadd_ps( af, _mm_set1_ps( 425.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( -255.0f ), df ) ); |
| 885 | __m128 cvf0 = _mm_fnmadd_ps( af, _mm_set1_ps( -255.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( 425.0f ), df ) ); |
| 886 | |
| 887 | // convert to r6g7b6 |
| 888 | __m128i cohv = r6g7b6_AVX2( cof0, chf0, cvf0 ); |
| 889 | |
| 890 | uint64_t rgbho = _mm_extract_epi64( cohv, 0 ); |
| 891 | uint32_t rgbv0 = _mm_extract_epi32( cohv, 2 ); |
| 892 | |
| 893 | // Error calculation |
| 894 | uint64_t error = 0; |
| 895 | if( !useHeuristics ) |
| 896 | { |
| 897 | auto ro0 = ( rgbho >> 48 ) & 0x3F; |
| 898 | auto go0 = ( rgbho >> 40 ) & 0x7F; |
| 899 | auto bo0 = ( rgbho >> 32 ) & 0x3F; |
| 900 | auto ro1 = ( ro0 >> 4 ) | ( ro0 << 2 ); |
| 901 | auto go1 = ( go0 >> 6 ) | ( go0 << 1 ); |
| 902 | auto bo1 = ( bo0 >> 4 ) | ( bo0 << 2 ); |
| 903 | auto ro2 = ( ro1 << 2 ) + 2; |
| 904 | auto go2 = ( go1 << 2 ) + 2; |
| 905 | auto bo2 = ( bo1 << 2 ) + 2; |
| 906 | |
| 907 | __m256i ro3 = _mm256_set1_epi16( ro2 ); |
| 908 | __m256i go3 = _mm256_set1_epi16( go2 ); |
| 909 | __m256i bo3 = _mm256_set1_epi16( bo2 ); |
| 910 | |
| 911 | auto rh0 = ( rgbho >> 16 ) & 0x3F; |
| 912 | auto gh0 = ( rgbho >> 8 ) & 0x7F; |
| 913 | auto bh0 = ( rgbho >> 0 ) & 0x3F; |
| 914 | auto rh1 = ( rh0 >> 4 ) | ( rh0 << 2 ); |
| 915 | auto gh1 = ( gh0 >> 6 ) | ( gh0 << 1 ); |
| 916 | auto bh1 = ( bh0 >> 4 ) | ( bh0 << 2 ); |
| 917 | |
| 918 | auto rh2 = rh1 - ro1; |
| 919 | auto gh2 = gh1 - go1; |
| 920 | auto bh2 = bh1 - bo1; |
| 921 | |
| 922 | __m256i rh3 = _mm256_set1_epi16( rh2 ); |
| 923 | __m256i gh3 = _mm256_set1_epi16( gh2 ); |
| 924 | __m256i bh3 = _mm256_set1_epi16( bh2 ); |
| 925 | |
| 926 | auto rv0 = ( rgbv0 >> 16 ) & 0x3F; |
| 927 | auto gv0 = ( rgbv0 >> 8 ) & 0x7F; |
| 928 | auto bv0 = ( rgbv0 >> 0 ) & 0x3F; |
| 929 | auto rv1 = ( rv0 >> 4 ) | ( rv0 << 2 ); |
| 930 | auto gv1 = ( gv0 >> 6 ) | ( gv0 << 1 ); |
| 931 | auto bv1 = ( bv0 >> 4 ) | ( bv0 << 2 ); |
| 932 | |
| 933 | auto rv2 = rv1 - ro1; |
| 934 | auto gv2 = gv1 - go1; |
| 935 | auto bv2 = bv1 - bo1; |
| 936 | |
| 937 | __m256i rv3 = _mm256_set1_epi16( rv2 ); |
| 938 | __m256i gv3 = _mm256_set1_epi16( gv2 ); |
| 939 | __m256i bv3 = _mm256_set1_epi16( bv2 ); |
| 940 | |
| 941 | __m256i x = _mm256_set_epi16( 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 ); |
| 942 | |
| 943 | __m256i rh4 = _mm256_mullo_epi16( rh3, x ); |
| 944 | __m256i gh4 = _mm256_mullo_epi16( gh3, x ); |
| 945 | __m256i bh4 = _mm256_mullo_epi16( bh3, x ); |
| 946 | |
| 947 | __m256i y = _mm256_set_epi16( 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0 ); |
| 948 | |
| 949 | __m256i rv4 = _mm256_mullo_epi16( rv3, y ); |
| 950 | __m256i gv4 = _mm256_mullo_epi16( gv3, y ); |
| 951 | __m256i bv4 = _mm256_mullo_epi16( bv3, y ); |
| 952 | |
| 953 | __m256i rxy = _mm256_add_epi16( rh4, rv4 ); |
| 954 | __m256i gxy = _mm256_add_epi16( gh4, gv4 ); |
| 955 | __m256i bxy = _mm256_add_epi16( bh4, bv4 ); |
| 956 | |
| 957 | __m256i rp0 = _mm256_add_epi16( rxy, ro3 ); |
| 958 | __m256i gp0 = _mm256_add_epi16( gxy, go3 ); |
| 959 | __m256i bp0 = _mm256_add_epi16( bxy, bo3 ); |
| 960 | |
| 961 | __m256i rp1 = _mm256_srai_epi16( rp0, 2 ); |
| 962 | __m256i gp1 = _mm256_srai_epi16( gp0, 2 ); |
| 963 | __m256i bp1 = _mm256_srai_epi16( bp0, 2 ); |
| 964 | |
| 965 | __m256i rp2 = _mm256_max_epi16( _mm256_min_epi16( rp1, _mm256_set1_epi16( 255 ) ), _mm256_setzero_si256() ); |
| 966 | __m256i gp2 = _mm256_max_epi16( _mm256_min_epi16( gp1, _mm256_set1_epi16( 255 ) ), _mm256_setzero_si256() ); |
| 967 | __m256i bp2 = _mm256_max_epi16( _mm256_min_epi16( bp1, _mm256_set1_epi16( 255 ) ), _mm256_setzero_si256() ); |
| 968 | |
| 969 | __m256i rdif = _mm256_sub_epi16( r08, rp2 ); |
| 970 | __m256i gdif = _mm256_sub_epi16( g08, gp2 ); |
| 971 | __m256i bdif = _mm256_sub_epi16( b08, bp2 ); |
| 972 | |
| 973 | __m256i rerr = _mm256_mullo_epi16( rdif, _mm256_set1_epi16( 38 ) ); |
| 974 | __m256i gerr = _mm256_mullo_epi16( gdif, _mm256_set1_epi16( 76 ) ); |
| 975 | __m256i berr = _mm256_mullo_epi16( bdif, _mm256_set1_epi16( 14 ) ); |
| 976 | |
| 977 | __m256i sum0 = _mm256_add_epi16( rerr, gerr ); |
| 978 | __m256i sum1 = _mm256_add_epi16( sum0, berr ); |
| 979 | |
| 980 | __m256i sum2 = _mm256_madd_epi16( sum1, sum1 ); |
| 981 | |
| 982 | __m128i sum3 = _mm_add_epi32( _mm256_castsi256_si128( sum2 ), _mm256_extracti128_si256( sum2, 1 ) ); |
| 983 | |
| 984 | uint32_t err0 = _mm_extract_epi32( sum3, 0 ); |
| 985 | uint32_t err1 = _mm_extract_epi32( sum3, 1 ); |
| 986 | uint32_t err2 = _mm_extract_epi32( sum3, 2 ); |
| 987 | uint32_t err3 = _mm_extract_epi32( sum3, 3 ); |
| 988 | |
| 989 | error = err0 + err1 + err2 + err3; |
| 990 | } |
| 991 | /**/ |
| 992 | |
| 993 | uint32_t rgbv = ( rgbv0 & 0x3F ) | ( ( rgbv0 >> 2 ) & 0x1FC0 ) | ( ( rgbv0 >> 3 ) & 0x7E000 ); |
| 994 | uint64_t rgbho0_ = ( rgbho & 0x3F0000003F ) | ( ( rgbho >> 2 ) & 0x1FC000001FC0 ) | ( ( rgbho >> 3 ) & 0x7E0000007E000 ); |
| 995 | uint64_t rgbho0 = ( rgbho0_ & 0x7FFFF ) | ( ( rgbho0_ >> 13 ) & 0x3FFFF80000 ); |
| 996 | |
| 997 | uint32_t hi = rgbv | ((rgbho0 & 0x1FFF) << 19); |
| 998 | rgbho0 >>= 13; |
| 999 | uint32_t lo = ( rgbho0 & 0x1 ) | ( ( rgbho0 & 0x1FE ) << 1 ) | ( ( rgbho0 & 0x600 ) << 2 ) | ( ( rgbho0 & 0x3F800 ) << 5 ) | ( ( rgbho0 & 0x1FC0000 ) << 6 ); |
| 1000 | |
| 1001 | uint32_t idx = ( ( rgbho >> 33 ) & 0xF ) | ( ( rgbho >> 41 ) & 0x10 ) | ( ( rgbho >> 48 ) & 0x20 ); |
| 1002 | lo |= g_flags[idx]; |
| 1003 | uint64_t result = static_cast<uint32_t>(_bswap(lo)); |
| 1004 | result |= static_cast<uint64_t>(static_cast<uint32_t>(_bswap(hi))) << 32; |
| 1005 | |
| 1006 | Plane plane; |
| 1007 | |
| 1008 | plane.plane = result; |
| 1009 | if( useHeuristics ) |
| 1010 | { |
| 1011 | plane.error = 0; |
| 1012 | mode = ModePlanar; |
| 1013 | } |
| 1014 | else |
| 1015 | { |
| 1016 | plane.error = error; |
| 1017 | } |
| 1018 | plane.sum4 = _mm256_permute4x64_epi64(srgb, _MM_SHUFFLE(2, 3, 0, 1)); |
| 1019 | |
| 1020 | return plane; |
| 1021 | } |
| 1022 | |
| 1023 | static etcpak_force_inline uint64_t EncodeSelectors_AVX2( uint64_t d, const uint32_t terr[2][8], const uint32_t tsel[8], const bool rotate, const uint64_t value, const uint32_t error) noexcept |
| 1024 | { |
| 1025 | size_t tidx[2]; |
| 1026 | |
| 1027 | // Get index of minimum error (terr[0] and terr[1]) |
| 1028 | __m256i err0 = _mm256_load_si256((const __m256i*)terr[0]); |
| 1029 | __m256i err1 = _mm256_load_si256((const __m256i*)terr[1]); |
| 1030 | |
| 1031 | __m256i errLo = _mm256_permute2x128_si256(err0, err1, (0) | (2 << 4)); |
| 1032 | __m256i errHi = _mm256_permute2x128_si256(err0, err1, (1) | (3 << 4)); |
| 1033 | |
| 1034 | __m256i errMin0 = _mm256_min_epu32(errLo, errHi); |
| 1035 | |
| 1036 | __m256i errMin1 = _mm256_shuffle_epi32(errMin0, _MM_SHUFFLE(2, 3, 0, 1)); |
| 1037 | __m256i errMin2 = _mm256_min_epu32(errMin0, errMin1); |
| 1038 | |
| 1039 | __m256i errMin3 = _mm256_shuffle_epi32(errMin2, _MM_SHUFFLE(1, 0, 3, 2)); |
| 1040 | __m256i errMin4 = _mm256_min_epu32(errMin3, errMin2); |
| 1041 | |
| 1042 | __m256i errMin5 = _mm256_permute2x128_si256(errMin4, errMin4, (0) | (0 << 4)); |
| 1043 | __m256i errMin6 = _mm256_permute2x128_si256(errMin4, errMin4, (1) | (1 << 4)); |
| 1044 | |
| 1045 | __m256i errMask0 = _mm256_cmpeq_epi32(errMin5, err0); |
| 1046 | __m256i errMask1 = _mm256_cmpeq_epi32(errMin6, err1); |
| 1047 | |
| 1048 | uint32_t mask0 = _mm256_movemask_epi8(errMask0); |
| 1049 | uint32_t mask1 = _mm256_movemask_epi8(errMask1); |
| 1050 | |
| 1051 | tidx[0] = _bit_scan_forward(mask0) >> 2; |
| 1052 | tidx[1] = _bit_scan_forward(mask1) >> 2; |
| 1053 | |
| 1054 | if ((terr[0][tidx[0]] + terr[1][tidx[1]]) >= error) |
| 1055 | { |
| 1056 | return value; |
| 1057 | } |
| 1058 | |
| 1059 | d |= tidx[0] << 26; |
| 1060 | d |= tidx[1] << 29; |
| 1061 | |
| 1062 | unsigned int t0 = tsel[tidx[0]]; |
| 1063 | unsigned int t1 = tsel[tidx[1]]; |
| 1064 | |
| 1065 | if (!rotate) |
| 1066 | { |
| 1067 | t0 &= 0xFF00FF00; |
| 1068 | t1 &= 0x00FF00FF; |
| 1069 | } |
| 1070 | else |
| 1071 | { |
| 1072 | t0 &= 0xCCCCCCCC; |
| 1073 | t1 &= 0x33333333; |
| 1074 | } |
| 1075 | |
| 1076 | // Flip selectors from sign bit |
| 1077 | unsigned int t2 = (t0 | t1) ^ 0xFFFF0000; |
| 1078 | |
| 1079 | return d | static_cast<uint64_t>(_bswap(t2)) << 32; |
| 1080 | } |
| 1081 | |
| 1082 | #endif |
| 1083 | |
| 1084 | static etcpak_force_inline void Average( const uint8_t* data, v4i* a ) |
| 1085 | { |
| 1086 | #ifdef __SSE4_1__ |
| 1087 | __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0); |
| 1088 | __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1); |
| 1089 | __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2); |
| 1090 | __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3); |
| 1091 | |
| 1092 | __m128i d0l = _mm_unpacklo_epi8(d0, _mm_setzero_si128()); |
| 1093 | __m128i d0h = _mm_unpackhi_epi8(d0, _mm_setzero_si128()); |
| 1094 | __m128i d1l = _mm_unpacklo_epi8(d1, _mm_setzero_si128()); |
| 1095 | __m128i d1h = _mm_unpackhi_epi8(d1, _mm_setzero_si128()); |
| 1096 | __m128i d2l = _mm_unpacklo_epi8(d2, _mm_setzero_si128()); |
| 1097 | __m128i d2h = _mm_unpackhi_epi8(d2, _mm_setzero_si128()); |
| 1098 | __m128i d3l = _mm_unpacklo_epi8(d3, _mm_setzero_si128()); |
| 1099 | __m128i d3h = _mm_unpackhi_epi8(d3, _mm_setzero_si128()); |
| 1100 | |
| 1101 | __m128i sum0 = _mm_add_epi16(d0l, d1l); |
| 1102 | __m128i sum1 = _mm_add_epi16(d0h, d1h); |
| 1103 | __m128i sum2 = _mm_add_epi16(d2l, d3l); |
| 1104 | __m128i sum3 = _mm_add_epi16(d2h, d3h); |
| 1105 | |
| 1106 | __m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128()); |
| 1107 | __m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128()); |
| 1108 | __m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128()); |
| 1109 | __m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128()); |
| 1110 | __m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128()); |
| 1111 | __m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128()); |
| 1112 | __m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128()); |
| 1113 | __m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128()); |
| 1114 | |
| 1115 | __m128i b0 = _mm_add_epi32(sum0l, sum0h); |
| 1116 | __m128i b1 = _mm_add_epi32(sum1l, sum1h); |
| 1117 | __m128i b2 = _mm_add_epi32(sum2l, sum2h); |
| 1118 | __m128i b3 = _mm_add_epi32(sum3l, sum3h); |
| 1119 | |
| 1120 | __m128i a0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b2, b3), _mm_set1_epi32(4)), 3); |
| 1121 | __m128i a1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b1), _mm_set1_epi32(4)), 3); |
| 1122 | __m128i a2 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b1, b3), _mm_set1_epi32(4)), 3); |
| 1123 | __m128i a3 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b2), _mm_set1_epi32(4)), 3); |
| 1124 | |
| 1125 | _mm_storeu_si128((__m128i*)&a[0], _mm_packus_epi32(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a1, _MM_SHUFFLE(3, 0, 1, 2)))); |
| 1126 | _mm_storeu_si128((__m128i*)&a[2], _mm_packus_epi32(_mm_shuffle_epi32(a2, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a3, _MM_SHUFFLE(3, 0, 1, 2)))); |
| 1127 | #elif defined __ARM_NEON |
| 1128 | uint8x16x2_t t0 = vzipq_u8(vld1q_u8(data + 0), uint8x16_t()); |
| 1129 | uint8x16x2_t t1 = vzipq_u8(vld1q_u8(data + 16), uint8x16_t()); |
| 1130 | uint8x16x2_t t2 = vzipq_u8(vld1q_u8(data + 32), uint8x16_t()); |
| 1131 | uint8x16x2_t t3 = vzipq_u8(vld1q_u8(data + 48), uint8x16_t()); |
| 1132 | |
| 1133 | uint16x8x2_t d0 = { vreinterpretq_u16_u8(t0.val[0]), vreinterpretq_u16_u8(t0.val[1]) }; |
| 1134 | uint16x8x2_t d1 = { vreinterpretq_u16_u8(t1.val[0]), vreinterpretq_u16_u8(t1.val[1]) }; |
| 1135 | uint16x8x2_t d2 = { vreinterpretq_u16_u8(t2.val[0]), vreinterpretq_u16_u8(t2.val[1]) }; |
| 1136 | uint16x8x2_t d3 = { vreinterpretq_u16_u8(t3.val[0]), vreinterpretq_u16_u8(t3.val[1]) }; |
| 1137 | |
| 1138 | uint16x8x2_t s0 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[0] ), vreinterpretq_s16_u16( d1.val[0] ) ) ), uint16x8_t()); |
| 1139 | uint16x8x2_t s1 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[1] ), vreinterpretq_s16_u16( d1.val[1] ) ) ), uint16x8_t()); |
| 1140 | uint16x8x2_t s2 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[0] ), vreinterpretq_s16_u16( d3.val[0] ) ) ), uint16x8_t()); |
| 1141 | uint16x8x2_t s3 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[1] ), vreinterpretq_s16_u16( d3.val[1] ) ) ), uint16x8_t()); |
| 1142 | |
| 1143 | uint32x4x2_t sum0 = { vreinterpretq_u32_u16(s0.val[0]), vreinterpretq_u32_u16(s0.val[1]) }; |
| 1144 | uint32x4x2_t sum1 = { vreinterpretq_u32_u16(s1.val[0]), vreinterpretq_u32_u16(s1.val[1]) }; |
| 1145 | uint32x4x2_t sum2 = { vreinterpretq_u32_u16(s2.val[0]), vreinterpretq_u32_u16(s2.val[1]) }; |
| 1146 | uint32x4x2_t sum3 = { vreinterpretq_u32_u16(s3.val[0]), vreinterpretq_u32_u16(s3.val[1]) }; |
| 1147 | |
| 1148 | uint32x4_t b0 = vaddq_u32(sum0.val[0], sum0.val[1]); |
| 1149 | uint32x4_t b1 = vaddq_u32(sum1.val[0], sum1.val[1]); |
| 1150 | uint32x4_t b2 = vaddq_u32(sum2.val[0], sum2.val[1]); |
| 1151 | uint32x4_t b3 = vaddq_u32(sum3.val[0], sum3.val[1]); |
| 1152 | |
| 1153 | uint32x4_t a0 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b2, b3), vdupq_n_u32(4)), 3); |
| 1154 | uint32x4_t a1 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b0, b1), vdupq_n_u32(4)), 3); |
| 1155 | uint32x4_t a2 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b1, b3), vdupq_n_u32(4)), 3); |
| 1156 | uint32x4_t a3 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b0, b2), vdupq_n_u32(4)), 3); |
| 1157 | |
| 1158 | uint16x8_t o0 = vcombine_u16(vqmovun_s32(vreinterpretq_s32_u32( a0 )), vqmovun_s32(vreinterpretq_s32_u32( a1 ))); |
| 1159 | uint16x8_t o1 = vcombine_u16(vqmovun_s32(vreinterpretq_s32_u32( a2 )), vqmovun_s32(vreinterpretq_s32_u32( a3 ))); |
| 1160 | |
| 1161 | a[0] = v4i{o0[2], o0[1], o0[0], 0}; |
| 1162 | a[1] = v4i{o0[6], o0[5], o0[4], 0}; |
| 1163 | a[2] = v4i{o1[2], o1[1], o1[0], 0}; |
| 1164 | a[3] = v4i{o1[6], o1[5], o1[4], 0}; |
| 1165 | #else |
| 1166 | uint32_t r[4]; |
| 1167 | uint32_t g[4]; |
| 1168 | uint32_t b[4]; |
| 1169 | |
| 1170 | memset(r, 0, sizeof(r)); |
| 1171 | memset(g, 0, sizeof(g)); |
| 1172 | memset(b, 0, sizeof(b)); |
| 1173 | |
| 1174 | for( int j=0; j<4; j++ ) |
| 1175 | { |
| 1176 | for( int i=0; i<4; i++ ) |
| 1177 | { |
| 1178 | int index = (j & 2) + (i >> 1); |
| 1179 | b[index] += *data++; |
| 1180 | g[index] += *data++; |
| 1181 | r[index] += *data++; |
| 1182 | data++; |
| 1183 | } |
| 1184 | } |
| 1185 | |
| 1186 | a[0] = v4i{ uint16_t( (r[2] + r[3] + 4) / 8 ), uint16_t( (g[2] + g[3] + 4) / 8 ), uint16_t( (b[2] + b[3] + 4) / 8 ), 0}; |
| 1187 | a[1] = v4i{ uint16_t( (r[0] + r[1] + 4) / 8 ), uint16_t( (g[0] + g[1] + 4) / 8 ), uint16_t( (b[0] + b[1] + 4) / 8 ), 0}; |
| 1188 | a[2] = v4i{ uint16_t( (r[1] + r[3] + 4) / 8 ), uint16_t( (g[1] + g[3] + 4) / 8 ), uint16_t( (b[1] + b[3] + 4) / 8 ), 0}; |
| 1189 | a[3] = v4i{ uint16_t( (r[0] + r[2] + 4) / 8 ), uint16_t( (g[0] + g[2] + 4) / 8 ), uint16_t( (b[0] + b[2] + 4) / 8 ), 0}; |
| 1190 | #endif |
| 1191 | } |
| 1192 | |
| 1193 | static etcpak_force_inline void CalcErrorBlock( const uint8_t* data, unsigned int err[4][4] ) |
| 1194 | { |
| 1195 | #ifdef __SSE4_1__ |
| 1196 | __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0); |
| 1197 | __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1); |
| 1198 | __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2); |
| 1199 | __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3); |
| 1200 | |
| 1201 | __m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF)); |
| 1202 | __m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF)); |
| 1203 | __m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF)); |
| 1204 | __m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF)); |
| 1205 | |
| 1206 | __m128i d0l = _mm_unpacklo_epi8(dm0, _mm_setzero_si128()); |
| 1207 | __m128i d0h = _mm_unpackhi_epi8(dm0, _mm_setzero_si128()); |
| 1208 | __m128i d1l = _mm_unpacklo_epi8(dm1, _mm_setzero_si128()); |
| 1209 | __m128i d1h = _mm_unpackhi_epi8(dm1, _mm_setzero_si128()); |
| 1210 | __m128i d2l = _mm_unpacklo_epi8(dm2, _mm_setzero_si128()); |
| 1211 | __m128i d2h = _mm_unpackhi_epi8(dm2, _mm_setzero_si128()); |
| 1212 | __m128i d3l = _mm_unpacklo_epi8(dm3, _mm_setzero_si128()); |
| 1213 | __m128i d3h = _mm_unpackhi_epi8(dm3, _mm_setzero_si128()); |
| 1214 | |
| 1215 | __m128i sum0 = _mm_add_epi16(d0l, d1l); |
| 1216 | __m128i sum1 = _mm_add_epi16(d0h, d1h); |
| 1217 | __m128i sum2 = _mm_add_epi16(d2l, d3l); |
| 1218 | __m128i sum3 = _mm_add_epi16(d2h, d3h); |
| 1219 | |
| 1220 | __m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128()); |
| 1221 | __m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128()); |
| 1222 | __m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128()); |
| 1223 | __m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128()); |
| 1224 | __m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128()); |
| 1225 | __m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128()); |
| 1226 | __m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128()); |
| 1227 | __m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128()); |
| 1228 | |
| 1229 | __m128i b0 = _mm_add_epi32(sum0l, sum0h); |
| 1230 | __m128i b1 = _mm_add_epi32(sum1l, sum1h); |
| 1231 | __m128i b2 = _mm_add_epi32(sum2l, sum2h); |
| 1232 | __m128i b3 = _mm_add_epi32(sum3l, sum3h); |
| 1233 | |
| 1234 | __m128i a0 = _mm_add_epi32(b2, b3); |
| 1235 | __m128i a1 = _mm_add_epi32(b0, b1); |
| 1236 | __m128i a2 = _mm_add_epi32(b1, b3); |
| 1237 | __m128i a3 = _mm_add_epi32(b0, b2); |
| 1238 | |
| 1239 | _mm_storeu_si128((__m128i*)&err[0], a0); |
| 1240 | _mm_storeu_si128((__m128i*)&err[1], a1); |
| 1241 | _mm_storeu_si128((__m128i*)&err[2], a2); |
| 1242 | _mm_storeu_si128((__m128i*)&err[3], a3); |
| 1243 | #elif defined __ARM_NEON |
| 1244 | uint8x16x2_t t0 = vzipq_u8(vld1q_u8(data + 0), uint8x16_t()); |
| 1245 | uint8x16x2_t t1 = vzipq_u8(vld1q_u8(data + 16), uint8x16_t()); |
| 1246 | uint8x16x2_t t2 = vzipq_u8(vld1q_u8(data + 32), uint8x16_t()); |
| 1247 | uint8x16x2_t t3 = vzipq_u8(vld1q_u8(data + 48), uint8x16_t()); |
| 1248 | |
| 1249 | uint16x8x2_t d0 = { vreinterpretq_u16_u8(t0.val[0]), vreinterpretq_u16_u8(t0.val[1]) }; |
| 1250 | uint16x8x2_t d1 = { vreinterpretq_u16_u8(t1.val[0]), vreinterpretq_u16_u8(t1.val[1]) }; |
| 1251 | uint16x8x2_t d2 = { vreinterpretq_u16_u8(t2.val[0]), vreinterpretq_u16_u8(t2.val[1]) }; |
| 1252 | uint16x8x2_t d3 = { vreinterpretq_u16_u8(t3.val[0]), vreinterpretq_u16_u8(t3.val[1]) }; |
| 1253 | |
| 1254 | uint16x8x2_t s0 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[0] ), vreinterpretq_s16_u16( d1.val[0] ))), uint16x8_t()); |
| 1255 | uint16x8x2_t s1 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[1] ), vreinterpretq_s16_u16( d1.val[1] ))), uint16x8_t()); |
| 1256 | uint16x8x2_t s2 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[0] ), vreinterpretq_s16_u16( d3.val[0] ))), uint16x8_t()); |
| 1257 | uint16x8x2_t s3 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[1] ), vreinterpretq_s16_u16( d3.val[1] ))), uint16x8_t()); |
| 1258 | |
| 1259 | uint32x4x2_t sum0 = { vreinterpretq_u32_u16(s0.val[0]), vreinterpretq_u32_u16(s0.val[1]) }; |
| 1260 | uint32x4x2_t sum1 = { vreinterpretq_u32_u16(s1.val[0]), vreinterpretq_u32_u16(s1.val[1]) }; |
| 1261 | uint32x4x2_t sum2 = { vreinterpretq_u32_u16(s2.val[0]), vreinterpretq_u32_u16(s2.val[1]) }; |
| 1262 | uint32x4x2_t sum3 = { vreinterpretq_u32_u16(s3.val[0]), vreinterpretq_u32_u16(s3.val[1]) }; |
| 1263 | |
| 1264 | uint32x4_t b0 = vaddq_u32(sum0.val[0], sum0.val[1]); |
| 1265 | uint32x4_t b1 = vaddq_u32(sum1.val[0], sum1.val[1]); |
| 1266 | uint32x4_t b2 = vaddq_u32(sum2.val[0], sum2.val[1]); |
| 1267 | uint32x4_t b3 = vaddq_u32(sum3.val[0], sum3.val[1]); |
| 1268 | |
| 1269 | uint32x4_t a0 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b2, b3) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) ); |
| 1270 | uint32x4_t a1 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b0, b1) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) ); |
| 1271 | uint32x4_t a2 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b1, b3) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) ); |
| 1272 | uint32x4_t a3 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b0, b2) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) ); |
| 1273 | |
| 1274 | vst1q_u32(err[0], a0); |
| 1275 | vst1q_u32(err[1], a1); |
| 1276 | vst1q_u32(err[2], a2); |
| 1277 | vst1q_u32(err[3], a3); |
| 1278 | #else |
| 1279 | unsigned int terr[4][4]; |
| 1280 | |
| 1281 | memset(terr, 0, 16 * sizeof(unsigned int)); |
| 1282 | |
| 1283 | for( int j=0; j<4; j++ ) |
| 1284 | { |
| 1285 | for( int i=0; i<4; i++ ) |
| 1286 | { |
| 1287 | int index = (j & 2) + (i >> 1); |
| 1288 | unsigned int d = *data++; |
| 1289 | terr[index][0] += d; |
| 1290 | d = *data++; |
| 1291 | terr[index][1] += d; |
| 1292 | d = *data++; |
| 1293 | terr[index][2] += d; |
| 1294 | data++; |
| 1295 | } |
| 1296 | } |
| 1297 | |
| 1298 | for( int i=0; i<3; i++ ) |
| 1299 | { |
| 1300 | err[0][i] = terr[2][i] + terr[3][i]; |
| 1301 | err[1][i] = terr[0][i] + terr[1][i]; |
| 1302 | err[2][i] = terr[1][i] + terr[3][i]; |
| 1303 | err[3][i] = terr[0][i] + terr[2][i]; |
| 1304 | } |
| 1305 | for( int i=0; i<4; i++ ) |
| 1306 | { |
| 1307 | err[i][3] = 0; |
| 1308 | } |
| 1309 | #endif |
| 1310 | } |
| 1311 | |
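| | // Orders candidate averages by squared error: for the 8 pixels of a half-block, |
| | // sum|p-avg|^2 = sum|p|^2 - 2*sum(p).avg + 8*|avg|^2. The first term is identical for all |
| | // candidates, so only the last two are evaluated, offset by a constant to stay non-negative. |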
| 1312 | static etcpak_force_inline unsigned int CalcError( const unsigned int block[4], const v4i& average ) |
| 1313 | { |
| 1314 | unsigned int err = 0x3FFFFFFF; // Big value to prevent negative values, but small enough to prevent overflow |
| 1315 | err -= block[0] * 2 * average[2]; |
| 1316 | err -= block[1] * 2 * average[1]; |
| 1317 | err -= block[2] * 2 * average[0]; |
| 1318 | err += 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) ); |
| 1319 | return err; |
| 1320 | } |
| 1321 | |
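| | // Quantizes the raw averages: a[4..7] hold RGB555 versions with one color of each pair clamped |
| | // so the pair can be encoded as base color plus signed 3-bit delta (differential mode), while |
| | // a[0..3] are re-quantized to RGB444 (individual mode); all values are expanded back to 8 bits. |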
| 1322 | static etcpak_force_inline void ProcessAverages( v4i* a ) |
| 1323 | { |
| 1324 | #ifdef __SSE4_1__ |
| 1325 | for( int i=0; i<2; i++ ) |
| 1326 | { |
| 1327 | __m128i d = _mm_loadu_si128((__m128i*)a[i*2].data()); |
| 1328 | |
| 1329 | __m128i t = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(31)), _mm_set1_epi16(128)); |
| 1330 | |
| 1331 | __m128i c = _mm_srli_epi16(_mm_add_epi16(t, _mm_srli_epi16(t, 8)), 8); |
| 1332 | |
| 1333 | __m128i c1 = _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2)); |
| 1334 | __m128i diff = _mm_sub_epi16(c, c1); |
| 1335 | diff = _mm_max_epi16(diff, _mm_set1_epi16(-4)); |
| 1336 | diff = _mm_min_epi16(diff, _mm_set1_epi16(3)); |
| 1337 | |
| 1338 | __m128i co = _mm_add_epi16(c1, diff); |
| 1339 | |
| 1340 | c = _mm_blend_epi16(co, c, 0xF0); |
| 1341 | |
| 1342 | __m128i a0 = _mm_or_si128(_mm_slli_epi16(c, 3), _mm_srli_epi16(c, 2)); |
| 1343 | |
| 1344 | _mm_storeu_si128((__m128i*)a[4+i*2].data(), a0); |
| 1345 | } |
| 1346 | |
| 1347 | for( int i=0; i<2; i++ ) |
| 1348 | { |
| 1349 | __m128i d = _mm_loadu_si128((__m128i*)a[i*2].data()); |
| 1350 | |
| 1351 | __m128i t0 = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(15)), _mm_set1_epi16(128)); |
| 1352 | __m128i t1 = _mm_srli_epi16(_mm_add_epi16(t0, _mm_srli_epi16(t0, 8)), 8); |
| 1353 | |
| 1354 | __m128i t2 = _mm_or_si128(t1, _mm_slli_epi16(t1, 4)); |
| 1355 | |
| 1356 | _mm_storeu_si128((__m128i*)a[i*2].data(), t2); |
| 1357 | } |
| 1358 | #elif defined __ARM_NEON |
| 1359 | for( int i=0; i<2; i++ ) |
| 1360 | { |
| 1361 | int16x8_t d = vld1q_s16((int16_t*)&a[i*2]); |
| 1362 | int16x8_t t = vaddq_s16(vmulq_s16(d, vdupq_n_s16(31)), vdupq_n_s16(128)); |
| 1363 | int16x8_t c = vshrq_n_s16(vaddq_s16(t, vshrq_n_s16(t, 8)), 8); |
| 1364 | |
| 1365 | int16x8_t c1 = vcombine_s16(vget_high_s16(c), vget_high_s16(c)); |
| 1366 | int16x8_t diff = vsubq_s16(c, c1); |
| 1367 | diff = vmaxq_s16(diff, vdupq_n_s16(-4)); |
| 1368 | diff = vminq_s16(diff, vdupq_n_s16(3)); |
| 1369 | |
| 1370 | int16x8_t co = vaddq_s16(c1, diff); |
| 1371 | |
| 1372 | c = vcombine_s16(vget_low_s16(co), vget_high_s16(c)); |
| 1373 | |
| 1374 | int16x8_t a0 = vorrq_s16(vshlq_n_s16(c, 3), vshrq_n_s16(c, 2)); |
| 1375 | |
| 1376 | vst1q_s16((int16_t*)&a[4+i*2], a0); |
| 1377 | } |
| 1378 | |
| 1379 | for( int i=0; i<2; i++ ) |
| 1380 | { |
| 1381 | int16x8_t d = vld1q_s16((int16_t*)&a[i*2]); |
| 1382 | |
| 1383 | int16x8_t t0 = vaddq_s16(vmulq_s16(d, vdupq_n_s16(15)), vdupq_n_s16(128)); |
| 1384 | int16x8_t t1 = vshrq_n_s16(vaddq_s16(t0, vshrq_n_s16(t0, 8)), 8); |
| 1385 | |
| 1386 | int16x8_t t2 = vorrq_s16(t1, vshlq_n_s16(t1, 4)); |
| 1387 | |
| 1388 | vst1q_s16((int16_t*)&a[i*2], t2); |
| 1389 | } |
| 1390 | #else |
| 1391 | for( int i=0; i<2; i++ ) |
| 1392 | { |
| 1393 | for( int j=0; j<3; j++ ) |
| 1394 | { |
| 1395 | int32_t c1 = mul8bit( a[i*2+1][j], 31 ); |
| 1396 | int32_t c2 = mul8bit( a[i*2][j], 31 ); |
| 1397 | |
| 1398 | int32_t diff = c2 - c1; |
| 1399 | if( diff > 3 ) diff = 3; |
| 1400 | else if( diff < -4 ) diff = -4; |
| 1401 | |
| 1402 | int32_t co = c1 + diff; |
| 1403 | |
| 1404 | a[5+i*2][j] = ( c1 << 3 ) | ( c1 >> 2 ); |
| 1405 | a[4+i*2][j] = ( co << 3 ) | ( co >> 2 ); |
| 1406 | } |
| 1407 | } |
| 1408 | |
| 1409 | for( int i=0; i<4; i++ ) |
| 1410 | { |
| 1411 | a[i][0] = g_avg2[mul8bit( a[i][0], 15 )]; |
| 1412 | a[i][1] = g_avg2[mul8bit( a[i][1], 15 )]; |
| 1413 | a[i][2] = g_avg2[mul8bit( a[i][2], 15 )]; |
| 1414 | } |
| 1415 | #endif |
| 1416 | } |
| 1417 | |
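| | // Packs the selected pair of averages into the low 32 bits of the block: the split/mode index |
| | // goes to bits 24+, individual mode stores two RGB444 colors per channel, differential mode |
| | // stores an RGB555 base color plus a signed 3-bit delta. |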
| 1418 | static etcpak_force_inline void EncodeAverages( uint64_t& _d, const v4i* a, size_t idx ) |
| 1419 | { |
| 1420 | auto d = _d; |
| 1421 | d |= ( idx << 24 ); |
| 1422 | size_t base = idx << 1; |
| 1423 | |
| 1424 | if( ( idx & 0x2 ) == 0 ) |
| 1425 | { |
| 1426 | for( int i=0; i<3; i++ ) |
| 1427 | { |
| 1428 | d |= uint64_t( a[base+0][i] >> 4 ) << ( i*8 ); |
| 1429 | d |= uint64_t( a[base+1][i] >> 4 ) << ( i*8 + 4 ); |
| 1430 | } |
| 1431 | } |
| 1432 | else |
| 1433 | { |
| 1434 | for( int i=0; i<3; i++ ) |
| 1435 | { |
| 1436 | d |= uint64_t( a[base+1][i] & 0xF8 ) << ( i*8 ); |
| 1437 | int32_t c = ( ( a[base+0][i] & 0xF8 ) - ( a[base+1][i] & 0xF8 ) ) >> 3; |
| 1438 | c &= ~0xFFFFFFF8; |
| 1439 | d |= ((uint64_t)c) << ( i*8 ); |
| 1440 | } |
| 1441 | } |
| 1442 | _d = d; |
| 1443 | } |
| 1444 | |
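| | // Returns a ready-made block if all 16 pixels have the same color, 0 otherwise. |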
| 1445 | static etcpak_force_inline uint64_t CheckSolid( const uint8_t* src ) |
| 1446 | { |
| 1447 | #ifdef __SSE4_1__ |
| 1448 | __m128i d0 = _mm_loadu_si128(((__m128i*)src) + 0); |
| 1449 | __m128i d1 = _mm_loadu_si128(((__m128i*)src) + 1); |
| 1450 | __m128i d2 = _mm_loadu_si128(((__m128i*)src) + 2); |
| 1451 | __m128i d3 = _mm_loadu_si128(((__m128i*)src) + 3); |
| 1452 | |
| 1453 | __m128i c = _mm_shuffle_epi32(d0, _MM_SHUFFLE(0, 0, 0, 0)); |
| 1454 | |
| 1455 | __m128i c0 = _mm_cmpeq_epi8(d0, c); |
| 1456 | __m128i c1 = _mm_cmpeq_epi8(d1, c); |
| 1457 | __m128i c2 = _mm_cmpeq_epi8(d2, c); |
| 1458 | __m128i c3 = _mm_cmpeq_epi8(d3, c); |
| 1459 | |
| 1460 | __m128i m0 = _mm_and_si128(c0, c1); |
| 1461 | __m128i m1 = _mm_and_si128(c2, c3); |
| 1462 | __m128i m = _mm_and_si128(m0, m1); |
| 1463 | |
| 1464 | if (!_mm_testc_si128(m, _mm_set1_epi32(-1))) |
| 1465 | { |
| 1466 | return 0; |
| 1467 | } |
| 1468 | #elif defined __ARM_NEON |
| 1469 | int32x4_t d0 = vld1q_s32((int32_t*)src + 0); |
| 1470 | int32x4_t d1 = vld1q_s32((int32_t*)src + 4); |
| 1471 | int32x4_t d2 = vld1q_s32((int32_t*)src + 8); |
| 1472 | int32x4_t d3 = vld1q_s32((int32_t*)src + 12); |
| 1473 | |
| 1474 | int32x4_t c = vdupq_n_s32(d0[0]); |
| 1475 | |
| 1476 | int32x4_t c0 = vreinterpretq_s32_u32(vceqq_s32(d0, c)); |
| 1477 | int32x4_t c1 = vreinterpretq_s32_u32(vceqq_s32(d1, c)); |
| 1478 | int32x4_t c2 = vreinterpretq_s32_u32(vceqq_s32(d2, c)); |
| 1479 | int32x4_t c3 = vreinterpretq_s32_u32(vceqq_s32(d3, c)); |
| 1480 | |
| 1481 | int32x4_t m0 = vandq_s32(c0, c1); |
| 1482 | int32x4_t m1 = vandq_s32(c2, c3); |
| 1483 | int64x2_t m = vreinterpretq_s64_s32(vandq_s32(m0, m1)); |
| 1484 | |
| 1485 | if (m[0] != -1 || m[1] != -1) |
| 1486 | { |
| 1487 | return 0; |
| 1488 | } |
| 1489 | #else |
| 1490 | const uint8_t* ptr = src + 4; |
| 1491 | for( int i=1; i<16; i++ ) |
| 1492 | { |
| 1493 | if( memcmp( src, ptr, 4 ) != 0 ) |
| 1494 | { |
| 1495 | return 0; |
| 1496 | } |
| 1497 | ptr += 4; |
| 1498 | } |
| 1499 | #endif |
| 1500 | return 0x02000000 | |
| 1501 | ( (unsigned int)( src[0] & 0xF8 ) << 16 ) | |
| 1502 | ( (unsigned int)( src[1] & 0xF8 ) << 8 ) | |
| 1503 | ( (unsigned int)( src[2] & 0xF8 ) ); |
| 1504 | } |
| 1505 | |
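| | // Computes the quantized averages and accumulates an error estimate for each of the four |
| | // encoding candidates: err[0..1] cover the two split orientations in individual (RGB444) mode, |
| | // err[2..3] the same splits in differential (RGB555) mode. |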
| 1506 | static etcpak_force_inline void PrepareAverages( v4i a[8], const uint8_t* src, unsigned int err[4] ) |
| 1507 | { |
| 1508 | Average( src, a ); |
| 1509 | ProcessAverages( a ); |
| 1510 | |
| 1511 | unsigned int errblock[4][4]; |
| 1512 | CalcErrorBlock( src, errblock ); |
| 1513 | |
| 1514 | for( int i=0; i<4; i++ ) |
| 1515 | { |
| 1516 | err[i/2] += CalcError( errblock[i], a[i] ); |
| 1517 | err[2+i/2] += CalcError( errblock[i], a[i+4] ); |
| 1518 | } |
| 1519 | } |
| 1520 | |
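| | // Reference selector search: projects each pixel's difference from its half-block average onto |
| | // luma (weights 77/151/28), then for every ETC1 modifier table picks the closest of the four |
| | // modifiers, accumulating squared error per table in terr and the selector index in tsel. |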
| 1521 | static etcpak_force_inline void FindBestFit( uint64_t terr[2][8], uint16_t tsel[16][8], v4i a[8], const uint32_t* id, const uint8_t* data ) |
| 1522 | { |
| 1523 | for( size_t i=0; i<16; i++ ) |
| 1524 | { |
| 1525 | uint16_t* sel = tsel[i]; |
| 1526 | unsigned int bid = id[i]; |
| 1527 | uint64_t* ter = terr[bid%2]; |
| 1528 | |
| 1529 | uint8_t b = *data++; |
| 1530 | uint8_t g = *data++; |
| 1531 | uint8_t r = *data++; |
| 1532 | data++; |
| 1533 | |
| 1534 | int dr = a[bid][0] - r; |
| 1535 | int dg = a[bid][1] - g; |
| 1536 | int db = a[bid][2] - b; |
| 1537 | |
| 1538 | #ifdef __SSE4_1__ |
| 1539 | // Reference implementation |
| 1540 | |
| 1541 | __m128i pix = _mm_set1_epi32(dr * 77 + dg * 151 + db * 28); |
| 1542 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
| 1543 | __m128i error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[0])); |
| 1544 | __m128i error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[1])); |
| 1545 | __m128i error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[0])); |
| 1546 | __m128i error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[1])); |
| 1547 | |
| 1548 | __m128i index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1)); |
| 1549 | __m128i minError0 = _mm_min_epi32(error0, error1); |
| 1550 | |
| 1551 | __m128i index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2)); |
| 1552 | __m128i minError1 = _mm_min_epi32(error2, error3); |
| 1553 | |
| 1554 | __m128i minIndex0 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0)); |
| 1555 | __m128i minError = _mm_min_epi32(minError0, minError1); |
| 1556 | |
| 1557 | // Squaring the minimum error to produce correct values when adding |
| 1558 | __m128i minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0)); |
| 1559 | __m128i squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow); |
| 1560 | squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0)); |
| 1561 | _mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow); |
| 1562 | __m128i minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2)); |
| 1563 | __m128i squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh); |
| 1564 | squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1)); |
| 1565 | _mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh); |
| 1566 | |
| 1567 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
| 1568 | error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[2])); |
| 1569 | error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[3])); |
| 1570 | error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[2])); |
| 1571 | error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[3])); |
| 1572 | |
| 1573 | index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1)); |
| 1574 | minError0 = _mm_min_epi32(error0, error1); |
| 1575 | |
| 1576 | index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2)); |
| 1577 | minError1 = _mm_min_epi32(error2, error3); |
| 1578 | |
| 1579 | __m128i minIndex1 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0)); |
| 1580 | minError = _mm_min_epi32(minError0, minError1); |
| 1581 | |
| 1582 | // Squaring the minimum error to produce correct values when adding |
| 1583 | minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0)); |
| 1584 | squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow); |
| 1585 | squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 2)); |
| 1586 | _mm_storeu_si128(((__m128i*)ter) + 2, squareErrorLow); |
| 1587 | minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2)); |
| 1588 | squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh); |
| 1589 | squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 3)); |
| 1590 | _mm_storeu_si128(((__m128i*)ter) + 3, squareErrorHigh); |
| 1591 | __m128i minIndex = _mm_packs_epi32(minIndex0, minIndex1); |
| 1592 | _mm_storeu_si128((__m128i*)sel, minIndex); |
| 1593 | #elif defined __ARM_NEON |
| 1594 | int32x4_t pix = vdupq_n_s32(dr * 77 + dg * 151 + db * 28); |
| 1595 | |
| 1596 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
| 1597 | uint32x4_t error0 = vreinterpretq_u32_s32(vabsq_s32(vaddq_s32(pix, g_table256_NEON[0]))); |
| 1598 | uint32x4_t error1 = vreinterpretq_u32_s32(vabsq_s32(vaddq_s32(pix, g_table256_NEON[1]))); |
| 1599 | uint32x4_t error2 = vreinterpretq_u32_s32(vabsq_s32(vsubq_s32(pix, g_table256_NEON[0]))); |
| 1600 | uint32x4_t error3 = vreinterpretq_u32_s32(vabsq_s32(vsubq_s32(pix, g_table256_NEON[1]))); |
| 1601 | |
| 1602 | uint32x4_t index0 = vandq_u32(vcltq_u32(error1, error0), vdupq_n_u32(1)); |
| 1603 | uint32x4_t minError0 = vminq_u32(error0, error1); |
| 1604 | |
| 1605 | uint32x4_t index1 = vreinterpretq_u32_s32(vsubq_s32(vdupq_n_s32(2), vreinterpretq_s32_u32(vcltq_u32(error3, error2)))); |
| 1606 | uint32x4_t minError1 = vminq_u32(error2, error3); |
| 1607 | |
| 1608 | uint32x4_t blendMask = vcltq_u32(minError1, minError0); |
| 1609 | uint32x4_t minIndex0 = vorrq_u32(vbicq_u32(index0, blendMask), vandq_u32(index1, blendMask)); |
| 1610 | uint32x4_t minError = vminq_u32(minError0, minError1); |
| 1611 | |
| 1612 | // Squaring the minimum error to produce correct values when adding |
| 1613 | uint32x4_t squareErrorLow = vmulq_u32(minError, minError); |
| 1614 | uint32x4_t squareErrorHigh = vshrq_n_u32(vreinterpretq_u32_s32(vqdmulhq_s32(vreinterpretq_s32_u32(minError), vreinterpretq_s32_u32(minError))), 1); |
| 1615 | uint32x4x2_t squareErrorZip = vzipq_u32(squareErrorLow, squareErrorHigh); |
| 1616 | uint64x2x2_t squareError = { vreinterpretq_u64_u32(squareErrorZip.val[0]), vreinterpretq_u64_u32(squareErrorZip.val[1]) }; |
| 1617 | squareError.val[0] = vaddq_u64(squareError.val[0], vld1q_u64(ter + 0)); |
| 1618 | squareError.val[1] = vaddq_u64(squareError.val[1], vld1q_u64(ter + 2)); |
| 1619 | vst1q_u64(ter + 0, squareError.val[0]); |
| 1620 | vst1q_u64(ter + 2, squareError.val[1]); |
| 1621 | |
| 1622 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
| 1623 | error0 = vreinterpretq_u32_s32( vabsq_s32(vaddq_s32(pix, g_table256_NEON[2]))); |
| 1624 | error1 = vreinterpretq_u32_s32( vabsq_s32(vaddq_s32(pix, g_table256_NEON[3]))); |
| 1625 | error2 = vreinterpretq_u32_s32( vabsq_s32(vsubq_s32(pix, g_table256_NEON[2]))); |
| 1626 | error3 = vreinterpretq_u32_s32( vabsq_s32(vsubq_s32(pix, g_table256_NEON[3]))); |
| 1627 | |
| 1628 | index0 = vandq_u32(vcltq_u32(error1, error0), vdupq_n_u32(1)); |
| 1629 | minError0 = vminq_u32(error0, error1); |
| 1630 | |
| 1631 | index1 = vreinterpretq_u32_s32( vsubq_s32(vdupq_n_s32(2), vreinterpretq_s32_u32(vcltq_u32(error3, error2))) ); |
| 1632 | minError1 = vminq_u32(error2, error3); |
| 1633 | |
| 1634 | blendMask = vcltq_u32(minError1, minError0); |
| 1635 | uint32x4_t minIndex1 = vorrq_u32(vbicq_u32(index0, blendMask), vandq_u32(index1, blendMask)); |
| 1636 | minError = vminq_u32(minError0, minError1); |
| 1637 | |
| 1638 | // Squaring the minimum error to produce correct values when adding |
| 1639 | squareErrorLow = vmulq_u32(minError, minError); |
| 1640 | squareErrorHigh = vshrq_n_u32(vreinterpretq_u32_s32( vqdmulhq_s32(vreinterpretq_s32_u32(minError), vreinterpretq_s32_u32(minError)) ), 1 ); |
| 1641 | squareErrorZip = vzipq_u32(squareErrorLow, squareErrorHigh); |
| 1642 | squareError.val[0] = vaddq_u64(vreinterpretq_u64_u32( squareErrorZip.val[0] ), vld1q_u64(ter + 4)); |
| 1643 | squareError.val[1] = vaddq_u64(vreinterpretq_u64_u32( squareErrorZip.val[1] ), vld1q_u64(ter + 6)); |
| 1644 | vst1q_u64(ter + 4, squareError.val[0]); |
| 1645 | vst1q_u64(ter + 6, squareError.val[1]); |
| 1646 | |
| 1647 | uint16x8_t minIndex = vcombine_u16(vqmovn_u32(minIndex0), vqmovn_u32(minIndex1)); |
| 1648 | vst1q_u16(sel, minIndex); |
| 1649 | #else |
| 1650 | int pix = dr * 77 + dg * 151 + db * 28; |
| 1651 | |
| 1652 | for( int t=0; t<8; t++ ) |
| 1653 | { |
| 1654 | const int64_t* tab = g_table256[t]; |
| 1655 | unsigned int idx = 0; |
| 1656 | uint64_t err = sq( tab[0] + pix ); |
| 1657 | for( int j=1; j<4; j++ ) |
| 1658 | { |
| 1659 | uint64_t local = sq( tab[j] + pix ); |
| 1660 | if( local < err ) |
| 1661 | { |
| 1662 | err = local; |
| 1663 | idx = j; |
| 1664 | } |
| 1665 | } |
| 1666 | *sel++ = idx; |
| 1667 | *ter++ += err; |
| 1668 | } |
| 1669 | #endif |
| 1670 | } |
| 1671 | } |
| 1672 | |
| 1673 | #if defined __SSE4_1__ || defined __ARM_NEON |
| 1674 | // Non-reference implementation, but faster. Produces the same results as the AVX2 version |
| 1675 | static etcpak_force_inline void FindBestFit( uint32_t terr[2][8], uint16_t tsel[16][8], v4i a[8], const uint32_t* id, const uint8_t* data ) |
| 1676 | { |
| 1677 | for( size_t i=0; i<16; i++ ) |
| 1678 | { |
| 1679 | uint16_t* sel = tsel[i]; |
| 1680 | unsigned int bid = id[i]; |
| 1681 | uint32_t* ter = terr[bid%2]; |
| 1682 | |
| 1683 | uint8_t b = *data++; |
| 1684 | uint8_t g = *data++; |
| 1685 | uint8_t r = *data++; |
| 1686 | data++; |
| 1687 | |
| 1688 | int dr = a[bid][0] - r; |
| 1689 | int dg = a[bid][1] - g; |
| 1690 | int db = a[bid][2] - b; |
| 1691 | |
| 1692 | #ifdef __SSE4_1__ |
| 1693 | // The scaling values are divided by two and rounded so that the differences fit into a signed 16-bit range |
| 1694 | // This produces slightly different results, but is significantly faster |
| 1695 | __m128i pixel = _mm_set1_epi16(dr * 38 + dg * 76 + db * 14); |
| 1696 | __m128i pix = _mm_abs_epi16(pixel); |
| 1697 | |
| 1698 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
| 1699 | // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. |
| 1700 | __m128i error0 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[0])); |
| 1701 | __m128i error1 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[1])); |
| 1702 | |
| 1703 | __m128i index = _mm_and_si128(_mm_cmplt_epi16(error1, error0), _mm_set1_epi16(1)); |
| 1704 | __m128i minError = _mm_min_epi16(error0, error1); |
| 1705 | |
| 1706 | // Exploit the symmetry of the selector table and use the sign bit |
| 1707 | // This produces slightly different results, but is needed to match the AVX2 implementation |
| 1708 | __m128i indexBit = _mm_andnot_si128(_mm_srli_epi16(pixel, 15), _mm_set1_epi8(-1)); |
| 1709 | __m128i minIndex = _mm_or_si128(index, _mm_add_epi16(indexBit, indexBit)); |
| 1710 | |
| 1711 | // Squaring the minimum error to produce correct values when adding |
| 1712 | __m128i squareErrorLo = _mm_mullo_epi16(minError, minError); |
| 1713 | __m128i squareErrorHi = _mm_mulhi_epi16(minError, minError); |
| 1714 | |
| 1715 | __m128i squareErrorLow = _mm_unpacklo_epi16(squareErrorLo, squareErrorHi); |
| 1716 | __m128i squareErrorHigh = _mm_unpackhi_epi16(squareErrorLo, squareErrorHi); |
| 1717 | |
| 1718 | squareErrorLow = _mm_add_epi32(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0)); |
| 1719 | _mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow); |
| 1720 | squareErrorHigh = _mm_add_epi32(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1)); |
| 1721 | _mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh); |
| 1722 | |
| 1723 | _mm_storeu_si128((__m128i*)sel, minIndex); |
| 1724 | #elif defined __ARM_NEON |
| 1725 | int16x8_t pixel = vdupq_n_s16( dr * 38 + dg * 76 + db * 14 ); |
| 1726 | int16x8_t pix = vabsq_s16( pixel ); |
| 1727 | |
| 1728 | int16x8_t error0 = vabsq_s16( vsubq_s16( pix, g_table128_NEON[0] ) ); |
| 1729 | int16x8_t error1 = vabsq_s16( vsubq_s16( pix, g_table128_NEON[1] ) ); |
| 1730 | |
| 1731 | int16x8_t index = vandq_s16( vreinterpretq_s16_u16( vcltq_s16( error1, error0 ) ), vdupq_n_s16( 1 ) ); |
| 1732 | int16x8_t minError = vminq_s16( error0, error1 ); |
| 1733 | |
| 1734 | int16x8_t indexBit = vandq_s16( vmvnq_s16( vshrq_n_s16( pixel, 15 ) ), vdupq_n_s16( -1 ) ); |
| 1735 | int16x8_t minIndex = vorrq_s16( index, vaddq_s16( indexBit, indexBit ) ); |
| 1736 | |
| 1737 | int16x4_t minErrorLow = vget_low_s16( minError ); |
| 1738 | int16x4_t minErrorHigh = vget_high_s16( minError ); |
| 1739 | |
| 1740 | int32x4_t squareErrorLow = vmull_s16( minErrorLow, minErrorLow ); |
| 1741 | int32x4_t squareErrorHigh = vmull_s16( minErrorHigh, minErrorHigh ); |
| 1742 | |
| 1743 | int32x4_t squareErrorSumLow = vaddq_s32( squareErrorLow, vld1q_s32( (int32_t*)ter ) ); |
| 1744 | int32x4_t squareErrorSumHigh = vaddq_s32( squareErrorHigh, vld1q_s32( (int32_t*)ter + 4 ) ); |
| 1745 | |
| 1746 | vst1q_s32( (int32_t*)ter, squareErrorSumLow ); |
| 1747 | vst1q_s32( (int32_t*)ter + 4, squareErrorSumHigh ); |
| 1748 | |
| 1749 | vst1q_s16( (int16_t*)sel, minIndex ); |
| 1750 | #endif |
| 1751 | } |
| 1752 | } |
| 1753 | #endif |
| 1754 | |
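| | // Quantization helpers for the planar fit: map a fixed-point color estimate clamped to 0..1023 |
| | // to the 6-bit (red/blue) and 7-bit (green) values of the ETC2 planar encoding. |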
| 1755 | static etcpak_force_inline uint8_t convert6(float f) |
| 1756 | { |
| 1757 | int i = (std::min(std::max(static_cast<int>(f), 0), 1023) - 15) >> 1; |
| 1758 | return (i + 11 - ((i + 11) >> 7) - ((i + 4) >> 7)) >> 3; |
| 1759 | } |
| 1760 | |
| 1761 | static etcpak_force_inline uint8_t convert7(float f) |
| 1762 | { |
| 1763 | int i = (std::min(std::max(static_cast<int>(f), 0), 1023) - 15) >> 1; |
| 1764 | return (i + 9 - ((i + 9) >> 8) - ((i + 6) >> 8)) >> 2; |
| 1765 | } |
| 1766 | |
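| | // ETC2 planar mode: fits a linear color gradient over the 4x4 block, derives the three corner |
| | // colors (RGBO origin, RGBH horizontal, RGBV vertical), packs them as R6G7B6 and, when heuristics |
| | // are enabled and the block was not already classified as planar, also returns the luma-weighted |
| | // reconstruction error used for mode selection. |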
| 1767 | static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar( const uint8_t* src, const uint8_t mode, bool useHeuristics ) |
| 1768 | { |
| 1769 | int32_t r = 0; |
| 1770 | int32_t g = 0; |
| 1771 | int32_t b = 0; |
| 1772 | |
| 1773 | for( int i = 0; i < 16; ++i ) |
| 1774 | { |
| 1775 | b += src[i * 4 + 0]; |
| 1776 | g += src[i * 4 + 1]; |
| 1777 | r += src[i * 4 + 2]; |
| 1778 | } |
| 1779 | |
| 1780 | int32_t difRyz = 0; |
| 1781 | int32_t difGyz = 0; |
| 1782 | int32_t difByz = 0; |
| 1783 | int32_t difRxz = 0; |
| 1784 | int32_t difGxz = 0; |
| 1785 | int32_t difBxz = 0; |
| 1786 | |
| 1787 | const int32_t scaling[] = { -255, -85, 85, 255 }; |
| 1788 | |
| 1789 | for (int i = 0; i < 16; ++i) |
| 1790 | { |
| 1791 | int32_t difB = (static_cast<int>(src[i * 4 + 0]) << 4) - b; |
| 1792 | int32_t difG = (static_cast<int>(src[i * 4 + 1]) << 4) - g; |
| 1793 | int32_t difR = (static_cast<int>(src[i * 4 + 2]) << 4) - r; |
| 1794 | |
| 1795 | difRyz += difR * scaling[i % 4]; |
| 1796 | difGyz += difG * scaling[i % 4]; |
| 1797 | difByz += difB * scaling[i % 4]; |
| 1798 | |
| 1799 | difRxz += difR * scaling[i / 4]; |
| 1800 | difGxz += difG * scaling[i / 4]; |
| 1801 | difBxz += difB * scaling[i / 4]; |
| 1802 | } |
| 1803 | |
| 1804 | const float scale = -4.0f / ((255 * 255 * 8.0f + 85 * 85 * 8.0f) * 16.0f); |
| 1805 | |
| 1806 | float aR = difRxz * scale; |
| 1807 | float aG = difGxz * scale; |
| 1808 | float aB = difBxz * scale; |
| 1809 | |
| 1810 | float bR = difRyz * scale; |
| 1811 | float bG = difGyz * scale; |
| 1812 | float bB = difByz * scale; |
| 1813 | |
| 1814 | float dR = r * (4.0f / 16.0f); |
| 1815 | float dG = g * (4.0f / 16.0f); |
| 1816 | float dB = b * (4.0f / 16.0f); |
| 1817 | |
| 1818 | // calculating the three colors RGBO, RGBH, and RGBV. RGB = df - af * x - bf * y; |
| 1819 | float cofR = std::fma(aR, 255.0f, std::fma(bR, 255.0f, dR)); |
| 1820 | float cofG = std::fma(aG, 255.0f, std::fma(bG, 255.0f, dG)); |
| 1821 | float cofB = std::fma(aB, 255.0f, std::fma(bB, 255.0f, dB)); |
| 1822 | float chfR = std::fma(aR, -425.0f, std::fma(bR, 255.0f, dR)); |
| 1823 | float chfG = std::fma(aG, -425.0f, std::fma(bG, 255.0f, dG)); |
| 1824 | float chfB = std::fma(aB, -425.0f, std::fma(bB, 255.0f, dB)); |
| 1825 | float cvfR = std::fma(aR, 255.0f, std::fma(bR, -425.0f, dR)); |
| 1826 | float cvfG = std::fma(aG, 255.0f, std::fma(bG, -425.0f, dG)); |
| 1827 | float cvfB = std::fma(aB, 255.0f, std::fma(bB, -425.0f, dB)); |
| 1828 | |
| 1829 | // convert to r6g7b6 |
| 1830 | int32_t coR = convert6(cofR); |
| 1831 | int32_t coG = convert7(cofG); |
| 1832 | int32_t coB = convert6(cofB); |
| 1833 | int32_t chR = convert6(chfR); |
| 1834 | int32_t chG = convert7(chfG); |
| 1835 | int32_t chB = convert6(chfB); |
| 1836 | int32_t cvR = convert6(cvfR); |
| 1837 | int32_t cvG = convert7(cvfG); |
| 1838 | int32_t cvB = convert6(cvfB); |
| 1839 | |
| 1840 | // Error calculation |
| 1841 | uint64_t error = 0; |
| 1842 | if( ModePlanar != mode && useHeuristics ) |
| 1843 | { |
| 1844 | auto ro0 = coR; |
| 1845 | auto go0 = coG; |
| 1846 | auto bo0 = coB; |
| 1847 | auto ro1 = ( ro0 >> 4 ) | ( ro0 << 2 ); |
| 1848 | auto go1 = ( go0 >> 6 ) | ( go0 << 1 ); |
| 1849 | auto bo1 = ( bo0 >> 4 ) | ( bo0 << 2 ); |
| 1850 | auto ro2 = ( ro1 << 2 ) + 2; |
| 1851 | auto go2 = ( go1 << 2 ) + 2; |
| 1852 | auto bo2 = ( bo1 << 2 ) + 2; |
| 1853 | |
| 1854 | auto rh0 = chR; |
| 1855 | auto gh0 = chG; |
| 1856 | auto bh0 = chB; |
| 1857 | auto rh1 = ( rh0 >> 4 ) | ( rh0 << 2 ); |
| 1858 | auto gh1 = ( gh0 >> 6 ) | ( gh0 << 1 ); |
| 1859 | auto bh1 = ( bh0 >> 4 ) | ( bh0 << 2 ); |
| 1860 | |
| 1861 | auto rh2 = rh1 - ro1; |
| 1862 | auto gh2 = gh1 - go1; |
| 1863 | auto bh2 = bh1 - bo1; |
| 1864 | |
| 1865 | auto rv0 = cvR; |
| 1866 | auto gv0 = cvG; |
| 1867 | auto bv0 = cvB; |
| 1868 | auto rv1 = ( rv0 >> 4 ) | ( rv0 << 2 ); |
| 1869 | auto gv1 = ( gv0 >> 6 ) | ( gv0 << 1 ); |
| 1870 | auto bv1 = ( bv0 >> 4 ) | ( bv0 << 2 ); |
| 1871 | |
| 1872 | auto rv2 = rv1 - ro1; |
| 1873 | auto gv2 = gv1 - go1; |
| 1874 | auto bv2 = bv1 - bo1; |
| 1875 | for( int i = 0; i < 16; ++i ) |
| 1876 | { |
| 1877 | int32_t cR = clampu8( ( rh2 * ( i / 4 ) + rv2 * ( i % 4 ) + ro2 ) >> 2 ); |
| 1878 | int32_t cG = clampu8( ( gh2 * ( i / 4 ) + gv2 * ( i % 4 ) + go2 ) >> 2 ); |
| 1879 | int32_t cB = clampu8( ( bh2 * ( i / 4 ) + bv2 * ( i % 4 ) + bo2 ) >> 2 ); |
| 1880 | |
| 1881 | int32_t difB = static_cast<int>( src[i * 4 + 0] ) - cB; |
| 1882 | int32_t difG = static_cast<int>( src[i * 4 + 1] ) - cG; |
| 1883 | int32_t difR = static_cast<int>( src[i * 4 + 2] ) - cR; |
| 1884 | |
| 1885 | int32_t dif = difR * 38 + difG * 76 + difB * 14; |
| 1886 | |
| 1887 | error += dif * dif; |
| 1888 | } |
| 1889 | } |
| 1890 | |
| 1891 | // pack the origin (O), horizontal (H) and vertical (V) colors into the 64-bit planar block |
| 1892 | uint32_t rgbv = cvB | ( cvG << 6 ) | ( cvR << 13 ); |
| 1893 | uint32_t rgbh = chB | ( chG << 6 ) | ( chR << 13 ); |
| 1894 | uint32_t hi = rgbv | ( ( rgbh & 0x1FFF ) << 19 ); |
| 1895 | uint32_t lo = ( chR & 0x1 ) | 0x2 | ( ( chR << 1 ) & 0x7C ); |
| 1896 | lo |= ( ( coB & 0x07 ) << 7 ) | ( ( coB & 0x18 ) << 8 ) | ( ( coB & 0x20 ) << 11 ); |
| 1897 | lo |= ( ( coG & 0x3F ) << 17 ) | ( ( coG & 0x40 ) << 18 ); |
| 1898 | lo |= coR << 25; |
| 1899 | |
| 1900 | const auto idx = ( coR & 0x20 ) | ( ( coG & 0x20 ) >> 1 ) | ( ( coB & 0x1E ) >> 1 ); |
| 1901 | |
| 1902 | lo |= g_flags[idx]; |
| 1903 | |
| 1904 | uint64_t result = static_cast<uint32_t>( _bswap( lo ) ); |
| 1905 | result |= static_cast<uint64_t>( static_cast<uint32_t>( _bswap( hi ) ) ) << 32; |
| 1906 | |
| 1907 | return std::make_pair( result, error ); |
| 1908 | } |
| 1909 | |
| 1910 | #ifdef __ARM_NEON |
| 1911 | |
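| | // NEON helpers for the planar fit: DifXZ/DifYZ accumulate the {-255,-85,85,255}-weighted |
| | // differences along the two block axes, SumWide broadcasts a channel sum across all lanes. |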
| 1912 | static etcpak_force_inline int32x2_t Planar_NEON_DifXZ( int16x8_t dif_lo, int16x8_t dif_hi ) |
| 1913 | { |
| 1914 | int32x4_t dif0 = vmull_n_s16( vget_low_s16( dif_lo ), -255 ); |
| 1915 | int32x4_t dif1 = vmull_n_s16( vget_high_s16( dif_lo ), -85 ); |
| 1916 | int32x4_t dif2 = vmull_n_s16( vget_low_s16( dif_hi ), 85 ); |
| 1917 | int32x4_t dif3 = vmull_n_s16( vget_high_s16( dif_hi ), 255 ); |
| 1918 | int32x4_t dif4 = vaddq_s32( vaddq_s32( dif0, dif1 ), vaddq_s32( dif2, dif3 ) ); |
| 1919 | |
| 1920 | #ifndef __aarch64__ |
| 1921 | int32x2_t dif5 = vpadd_s32( vget_low_s32( dif4 ), vget_high_s32( dif4 ) ); |
| 1922 | return vpadd_s32( dif5, dif5 ); |
| 1923 | #else |
| 1924 | return vdup_n_s32( vaddvq_s32( dif4 ) ); |
| 1925 | #endif |
| 1926 | } |
| 1927 | |
| 1928 | static etcpak_force_inline int32x2_t Planar_NEON_DifYZ( int16x8_t dif_lo, int16x8_t dif_hi ) |
| 1929 | { |
| 1930 | int16x4_t scaling = { -255, -85, 85, 255 }; |
| 1931 | int32x4_t dif0 = vmull_s16( vget_low_s16( dif_lo ), scaling ); |
| 1932 | int32x4_t dif1 = vmull_s16( vget_high_s16( dif_lo ), scaling ); |
| 1933 | int32x4_t dif2 = vmull_s16( vget_low_s16( dif_hi ), scaling ); |
| 1934 | int32x4_t dif3 = vmull_s16( vget_high_s16( dif_hi ), scaling ); |
| 1935 | int32x4_t dif4 = vaddq_s32( vaddq_s32( dif0, dif1 ), vaddq_s32( dif2, dif3 ) ); |
| 1936 | |
| 1937 | #ifndef __aarch64__ |
| 1938 | int32x2_t dif5 = vpadd_s32( vget_low_s32( dif4 ), vget_high_s32( dif4 ) ); |
| 1939 | return vpadd_s32( dif5, dif5 ); |
| 1940 | #else |
| 1941 | return vdup_n_s32( vaddvq_s32( dif4 ) ); |
| 1942 | #endif |
| 1943 | } |
| 1944 | |
| 1945 | static etcpak_force_inline int16x8_t Planar_NEON_SumWide( uint8x16_t src ) |
| 1946 | { |
| 1947 | uint16x8_t accu8 = vpaddlq_u8( src ); |
| 1948 | #ifndef __aarch64__ |
| 1949 | uint16x4_t accu4 = vpadd_u16( vget_low_u16( accu8 ), vget_high_u16( accu8 ) ); |
| 1950 | uint16x4_t accu2 = vpadd_u16( accu4, accu4 ); |
| 1951 | uint16x4_t accu1 = vpadd_u16( accu2, accu2 ); |
| 1952 | return vreinterpretq_s16_u16( vcombine_u16( accu1, accu1 ) ); |
| 1953 | #else |
| 1954 | return vdupq_n_s16( vaddvq_u16( accu8 ) ); |
| 1955 | #endif |
| 1956 | } |
| 1957 | |
| 1958 | static etcpak_force_inline int16x8_t convert6_NEON( int32x4_t lo, int32x4_t hi ) |
| 1959 | { |
| 1960 | uint16x8_t x = vcombine_u16( vqmovun_s32( lo ), vqmovun_s32( hi ) ); |
| 1961 | int16x8_t i = vreinterpretq_s16_u16( vshrq_n_u16( vqshlq_n_u16( x, 6 ), 6) ); // clamp 0-1023 |
| 1962 | i = vhsubq_s16( i, vdupq_n_s16( 15 ) ); |
| 1963 | |
| 1964 | int16x8_t ip11 = vaddq_s16( i, vdupq_n_s16( 11 ) ); |
| 1965 | int16x8_t ip4 = vaddq_s16( i, vdupq_n_s16( 4 ) ); |
| 1966 | |
| 1967 | return vshrq_n_s16( vsubq_s16( vsubq_s16( ip11, vshrq_n_s16( ip11, 7 ) ), vshrq_n_s16( ip4, 7) ), 3 ); |
| 1968 | } |
| 1969 | |
| 1970 | static etcpak_force_inline int16x4_t convert7_NEON( int32x4_t x ) |
| 1971 | { |
| 1972 | int16x4_t i = vreinterpret_s16_u16( vshr_n_u16( vqshl_n_u16( vqmovun_s32( x ), 6 ), 6 ) ); // clamp 0-1023 |
| 1973 | i = vhsub_s16( i, vdup_n_s16( 15 ) ); |
| 1974 | |
| 1975 | int16x4_t p9 = vadd_s16( i, vdup_n_s16( 9 ) ); |
| 1976 | int16x4_t p6 = vadd_s16( i, vdup_n_s16( 6 ) ); |
| 1977 | return vshr_n_s16( vsub_s16( vsub_s16( p9, vshr_n_s16( p9, 8 ) ), vshr_n_s16( p6, 8 ) ), 2 ); |
| 1978 | } |
| 1979 | |
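| | // NEON implementation of Planar(): same gradient fit, quantization and packing as the scalar |
| | // version above, with the heuristic error evaluated in vector arithmetic. |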
| 1980 | static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar_NEON( const uint8_t* src, const uint8_t mode, bool useHeuristics ) |
| 1981 | { |
| 1982 | uint8x16x4_t srcBlock = vld4q_u8( src ); |
| 1983 | |
| 1984 | int16x8_t bSumWide = Planar_NEON_SumWide( srcBlock.val[0] ); |
| 1985 | int16x8_t gSumWide = Planar_NEON_SumWide( srcBlock.val[1] ); |
| 1986 | int16x8_t rSumWide = Planar_NEON_SumWide( srcBlock.val[2] ); |
| 1987 | |
| 1988 | int16x8_t dif_R_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[2] ), 4) ), rSumWide ); |
| 1989 | int16x8_t dif_R_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[2] ), 4) ), rSumWide ); |
| 1990 | |
| 1991 | int16x8_t dif_G_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[1] ), 4 ) ), gSumWide ); |
| 1992 | int16x8_t dif_G_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[1] ), 4 ) ), gSumWide ); |
| 1993 | |
| 1994 | int16x8_t dif_B_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[0] ), 4) ), bSumWide ); |
| 1995 | int16x8_t dif_B_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[0] ), 4) ), bSumWide ); |
| 1996 | |
| 1997 | int32x2x2_t dif_xz_z = vzip_s32( vzip_s32( Planar_NEON_DifXZ( dif_B_lo, dif_B_hi ), Planar_NEON_DifXZ( dif_R_lo, dif_R_hi ) ).val[0], Planar_NEON_DifXZ( dif_G_lo, dif_G_hi ) ); |
| 1998 | int32x4_t dif_xz = vcombine_s32( dif_xz_z.val[0], dif_xz_z.val[1] ); |
| 1999 | int32x2x2_t dif_yz_z = vzip_s32( vzip_s32( Planar_NEON_DifYZ( dif_B_lo, dif_B_hi ), Planar_NEON_DifYZ( dif_R_lo, dif_R_hi ) ).val[0], Planar_NEON_DifYZ( dif_G_lo, dif_G_hi ) ); |
| 2000 | int32x4_t dif_yz = vcombine_s32( dif_yz_z.val[0], dif_yz_z.val[1] ); |
| 2001 | |
| 2002 | const float fscale = -4.0f / ( (255 * 255 * 8.0f + 85 * 85 * 8.0f ) * 16.0f ); |
| 2003 | float32x4_t fa = vmulq_n_f32( vcvtq_f32_s32( dif_xz ), fscale ); |
| 2004 | float32x4_t fb = vmulq_n_f32( vcvtq_f32_s32( dif_yz ), fscale ); |
| 2005 | int16x4_t bgrgSum = vzip_s16( vzip_s16( vget_low_s16( bSumWide ), vget_low_s16( rSumWide ) ).val[0], vget_low_s16( gSumWide ) ).val[0]; |
| 2006 | float32x4_t fd = vmulq_n_f32( vcvtq_f32_s32( vmovl_s16( bgrgSum ) ), 4.0f / 16.0f); |
| 2007 | |
| 2008 | float32x4_t cof = vmlaq_n_f32( vmlaq_n_f32( fd, fb, 255.0f ), fa, 255.0f ); |
| 2009 | float32x4_t chf = vmlaq_n_f32( vmlaq_n_f32( fd, fb, 255.0f ), fa, -425.0f ); |
| 2010 | float32x4_t cvf = vmlaq_n_f32( vmlaq_n_f32( fd, fb, -425.0f ), fa, 255.0f ); |
| 2011 | |
| 2012 | int32x4_t coi = vcvtq_s32_f32( cof ); |
| 2013 | int32x4_t chi = vcvtq_s32_f32( chf ); |
| 2014 | int32x4_t cvi = vcvtq_s32_f32( cvf ); |
| 2015 | |
| 2016 | int32x4x2_t tr_hv = vtrnq_s32( chi, cvi ); |
| 2017 | int32x4x2_t tr_o = vtrnq_s32( coi, coi ); |
| 2018 | |
| 2019 | int16x8_t c_hvoo_br_6 = convert6_NEON( tr_hv.val[0], tr_o.val[0] ); |
| 2020 | int16x4_t c_hvox_g_7 = convert7_NEON( vcombine_s32( vget_low_s32( tr_hv.val[1] ), vget_low_s32( tr_o.val[1] ) ) ); |
| 2021 | int16x8_t c_hvoo_br_8 = vorrq_s16( vshrq_n_s16( c_hvoo_br_6, 4 ), vshlq_n_s16( c_hvoo_br_6, 2 ) ); |
| 2022 | int16x4_t c_hvox_g_8 = vorr_s16( vshr_n_s16( c_hvox_g_7, 6 ), vshl_n_s16( c_hvox_g_7, 1 ) ); |
| 2023 | |
| 2024 | uint64_t error = 0; |
| 2025 | if( mode != ModePlanar && useHeuristics ) |
| 2026 | { |
| 2027 | int16x4_t rec_gxbr_o = vext_s16( c_hvox_g_8, vget_high_s16( c_hvoo_br_8 ), 3 ); |
| 2028 | |
| 2029 | rec_gxbr_o = vadd_s16( vshl_n_s16( rec_gxbr_o, 2 ), vdup_n_s16( 2 ) ); |
| 2030 | int16x8_t rec_ro_wide = vdupq_lane_s16( rec_gxbr_o, 3 ); |
| 2031 | int16x8_t rec_go_wide = vdupq_lane_s16( rec_gxbr_o, 0 ); |
| 2032 | int16x8_t rec_bo_wide = vdupq_lane_s16( rec_gxbr_o, 1 ); |
| 2033 | |
| 2034 | int16x4_t br_hv2 = vsub_s16( vget_low_s16( c_hvoo_br_8 ), vget_high_s16( c_hvoo_br_8 ) ); |
| 2035 | int16x4_t gg_hv2 = vsub_s16( c_hvox_g_8, vdup_lane_s16( c_hvox_g_8, 2 ) ); |
| 2036 | |
| 2037 | int16x8_t scaleh_lo = { 0, 0, 0, 0, 1, 1, 1, 1 }; |
| 2038 | int16x8_t scaleh_hi = { 2, 2, 2, 2, 3, 3, 3, 3 }; |
| 2039 | int16x8_t scalev = { 0, 1, 2, 3, 0, 1, 2, 3 }; |
| 2040 | |
| 2041 | int16x8_t rec_r_1 = vmlaq_lane_s16( rec_ro_wide, scalev, br_hv2, 3 ); |
| 2042 | int16x8_t rec_r_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_r_1, scaleh_lo, br_hv2, 2 ), 2 ) ) ); |
| 2043 | int16x8_t rec_r_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_r_1, scaleh_hi, br_hv2, 2 ), 2 ) ) ); |
| 2044 | |
| 2045 | int16x8_t rec_b_1 = vmlaq_lane_s16( rec_bo_wide, scalev, br_hv2, 1 ); |
| 2046 | int16x8_t rec_b_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_b_1, scaleh_lo, br_hv2, 0 ), 2 ) ) ); |
| 2047 | int16x8_t rec_b_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_b_1, scaleh_hi, br_hv2, 0 ), 2 ) ) ); |
| 2048 | |
| 2049 | int16x8_t rec_g_1 = vmlaq_lane_s16( rec_go_wide, scalev, gg_hv2, 1 ); |
| 2050 | int16x8_t rec_g_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_g_1, scaleh_lo, gg_hv2, 0 ), 2 ) ) ); |
| 2051 | int16x8_t rec_g_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_g_1, scaleh_hi, gg_hv2, 0 ), 2 ) ) ); |
| 2052 | |
| 2053 | int16x8_t dif_r_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[2] ) ) ), rec_r_lo ); |
| 2054 | int16x8_t dif_r_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[2] ) ) ), rec_r_hi ); |
| 2055 | |
| 2056 | int16x8_t dif_g_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[1] ) ) ), rec_g_lo ); |
| 2057 | int16x8_t dif_g_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[1] ) ) ), rec_g_hi ); |
| 2058 | |
| 2059 | int16x8_t dif_b_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[0] ) ) ), rec_b_lo ); |
| 2060 | int16x8_t dif_b_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[0] ) ) ), rec_b_hi ); |
| 2061 | |
| 2062 | int16x8_t dif_lo = vmlaq_n_s16( vmlaq_n_s16( vmulq_n_s16( dif_r_lo, 38 ), dif_g_lo, 76 ), dif_b_lo, 14 ); |
| 2063 | int16x8_t dif_hi = vmlaq_n_s16( vmlaq_n_s16( vmulq_n_s16( dif_r_hi, 38 ), dif_g_hi, 76 ), dif_b_hi, 14 ); |
| 2064 | |
| 2065 | int16x4_t tmpDif = vget_low_s16( dif_lo ); |
| 2066 | int32x4_t difsq_0 = vmull_s16( tmpDif, tmpDif ); |
| 2067 | tmpDif = vget_high_s16( dif_lo ); |
| 2068 | int32x4_t difsq_1 = vmull_s16( tmpDif, tmpDif ); |
| 2069 | tmpDif = vget_low_s16( dif_hi ); |
| 2070 | int32x4_t difsq_2 = vmull_s16( tmpDif, tmpDif ); |
| 2071 | tmpDif = vget_high_s16( dif_hi ); |
| 2072 | int32x4_t difsq_3 = vmull_s16( tmpDif, tmpDif ); |
| 2073 | |
| 2074 | uint32x4_t difsq_5 = vaddq_u32( vreinterpretq_u32_s32( difsq_0 ), vreinterpretq_u32_s32( difsq_1 ) ); |
| 2075 | uint32x4_t difsq_6 = vaddq_u32( vreinterpretq_u32_s32( difsq_2 ), vreinterpretq_u32_s32( difsq_3 ) ); |
| 2076 | |
| 2077 | uint64x2_t difsq_7 = vaddl_u32( vget_low_u32( difsq_5 ), vget_high_u32( difsq_5 ) ); |
| 2078 | uint64x2_t difsq_8 = vaddl_u32( vget_low_u32( difsq_6 ), vget_high_u32( difsq_6 ) ); |
| 2079 | |
| 2080 | uint64x2_t difsq_9 = vaddq_u64( difsq_7, difsq_8 ); |
| 2081 | |
| 2082 | #ifdef __aarch64__ |
| 2083 | error = vaddvq_u64( difsq_9 ); |
| 2084 | #else |
| 2085 | error = vgetq_lane_u64( difsq_9, 0 ) + vgetq_lane_u64( difsq_9, 1 ); |
| 2086 | #endif |
| 2087 | } |
| 2088 | |
| 2089 | int32_t coR = c_hvoo_br_6[6]; |
| 2090 | int32_t coG = c_hvox_g_7[2]; |
| 2091 | int32_t coB = c_hvoo_br_6[4]; |
| 2092 | |
| 2093 | int32_t chR = c_hvoo_br_6[2]; |
| 2094 | int32_t chG = c_hvox_g_7[0]; |
| 2095 | int32_t chB = c_hvoo_br_6[0]; |
| 2096 | |
| 2097 | int32_t cvR = c_hvoo_br_6[3]; |
| 2098 | int32_t cvG = c_hvox_g_7[1]; |
| 2099 | int32_t cvB = c_hvoo_br_6[1]; |
| 2100 | |
| 2101 | uint32_t rgbv = cvB | ( cvG << 6 ) | ( cvR << 13 ); |
| 2102 | uint32_t rgbh = chB | ( chG << 6 ) | ( chR << 13 ); |
| 2103 | uint32_t hi = rgbv | ( ( rgbh & 0x1FFF ) << 19 ); |
| 2104 | uint32_t lo = ( chR & 0x1 ) | 0x2 | ( ( chR << 1 ) & 0x7C ); |
| 2105 | lo |= ( ( coB & 0x07 ) << 7 ) | ( ( coB & 0x18 ) << 8 ) | ( ( coB & 0x20 ) << 11 ); |
| 2106 | lo |= ( ( coG & 0x3F ) << 17 ) | ( ( coG & 0x40 ) << 18 ); |
| 2107 | lo |= coR << 25; |
| 2108 | |
| 2109 | const auto idx = ( coR & 0x20 ) | ( ( coG & 0x20 ) >> 1 ) | ( ( coB & 0x1E ) >> 1 ); |
| 2110 | |
| 2111 | lo |= g_flags[idx]; |
| 2112 | |
| 2113 | uint64_t result = static_cast<uint32_t>( _bswap(lo) ); |
| 2114 | result |= static_cast<uint64_t>( static_cast<uint32_t>( _bswap( hi ) ) ) << 32; |
| 2115 | |
| 2116 | return std::make_pair( result, error ); |
| 2117 | } |
| 2118 | |
| 2119 | #endif |
| 2120 | |
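| | // Evaluates the T-/H-mode distance candidates starting at startDist: for each distance it builds |
| | // the four paint colors, assigns every pixel the closest one by luma-weighted difference, and |
| | // keeps the distance and 2-bit indices with the smallest summed squared error. The search stops |
| | // early once the best distance found is two steps behind the current candidate. |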
| 2121 | #ifdef __AVX2__ |
| 2122 | uint32_t calculateErrorTH( bool tMode, uint8_t( colorsRGB444 )[2][3], uint8_t& dist, uint32_t& pixIndices, uint8_t startDist, __m128i r8, __m128i g8, __m128i b8 ) |
| 2123 | #else |
| 2124 | uint32_t calculateErrorTH( bool tMode, uint8_t* src, uint8_t( colorsRGB444 )[2][3], uint8_t& dist, uint32_t& pixIndices, uint8_t startDist ) |
| 2125 | #endif |
| 2126 | { |
| 2127 | uint32_t blockErr = 0, bestBlockErr = MaxError; |
| 2128 | |
| 2129 | uint32_t pixColors; |
| 2130 | uint8_t possibleColors[4][3]; |
| 2131 | uint8_t colors[2][3]; |
| 2132 | |
| 2133 | decompressColor( colorsRGB444, colors ); |
| 2134 | |
| 2135 | #ifdef __AVX2__ |
| 2136 | __m128i reverseMask = _mm_set_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 ); |
| 2137 | #endif |
| 2138 | |
| 2139 | // test distances |
| 2140 | for( uint8_t d = startDist; d < 8; ++d ) |
| 2141 | { |
| 2142 | if( d >= 2 && dist == d - 2 ) break; |
| 2143 | |
| 2144 | blockErr = 0; |
| 2145 | pixColors = 0; |
| 2146 | |
| 2147 | if( tMode ) |
| 2148 | { |
| 2149 | calculatePaintColors59T( d, colors, possibleColors ); |
| 2150 | } |
| 2151 | else |
| 2152 | { |
| 2153 | calculatePaintColors58H( d, colors, possibleColors ); |
| 2154 | } |
| 2155 | |
| 2156 | #ifdef __AVX2__ |
| 2157 | // RGB ordering |
| 2158 | __m128i b8Rev = _mm_shuffle_epi8( b8, reverseMask ); |
| 2159 | __m128i g8Rev = _mm_shuffle_epi8( g8, reverseMask ); |
| 2160 | __m128i r8Rev = _mm_shuffle_epi8( r8, reverseMask ); |
| 2161 | |
| 2162 | // extends the three 128-bit RGB vectors into 256-bit vectors (16-bit lanes) for the error comparisons |
| 2163 | static const __m128i zero = _mm_setzero_si128(); |
| 2164 | __m128i b8Lo = _mm_unpacklo_epi8( b8Rev, zero ); |
| 2165 | __m128i g8Lo = _mm_unpacklo_epi8( g8Rev, zero ); |
| 2166 | __m128i r8Lo = _mm_unpacklo_epi8( r8Rev, zero ); |
| 2167 | __m128i b8Hi = _mm_unpackhi_epi8( b8Rev, zero ); |
| 2168 | __m128i g8Hi = _mm_unpackhi_epi8( g8Rev, zero ); |
| 2169 | __m128i r8Hi = _mm_unpackhi_epi8( r8Rev, zero ); |
| 2170 | |
| 2171 | __m256i b8 = _mm256_set_m128i( b8Hi, b8Lo ); |
| 2172 | __m256i g8 = _mm256_set_m128i( g8Hi, g8Lo ); |
| 2173 | __m256i r8 = _mm256_set_m128i( r8Hi, r8Lo ); |
| 2174 | |
| 2175 | // calculates the differences between the pixel colors and the first palette color |
| 2176 | __m256i diffb = _mm256_abs_epi16( _mm256_sub_epi16( b8, _mm256_set1_epi16( possibleColors[0][B] ) ) ); |
| 2177 | __m256i diffg = _mm256_abs_epi16( _mm256_sub_epi16( g8, _mm256_set1_epi16( possibleColors[0][G] ) ) ); |
| 2178 | __m256i diffr = _mm256_abs_epi16( _mm256_sub_epi16( r8, _mm256_set1_epi16( possibleColors[0][R] ) ) ); |
| 2179 | |
| 2180 | // luma-based error calculations |
| 2181 | static const __m256i bWeight = _mm256_set1_epi16( 14 ); |
| 2182 | static const __m256i gWeight = _mm256_set1_epi16( 76 ); |
| 2183 | static const __m256i rWeight = _mm256_set1_epi16( 38 ); |
| 2184 | |
| 2185 | diffb = _mm256_mullo_epi16( diffb, bWeight ); |
| 2186 | diffg = _mm256_mullo_epi16( diffg, gWeight ); |
| 2187 | diffr = _mm256_mullo_epi16( diffr, rWeight ); |
| 2188 | |
| 2189 | // obtains the error with the current palette color |
| 2190 | __m256i lowestPixErr = _mm256_add_epi16( _mm256_add_epi16( diffb, diffg ), diffr ); |
| 2191 | |
| 2192 | // error calculations with the remaining three palette colors |
| 2193 | static const uint32_t masks[4] = { 0, 0x55555555, 0xAAAAAAAA, 0xFFFFFFFF }; |
| 2194 | for( uint8_t c = 1; c < 4; c++ ) |
| 2195 | { |
| 2196 | __m256i diffb = _mm256_abs_epi16( _mm256_sub_epi16( b8, _mm256_set1_epi16( possibleColors[c][B] ) ) ); |
| 2197 | __m256i diffg = _mm256_abs_epi16( _mm256_sub_epi16( g8, _mm256_set1_epi16( possibleColors[c][G] ) ) ); |
| 2198 | __m256i diffr = _mm256_abs_epi16( _mm256_sub_epi16( r8, _mm256_set1_epi16( possibleColors[c][R] ) ) ); |
| 2199 | |
| 2200 | diffb = _mm256_mullo_epi16( diffb, bWeight ); |
| 2201 | diffg = _mm256_mullo_epi16( diffg, gWeight ); |
| 2202 | diffr = _mm256_mullo_epi16( diffr, rWeight ); |
| 2203 | |
| 2204 | // error comparison with the previous best color |
| 2205 | __m256i pixErrors = _mm256_add_epi16( _mm256_add_epi16( diffb, diffg ), diffr ); |
| 2206 | __m256i minErr = _mm256_min_epu16( lowestPixErr, pixErrors ); |
| 2207 | __m256i cmpRes = _mm256_cmpeq_epi16( pixErrors, minErr ); |
| 2208 | lowestPixErr = minErr; |
| 2209 | |
| 2210 | // update pixel colors |
| 2211 | uint32_t updPixColors = _mm256_movemask_epi8( cmpRes ); |
| 2212 | uint32_t prevPixColors = pixColors & ~updPixColors; |
| 2213 | uint32_t mskPixColors = masks[c] & updPixColors; |
| 2214 | pixColors = prevPixColors | mskPixColors; |
| 2215 | } |
| 2216 | |
| 2217 | // accumulate the block error |
| 2218 | alignas( 32 ) uint16_t pixErr16[16] = { 0, }; |
| 2219 | _mm256_storeu_si256( (__m256i*)pixErr16, lowestPixErr ); |
| 2220 | for( uint8_t p = 0; p < 16; p++ ) |
| 2221 | { |
| 2222 | blockErr += (int)( pixErr16[p] ) * pixErr16[p]; |
| 2223 | } |
| 2224 | #else |
| 2225 | for( size_t y = 0; y < 4; ++y ) |
| 2226 | { |
| 2227 | for( size_t x = 0; x < 4; ++x ) |
| 2228 | { |
| 2229 | uint32_t bestPixErr = MaxError; |
| 2230 | pixColors <<= 2; // Make room for next value |
| 2231 | |
| 2232 | // Loop possible block colors |
| 2233 | for( uint8_t c = 0; c < 4; ++c ) |
| 2234 | { |
| 2235 | int diff[3]; |
| 2236 | diff[R] = src[4 * ( x * 4 + y ) + R] - possibleColors[c][R]; |
| 2237 | diff[G] = src[4 * ( x * 4 + y ) + G] - possibleColors[c][G]; |
| 2238 | diff[B] = src[4 * ( x * 4 + y ) + B] - possibleColors[c][B]; |
| 2239 | |
| 2240 | const uint32_t err = 38 * abs( diff[R] ) + 76 * abs( diff[G] ) + 14 * abs( diff[B] ); |
| 2241 | uint32_t pixErr = err * err; |
| 2242 | |
| 2243 | // Choose best error |
| 2244 | if( pixErr < bestPixErr ) |
| 2245 | { |
| 2246 | bestPixErr = pixErr; |
| 2247 | pixColors ^= ( pixColors & 3 ); // Reset the first two bits |
| 2248 | pixColors |= c; |
| 2249 | } |
| 2250 | } |
| 2251 | blockErr += bestPixErr; |
| 2252 | } |
| 2253 | } |
| 2254 | #endif |
| 2255 | |
| 2256 | if( blockErr < bestBlockErr ) |
| 2257 | { |
| 2258 | bestBlockErr = blockErr; |
| 2259 | dist = d; |
| 2260 | pixIndices = pixColors; |
| 2261 | } |
| 2262 | } |
| 2263 | |
| 2264 | return bestBlockErr; |
| 2265 | } |
| 2266 | |
| 2267 | |
| 2268 | // main T-/H-mode compression function |
| 2269 | #ifdef __AVX2__ |
| 2270 | uint32_t compressBlockTH( uint8_t* src, Luma& l, uint32_t& compressed1, uint32_t& compressed2, bool& tMode, __m128i r8, __m128i g8, __m128i b8 ) |
| 2271 | #else |
| 2272 | uint32_t compressBlockTH( uint8_t *src, Luma& l, uint32_t& compressed1, uint32_t& compressed2, bool &tMode ) |
| 2273 | #endif |
| 2274 | { |
| 2275 | #ifdef __AVX2__ |
| 2276 | alignas( 8 ) uint8_t luma[16] = { 0, }; |
| 2277 | _mm_storeu_si128( (__m128i*)luma, l.luma8 ); |
| 2278 | #elif defined __ARM_NEON && defined __aarch64__ |
| 2279 | alignas( 8 ) uint8_t luma[16] = { 0 }; |
| 2280 | vst1q_u8( luma, l.luma8 ); |
| 2281 | #else |
| 2282 | uint8_t* luma = l.val; |
| 2283 | #endif |
| 2284 | |
| 2285 | uint8_t pixIdx[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; |
| 2286 | |
| 2287 | // 1) sorts the pairs of (luma, pix_idx) |
| 2288 | insertionSort( luma, pixIdx ); |
| 2289 | |
| 2290 | // 2) finds the split point with the minimum combined left+right luma range |
| 2291 | uint8_t minSumRangeIdx = 0; |
| 2292 | uint16_t minSumRangeValue; |
| 2293 | uint16_t sum; |
| 2294 | static const uint8_t diffBonus[15] = {8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 8}; |
| 2295 | const int16_t temp = luma[15] - luma[0]; |
| 2296 | |
| 2297 | minSumRangeValue = luma[15] - luma[1] + diffBonus[0]; |
| 2298 | for( uint8_t i = 1; i < 14; i++ ) |
| 2299 | { |
| 2300 | sum = temp - luma[i+1] + luma[i] + diffBonus[i]; |
| 2301 | if( minSumRangeValue > sum ) |
| 2302 | { |
| 2303 | minSumRangeValue = sum; |
| 2304 | minSumRangeIdx = i; |
| 2305 | } |
| 2306 | } |
| 2307 | |
| 2308 | sum = luma[14] - luma[0] + diffBonus[14]; |
| 2309 | if( minSumRangeValue > sum ) |
| 2310 | { |
| 2311 | minSumRangeValue = sum; |
| 2312 | minSumRangeIdx = 14; |
| 2313 | } |
| 2314 | uint8_t lRange, rRange; |
| 2315 | |
| 2316 | lRange = luma[minSumRangeIdx] - luma[0]; |
| 2317 | rRange = luma[15] - luma[minSumRangeIdx + 1]; |
| 2318 | |
| 2319 | // 3) picks T-mode when one luma range is at least twice the other, otherwise H-mode |
| 2320 | bool swap = false; |
| 2321 | if( lRange >= rRange ) |
| 2322 | { |
| 2323 | if( lRange >= rRange * 2 ) |
| 2324 | { |
| 2325 | swap = true; |
| 2326 | tMode = true; |
| 2327 | } |
| 2328 | } |
| 2329 | else |
| 2330 | { |
| 2331 | if( lRange * 2 <= rRange ) tMode = true; |
| 2332 | } |
| 2333 | // 4) calculates the two base colors |
| 2334 | uint8_t rangeIdx[4] = { pixIdx[0], pixIdx[minSumRangeIdx], pixIdx[minSumRangeIdx + 1], pixIdx[15] }; |
| 2335 | |
| 2336 | uint16_t r[4], g[4], b[4]; |
| 2337 | for( uint8_t i = 0; i < 4; ++i ) |
| 2338 | { |
| 2339 | uint8_t idx = rangeIdx[i] * 4; |
| 2340 | b[i] = src[idx]; |
| 2341 | g[i] = src[idx + 1]; |
| 2342 | r[i] = src[idx + 2]; |
| 2343 | } |
| 2344 | |
| 2345 | uint8_t mid_rgb[2][3]; |
| 2346 | if( swap ) |
| 2347 | { |
| 2348 | mid_rgb[1][B] = ( b[0] + b[1] ) / 2; |
| 2349 | mid_rgb[1][G] = ( g[0] + g[1] ) / 2; |
| 2350 | mid_rgb[1][R] = ( r[0] + r[1] ) / 2; |
| 2351 | |
| 2352 | uint16_t sum_rgb[3] = { 0, 0, 0 }; |
| 2353 | for( uint8_t i = minSumRangeIdx + 1; i < 16; i++ ) |
| 2354 | { |
| 2355 | uint8_t idx = pixIdx[i] * 4; |
| 2356 | sum_rgb[B] += src[idx]; |
| 2357 | sum_rgb[G] += src[idx + 1]; |
| 2358 | sum_rgb[R] += src[idx + 2]; |
| 2359 | } |
| 2360 | const uint8_t temp = 15 - minSumRangeIdx; |
| 2361 | mid_rgb[0][B] = sum_rgb[B] / temp; |
| 2362 | mid_rgb[0][G] = sum_rgb[G] / temp; |
| 2363 | mid_rgb[0][R] = sum_rgb[R] / temp; |
| 2364 | } |
| 2365 | else |
| 2366 | { |
| 2367 | mid_rgb[0][B] = (b[0] + b[1]) / 2; |
| 2368 | mid_rgb[0][G] = (g[0] + g[1]) / 2; |
| 2369 | mid_rgb[0][R] = (r[0] + r[1]) / 2; |
| 2370 | if( tMode ) |
| 2371 | { |
| 2372 | uint16_t sum_rgb[3] = { 0, 0, 0 }; |
| 2373 | for( uint8_t i = minSumRangeIdx + 1; i < 16; i++ ) |
| 2374 | { |
| 2375 | uint8_t idx = pixIdx[i] * 4; |
| 2376 | sum_rgb[B] += src[idx]; |
| 2377 | sum_rgb[G] += src[idx + 1]; |
| 2378 | sum_rgb[R] += src[idx + 2]; |
| 2379 | } |
| 2380 | const uint8_t temp = 15 - minSumRangeIdx; |
| 2381 | mid_rgb[1][B] = sum_rgb[B] / temp; |
| 2382 | mid_rgb[1][G] = sum_rgb[G] / temp; |
| 2383 | mid_rgb[1][R] = sum_rgb[R] / temp; |
| 2384 | } |
| 2385 | else |
| 2386 | { |
| 2387 | mid_rgb[1][B] = (b[2] + b[3]) / 2; |
| 2388 | mid_rgb[1][G] = (g[2] + g[3]) / 2; |
| 2389 | mid_rgb[1][R] = (r[2] + r[3]) / 2; |
| 2390 | } |
| 2391 | } |
| 2392 | |
| 2393 | // 5) sets the start distance index |
| 2394 | uint32_t startDistCandidate; |
| 2395 | uint32_t avgDist; |
| 2396 | if( tMode ) |
| 2397 | { |
| 2398 | if( swap ) |
| 2399 | { |
| 2400 | avgDist = ( b[1] - b[0] + g[1] - g[0] + r[1] - r[0] ) / 6; |
| 2401 | } |
| 2402 | else |
| 2403 | { |
| 2404 | avgDist = ( b[3] - b[2] + g[3] - g[2] + r[3] - r[2] ) / 6; |
| 2405 | } |
| 2406 | } |
| 2407 | else |
| 2408 | { |
| 2409 | avgDist = ( b[1] - b[0] + g[1] - g[0] + r[1] - r[0] + b[3] - b[2] + g[3] - g[2] + r[3] - r[2] ) / 12; |
| 2410 | } |
| 2411 | |
| 2412 | if( avgDist <= 16) |
| 2413 | { |
| 2414 | startDistCandidate = 0; |
| 2415 | } |
| 2416 | else if( avgDist <= 23 ) |
| 2417 | { |
| 2418 | startDistCandidate = 1; |
| 2419 | } |
| 2420 | else if( avgDist <= 32 ) |
| 2421 | { |
| 2422 | startDistCandidate = 2; |
| 2423 | } |
| 2424 | else if( avgDist <= 41 ) |
| 2425 | { |
| 2426 | startDistCandidate = 3; |
| 2427 | } |
| 2428 | else |
| 2429 | { |
| 2430 | startDistCandidate = 4; |
| 2431 | } |
| 2432 | |
| 2433 | uint32_t bestErr = MaxError; |
| 2434 | uint32_t bestPixIndices; |
| 2435 | uint8_t bestDist = 10; |
| 2436 | uint8_t colorsRGB444[2][3]; |
| 2437 | compressColor( mid_rgb, colorsRGB444, tMode ); |
| 2438 | compressed1 = 0; |
| 2439 | |
| 2440 | // 6) finds the best candidate with the lowest error |
| 2441 | #ifdef __AVX2__ |
| 2442 | // Vectorized ver |
| 2443 | bestErr = calculateErrorTH( tMode, colorsRGB444, bestDist, bestPixIndices, startDistCandidate, r8, g8, b8 ); |
| 2444 | #else |
| 2445 | // Scalar ver |
| 2446 | bestErr = calculateErrorTH( tMode, src, colorsRGB444, bestDist, bestPixIndices, startDistCandidate ); |
| 2447 | #endif |
| 2448 | |
| 2449 | // 7) outputs the final T or H block |
| 2450 | if( tMode ) |
| 2451 | { |
| 2452 | // Put the compress params into the compression block |
| 2453 | compressed1 |= ( colorsRGB444[0][R] & 0xf ) << 23; |
| 2454 | compressed1 |= ( colorsRGB444[0][G] & 0xf ) << 19; |
| 2455 | compressed1 |= ( colorsRGB444[0][B] ) << 15; |
| 2456 | compressed1 |= ( colorsRGB444[1][R] ) << 11; |
| 2457 | compressed1 |= ( colorsRGB444[1][G] ) << 7; |
| 2458 | compressed1 |= ( colorsRGB444[1][B] ) << 3; |
| 2459 | compressed1 |= bestDist & 0x7; |
| 2460 | } |
| 2461 | else |
| 2462 | { |
| 2463 | int bestRGB444ColPacked[2]; |
| 2464 | bestRGB444ColPacked[0] = (colorsRGB444[0][R] << 8) + (colorsRGB444[0][G] << 4) + colorsRGB444[0][B]; |
| 2465 | bestRGB444ColPacked[1] = (colorsRGB444[1][R] << 8) + (colorsRGB444[1][G] << 4) + colorsRGB444[1][B]; |
| 2466 | if( ( bestRGB444ColPacked[0] >= bestRGB444ColPacked[1] ) ^ ( ( bestDist & 1 ) == 1 ) ) |
| 2467 | { |
| 2468 | swapColors( colorsRGB444 ); |
| 2469 | // Reshuffle pixel indices to exchange C1 with C3, and C2 with C4 |
| 2470 | bestPixIndices = ( 0x55555555 & bestPixIndices ) | ( 0xaaaaaaaa & ( ~bestPixIndices ) ); |
| 2471 | } |
| 2472 | |
| 2473 | // Put the compress params into the compression block |
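// working H layout (before stuff58bits): R0 bits 25..22, G0 21..18, B0 17..14,
// R1 13..10, G1 9..6, B1 5..2, top two distance bits 1..0; the distance LSB is
// carried by the ordering of the two colours, which the swap above enforces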
| 2474 | compressed1 |= ( colorsRGB444[0][R] & 0xf ) << 22; |
| 2475 | compressed1 |= ( colorsRGB444[0][G] & 0xf ) << 18; |
| 2476 | compressed1 |= ( colorsRGB444[0][B] & 0xf ) << 14; |
| 2477 | compressed1 |= ( colorsRGB444[1][R] & 0xf ) << 10; |
| 2478 | compressed1 |= ( colorsRGB444[1][G] & 0xf ) << 6; |
| 2479 | compressed1 |= ( colorsRGB444[1][B] & 0xf ) << 2; |
| 2480 | compressed1 |= ( bestDist >> 1 ) & 0x3; |
| 2481 | } |
| 2482 | |
| 2483 | bestPixIndices = indexConversion( bestPixIndices ); |
// all 32 pixel-index bits form the second word
compressed2 = bestPixIndices;
| 2486 | |
| 2487 | return bestErr; |
| 2488 | } |
| 2489 | //#endif |
| 2490 | |
| 2491 | template<class T, class S> |
| 2492 | static etcpak_force_inline uint64_t EncodeSelectors( uint64_t d, const T terr[2][8], const S tsel[16][8], const uint32_t* id, const uint64_t value, const uint64_t error) |
| 2493 | { |
| 2494 | size_t tidx[2]; |
| 2495 | tidx[0] = GetLeastError( terr[0], 8 ); |
| 2496 | tidx[1] = GetLeastError( terr[1], 8 ); |
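
// if the best ETC1 selector error cannot beat the error of the candidate block
// passed in (planar or T/H result), keep that candidate unchanged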
| 2497 | |
| 2498 | if ((terr[0][tidx[0]] + terr[1][tidx[1]]) >= error) |
| 2499 | { |
| 2500 | return value; |
| 2501 | } |
| 2502 | |
| 2503 | d |= tidx[0] << 26; |
| 2504 | d |= tidx[1] << 29; |
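// per-pixel 2-bit selectors: LSBs land in bits 32..47, MSBs in bits 48..63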
| 2505 | for( int i=0; i<16; i++ ) |
| 2506 | { |
| 2507 | uint64_t t = tsel[i][tidx[id[i]%2]]; |
| 2508 | d |= ( t & 0x1 ) << ( i + 32 ); |
| 2509 | d |= ( t & 0x2 ) << ( i + 47 ); |
| 2510 | } |
| 2511 | |
| 2512 | return FixByteOrder(d); |
| 2513 | } |
| 2514 | |
| 2515 | } |
| 2516 | |
| 2517 | static etcpak_force_inline uint64_t ProcessRGB( const uint8_t* src ) |
| 2518 | { |
| 2519 | #ifdef __AVX2__ |
| 2520 | uint64_t d = CheckSolid_AVX2( src ); |
| 2521 | if( d != 0 ) return d; |
| 2522 | |
| 2523 | alignas(32) v4i a[8]; |
| 2524 | |
| 2525 | __m128i err0 = PrepareAverages_AVX2( a, src ); |
| 2526 | |
| 2527 | // Get index of minimum error (err0) |
| 2528 | __m128i err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(2, 3, 0, 1)); |
| 2529 | __m128i errMin0 = _mm_min_epu32(err0, err1); |
| 2530 | |
| 2531 | __m128i errMin1 = _mm_shuffle_epi32(errMin0, _MM_SHUFFLE(1, 0, 3, 2)); |
| 2532 | __m128i errMin2 = _mm_min_epu32(errMin1, errMin0); |
| 2533 | |
| 2534 | __m128i errMask = _mm_cmpeq_epi32(errMin2, err0); |
| 2535 | |
| 2536 | uint32_t mask = _mm_movemask_epi8(errMask); |
| 2537 | |
| 2538 | uint32_t idx = _bit_scan_forward(mask) >> 2; |
| 2539 | |
| 2540 | d |= EncodeAverages_AVX2( a, idx ); |
| 2541 | |
| 2542 | alignas(32) uint32_t terr[2][8] = {}; |
| 2543 | alignas(32) uint32_t tsel[8]; |
| 2544 | |
| 2545 | if ((idx == 0) || (idx == 2)) |
| 2546 | { |
| 2547 | FindBestFit_4x2_AVX2( terr, tsel, a, idx * 2, src ); |
| 2548 | } |
| 2549 | else |
| 2550 | { |
| 2551 | FindBestFit_2x4_AVX2( terr, tsel, a, idx * 2, src ); |
| 2552 | } |
| 2553 | |
| 2554 | return EncodeSelectors_AVX2( d, terr, tsel, (idx % 2) == 1 ); |
| 2555 | #else |
| 2556 | uint64_t d = CheckSolid( src ); |
| 2557 | if( d != 0 ) return d; |
| 2558 | |
| 2559 | v4i a[8]; |
| 2560 | unsigned int err[4] = {}; |
| 2561 | PrepareAverages( a, src, err ); |
| 2562 | size_t idx = GetLeastError( err, 4 ); |
| 2563 | EncodeAverages( d, a, idx ); |
| 2564 | |
| 2565 | #if ( defined __SSE4_1__ || defined __ARM_NEON ) && !defined REFERENCE_IMPLEMENTATION |
| 2566 | uint32_t terr[2][8] = {}; |
| 2567 | #else |
| 2568 | uint64_t terr[2][8] = {}; |
| 2569 | #endif |
| 2570 | uint16_t tsel[16][8]; |
| 2571 | auto id = g_id[idx]; |
| 2572 | FindBestFit( terr, tsel, a, id, src ); |
| 2573 | |
| 2574 | return FixByteOrder( EncodeSelectors( d, terr, tsel, id ) ); |
| 2575 | #endif |
| 2576 | } |
| 2577 | |
| 2578 | #ifdef __AVX2__ |
| 2579 | // horizontal min/max functions. https://stackoverflow.com/questions/22256525/horizontal-minimum-and-maximum-using-sse |
// if this fails to build with GCC, set -march in CFLAGS to a specific CPU (e.g. skylake) so the required SSE4.1/BMI intrinsics are enabled.
| 2581 | static inline int16_t hMax( __m128i buffer, uint8_t& idx ) |
| 2582 | { |
| 2583 | __m128i tmp1 = _mm_sub_epi8( _mm_set1_epi8( (char)( 255 ) ), buffer ); |
| 2584 | __m128i tmp2 = _mm_min_epu8( tmp1, _mm_srli_epi16( tmp1, 8 ) ); |
| 2585 | __m128i tmp3 = _mm_minpos_epu16( tmp2 ); |
| 2586 | uint8_t result = 255 - (uint8_t)_mm_cvtsi128_si32( tmp3 ); |
| 2587 | __m128i mask = _mm_cmpeq_epi8( buffer, _mm_set1_epi8( result ) ); |
| 2588 | idx = _tzcnt_u32( _mm_movemask_epi8( mask ) ); |
| 2589 | |
| 2590 | return result; |
| 2591 | } |
| 2592 | #elif defined __ARM_NEON && defined __aarch64__ |
| 2593 | static inline int16_t hMax( uint8x16_t buffer, uint8_t& idx ) |
| 2594 | { |
| 2595 | const uint8_t max = vmaxvq_u8( buffer ); |
| 2596 | const uint16x8_t vmax = vdupq_n_u16( max ); |
| 2597 | uint8x16x2_t buff_wide = vzipq_u8( buffer, uint8x16_t() ); |
| 2598 | uint16x8_t lowbuf16 = vreinterpretq_u16_u8( buff_wide.val[0] ); |
| 2599 | uint16x8_t hibuf16 = vreinterpretq_u16_u8( buff_wide.val[1] ); |
| 2600 | uint16x8_t low_eqmask = vceqq_u16( lowbuf16, vmax ); |
| 2601 | uint16x8_t hi_eqmask = vceqq_u16( hibuf16, vmax ); |
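
// widen the byte-wise equality mask to 16-bit lanes, AND with one-hot position masks and
// sum the lanes with pairwise adds; the lowest set bit of the result is the index of the maximum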
| 2602 | |
| 2603 | static const uint16_t mask_lsb[] = { |
| 2604 | 0x1, 0x2, 0x4, 0x8, |
| 2605 | 0x10, 0x20, 0x40, 0x80 }; |
| 2606 | |
| 2607 | static const uint16_t mask_msb[] = { |
| 2608 | 0x100, 0x200, 0x400, 0x800, |
| 2609 | 0x1000, 0x2000, 0x4000, 0x8000 }; |
| 2610 | |
| 2611 | uint16x8_t vmask_lsb = vld1q_u16( mask_lsb ); |
| 2612 | uint16x8_t vmask_msb = vld1q_u16( mask_msb ); |
| 2613 | uint16x8_t pos_lsb = vandq_u16( vmask_lsb, low_eqmask ); |
| 2614 | uint16x8_t pos_msb = vandq_u16( vmask_msb, hi_eqmask ); |
| 2615 | pos_lsb = vpaddq_u16( pos_lsb, pos_lsb ); |
| 2616 | pos_lsb = vpaddq_u16( pos_lsb, pos_lsb ); |
| 2617 | pos_lsb = vpaddq_u16( pos_lsb, pos_lsb ); |
| 2618 | uint64_t idx_lane1 = vgetq_lane_u64( vreinterpretq_u64_u16( pos_lsb ), 0 ); |
| 2619 | pos_msb = vpaddq_u16( pos_msb, pos_msb ); |
| 2620 | pos_msb = vpaddq_u16( pos_msb, pos_msb ); |
| 2621 | pos_msb = vpaddq_u16( pos_msb, pos_msb ); |
| 2622 | uint32_t idx_lane2 = vgetq_lane_u32( vreinterpretq_u32_u16( pos_msb ), 0 ); |
| 2623 | idx = idx_lane1 != 0 ? __builtin_ctz( idx_lane1 ) : __builtin_ctz( idx_lane2 ); |
| 2624 | |
| 2625 | return max; |
| 2626 | } |
| 2627 | #endif |
| 2628 | |
| 2629 | #ifdef __AVX2__ |
| 2630 | static inline int16_t hMin( __m128i buffer, uint8_t& idx ) |
| 2631 | { |
| 2632 | __m128i tmp2 = _mm_min_epu8( buffer, _mm_srli_epi16( buffer, 8 ) ); |
| 2633 | __m128i tmp3 = _mm_minpos_epu16( tmp2 ); |
| 2634 | uint8_t result = (uint8_t)_mm_cvtsi128_si32( tmp3 ); |
| 2635 | __m128i mask = _mm_cmpeq_epi8( buffer, _mm_set1_epi8( result ) ); |
| 2636 | idx = _tzcnt_u32( _mm_movemask_epi8( mask ) ); |
| 2637 | return result; |
| 2638 | } |
| 2639 | #elif defined __ARM_NEON && defined __aarch64__ |
| 2640 | static inline int16_t hMin( uint8x16_t buffer, uint8_t& idx ) |
| 2641 | { |
| 2642 | const uint8_t min = vminvq_u8( buffer ); |
| 2643 | const uint16x8_t vmin = vdupq_n_u16( min ); |
| 2644 | uint8x16x2_t buff_wide = vzipq_u8( buffer, uint8x16_t() ); |
| 2645 | uint16x8_t lowbuf16 = vreinterpretq_u16_u8( buff_wide.val[0] ); |
| 2646 | uint16x8_t hibuf16 = vreinterpretq_u16_u8( buff_wide.val[1] ); |
| 2647 | uint16x8_t low_eqmask = vceqq_u16( lowbuf16, vmin ); |
| 2648 | uint16x8_t hi_eqmask = vceqq_u16( hibuf16, vmin ); |
| 2649 | |
| 2650 | static const uint16_t mask_lsb[] = { |
| 2651 | 0x1, 0x2, 0x4, 0x8, |
| 2652 | 0x10, 0x20, 0x40, 0x80 }; |
| 2653 | |
| 2654 | static const uint16_t mask_msb[] = { |
| 2655 | 0x100, 0x200, 0x400, 0x800, |
| 2656 | 0x1000, 0x2000, 0x4000, 0x8000 }; |
| 2657 | |
| 2658 | uint16x8_t vmask_lsb = vld1q_u16( mask_lsb ); |
| 2659 | uint16x8_t vmask_msb = vld1q_u16( mask_msb ); |
| 2660 | uint16x8_t pos_lsb = vandq_u16( vmask_lsb, low_eqmask ); |
| 2661 | uint16x8_t pos_msb = vandq_u16( vmask_msb, hi_eqmask ); |
| 2662 | pos_lsb = vpaddq_u16( pos_lsb, pos_lsb ); |
| 2663 | pos_lsb = vpaddq_u16( pos_lsb, pos_lsb ); |
| 2664 | pos_lsb = vpaddq_u16( pos_lsb, pos_lsb ); |
| 2665 | uint64_t idx_lane1 = vgetq_lane_u64( vreinterpretq_u64_u16( pos_lsb ), 0 ); |
| 2666 | pos_msb = vpaddq_u16( pos_msb, pos_msb ); |
| 2667 | pos_msb = vpaddq_u16( pos_msb, pos_msb ); |
| 2668 | pos_msb = vpaddq_u16( pos_msb, pos_msb ); |
| 2669 | uint32_t idx_lane2 = vgetq_lane_u32( vreinterpretq_u32_u16( pos_msb ), 0 ); |
| 2670 | idx = idx_lane1 != 0 ? __builtin_ctz( idx_lane1 ) : __builtin_ctz( idx_lane2 ); |
| 2671 | |
| 2672 | return min; |
| 2673 | } |
| 2674 | #endif |
| 2675 | |
| 2676 | // During search it is not convenient to store the bits the way they are stored in the |
| 2677 | // file format. Hence, after search, it is converted to this format. |
| 2678 | // NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved. |
| 2679 | static inline void stuff59bits( unsigned int thumbT59W1, unsigned int thumbT59W2, unsigned int& thumbTW1, unsigned int& thumbTW2 ) |
| 2680 | { |
| 2681 | // Put bits in twotimer configuration for 59 (red overflows) |
| 2682 | // |
| 2683 | // Go from this bit layout: |
| 2684 | // |
| 2685 | // |63 62 61 60 59|58 57 56 55|54 53 52 51|50 49 48 47|46 45 44 43|42 41 40 39|38 37 36 35|34 33 32| |
| 2686 | // |----empty-----|---red 0---|--green 0--|--blue 0---|---red 1---|--green 1--|--blue 1---|--dist--| |
| 2687 | // |
| 2688 | // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| |
| 2689 | // |----------------------------------------index bits---------------------------------------------| |
| 2690 | // |
| 2691 | // |
| 2692 | // To this: |
| 2693 | // |
| 2694 | // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 |
| 2695 | // ----------------------------------------------------------------------------------------------- |
| 2696 | // |// // //|R0a |//|R0b |G0 |B0 |R1 |G1 |B1 |da |df|db| |
| 2697 | // ----------------------------------------------------------------------------------------------- |
| 2698 | // |
| 2699 | // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| |
| 2700 | // |----------------------------------------index bits---------------------------------------------| |
| 2701 | // |
| 2702 | // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 |
| 2703 | // ----------------------------------------------------------------------------------------------- |
| 2704 | // | base col1 | dcol 2 | base col1 | dcol 2 | base col 1 | dcol 2 | table | table |df|fp| |
| 2705 | // | R1' (5 bits) | dR2 | G1' (5 bits) | dG2 | B1' (5 bits) | dB2 | cw 1 | cw 2 |bt|bt| |
| 2706 | // ------------------------------------------------------------------------------------------------ |
| 2707 | |
| 2708 | uint8_t R0a; |
| 2709 | uint8_t bit, a, b, c, d, bits; |
| 2710 | |
| 2711 | R0a = ( thumbT59W1 >> 25 ) & 0x3; |
| 2712 | |
| 2713 | // Fix middle part |
| 2714 | thumbTW1 = thumbT59W1 << 1; |
| 2715 | // Fix R0a (top two bits of R0) |
| 2716 | thumbTW1 = ( thumbTW1 & ~( 0x3 << 27 ) ) | ( ( R0a & 0x3 ) << 27 ); |
| 2717 | // Fix db (lowest bit of d) |
| 2718 | thumbTW1 = ( thumbTW1 & ~0x1 ) | ( thumbT59W1 & 0x1 ); |
| 2719 | |
| 2720 | // Make sure that red overflows: |
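// (T mode is signalled by making the red channel of the differential base colour overflow;
// the otherwise-unused bits 31..29 and 26 are forced below to guarantee that)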
| 2721 | a = ( thumbTW1 >> 28 ) & 0x1; |
| 2722 | b = ( thumbTW1 >> 27 ) & 0x1; |
| 2723 | c = ( thumbTW1 >> 25 ) & 0x1; |
| 2724 | d = ( thumbTW1 >> 24 ) & 0x1; |
| 2725 | |
// The following abcd bit sequences should be padded with ones: 0111, 1010, 1011, 1101, 1110, 1111
| 2727 | // The following logical expression checks for the presence of any of those: |
| 2728 | bit = ( a & c ) | ( !a & b & c & d ) | ( a & b & !c & d ); |
| 2729 | bits = 0xf * bit; |
| 2730 | thumbTW1 = ( thumbTW1 & ~( 0x7 << 29 ) ) | ( bits & 0x7 ) << 29; |
| 2731 | thumbTW1 = ( thumbTW1 & ~( 0x1 << 26 ) ) | ( !bit & 0x1 ) << 26; |
| 2732 | |
| 2733 | // Set diffbit |
| 2734 | thumbTW1 = ( thumbTW1 & ~0x2 ) | 0x2; |
| 2735 | thumbTW2 = thumbT59W2; |
| 2736 | } |
| 2737 | |
| 2738 | // During search it is not convenient to store the bits the way they are stored in the |
| 2739 | // file format. Hence, after search, it is converted to this format. |
| 2740 | // NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved. |
| 2741 | static inline void stuff58bits( unsigned int thumbH58W1, unsigned int thumbH58W2, unsigned int& thumbHW1, unsigned int& thumbHW2 ) |
| 2742 | { |
| 2743 | // Put bits in twotimer configuration for 58 (red doesn't overflow, green does) |
| 2744 | // |
| 2745 | // Go from this bit layout: |
| 2746 | // |
| 2747 | // |
| 2748 | // |63 62 61 60 59 58|57 56 55 54|53 52 51 50|49 48 47 46|45 44 43 42|41 40 39 38|37 36 35 34|33 32| |
| 2749 | // |-------empty-----|---red 0---|--green 0--|--blue 0---|---red 1---|--green 1--|--blue 1---|d2 d1| |
| 2750 | // |
| 2751 | // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| |
| 2752 | // |---------------------------------------index bits----------------------------------------------| |
| 2753 | // |
| 2754 | // To this: |
| 2755 | // |
| 2756 | // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 |
| 2757 | // ----------------------------------------------------------------------------------------------- |
// |//|R0 |G0 |// // //|G0|B0|//|B0b |R1 |G1 |B1 |d2|df|d1|
| 2759 | // ----------------------------------------------------------------------------------------------- |
| 2760 | // |
| 2761 | // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| |
| 2762 | // |---------------------------------------index bits----------------------------------------------| |
| 2763 | // |
| 2764 | // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 |
| 2765 | // ----------------------------------------------------------------------------------------------- |
| 2766 | // | base col1 | dcol 2 | base col1 | dcol 2 | base col 1 | dcol 2 | table | table |df|fp| |
| 2767 | // | R1' (5 bits) | dR2 | G1' (5 bits) | dG2 | B1' (5 bits) | dB2 | cw 1 | cw 2 |bt|bt| |
| 2768 | // ----------------------------------------------------------------------------------------------- |
| 2769 | // |
| 2770 | // |
| 2771 | // Thus, what we are really doing is going from this bit layout: |
| 2772 | // |
| 2773 | // |
| 2774 | // |63 62 61 60 59 58|57 56 55 54 53 52 51|50 49|48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33|32 | |
| 2775 | // |-------empty-----|part0---------------|part1|part2------------------------------------------|part3| |
| 2776 | // |
| 2777 | // To this: |
| 2778 | // |
| 2779 | // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 |
| 2780 | // --------------------------------------------------------------------------------------------------| |
| 2781 | // |//|part0 |// // //|part1|//|part2 |df|part3| |
| 2782 | // --------------------------------------------------------------------------------------------------| |
| 2783 | |
| 2784 | unsigned int part0, part1, part2, part3; |
| 2785 | uint8_t bit, a, b, c, d, bits; |
| 2786 | |
| 2787 | // move parts |
| 2788 | part0 = ( thumbH58W1 >> 19 ) & 0x7f; |
| 2789 | part1 = ( thumbH58W1 >> 17 ) & 0x3; |
| 2790 | part2 = ( thumbH58W1 >> 1 ) & 0xffff; |
| 2791 | part3 = thumbH58W1 & 0x1; |
| 2792 | thumbHW1 = 0; |
| 2793 | thumbHW1 = ( thumbHW1 & ~( 0x7f << 24 ) ) | ( ( part0 & 0x7f ) << 24 ); |
| 2794 | thumbHW1 = ( thumbHW1 & ~( 0x3 << 19 ) ) | ( ( part1 & 0x3 ) << 19 ); |
| 2795 | thumbHW1 = ( thumbHW1 & ~( 0xffff << 2 ) ) | ( ( part2 & 0xffff ) << 2 ); |
| 2796 | thumbHW1 = ( thumbHW1 & ~0x1 ) | ( part3 & 0x1 ); |
| 2797 | |
| 2798 | // Make sure that red does not overflow: |
| 2799 | bit = ( thumbHW1 >> 30 ) & 0x1; |
| 2800 | thumbHW1 = ( thumbHW1 & ~( 0x1 << 31 ) ) | ( ( !bit & 0x1 ) << 31 ); |
| 2801 | |
| 2802 | // Make sure that green overflows: |
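// (H mode is signalled by the red channel NOT overflowing while the green channel does)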
| 2803 | a = ( thumbHW1 >> 20 ) & 0x1; |
| 2804 | b = ( thumbHW1 >> 19 ) & 0x1; |
| 2805 | c = ( thumbHW1 >> 17 ) & 0x1; |
| 2806 | d = ( thumbHW1 >> 16 ) & 0x1; |
// The following abcd bit sequences should be padded with ones: 0111, 1010, 1011, 1101, 1110, 1111
| 2808 | // The following logical expression checks for the presence of any of those: |
| 2809 | bit = ( a & c ) | ( !a & b & c & d ) | ( a & b & !c & d ); |
| 2810 | bits = 0xf * bit; |
| 2811 | thumbHW1 = ( thumbHW1 & ~( 0x7 << 21 ) ) | ( ( bits & 0x7 ) << 21 ); |
| 2812 | thumbHW1 = ( thumbHW1 & ~( 0x1 << 18 ) ) | ( ( !bit & 0x1 ) << 18 ); |
| 2813 | |
| 2814 | // Set diffbit |
| 2815 | thumbHW1 = ( thumbHW1 & ~0x2 ) | 0x2; |
| 2816 | thumbHW2 = thumbH58W2; |
| 2817 | } |
| 2818 | |
| 2819 | #if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__) |
| 2820 | static etcpak_force_inline Channels GetChannels( const uint8_t* src ) |
| 2821 | { |
| 2822 | Channels ch; |
| 2823 | #ifdef __AVX2__ |
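// deinterleave the 16 interleaved pixels into one 16-byte vector per colour channel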
| 2824 | __m128i d0 = _mm_loadu_si128( ( (__m128i*)src ) + 0 ); |
| 2825 | __m128i d1 = _mm_loadu_si128( ( (__m128i*)src ) + 1 ); |
| 2826 | __m128i d2 = _mm_loadu_si128( ( (__m128i*)src ) + 2 ); |
| 2827 | __m128i d3 = _mm_loadu_si128( ( (__m128i*)src ) + 3 ); |
| 2828 | |
| 2829 | __m128i rgb0 = _mm_shuffle_epi8( d0, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) ); |
| 2830 | __m128i rgb1 = _mm_shuffle_epi8( d1, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) ); |
| 2831 | __m128i rgb2 = _mm_shuffle_epi8( d2, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) ); |
| 2832 | __m128i rgb3 = _mm_shuffle_epi8( d3, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) ); |
| 2833 | |
| 2834 | __m128i rg0 = _mm_unpacklo_epi32( rgb0, rgb1 ); |
| 2835 | __m128i rg1 = _mm_unpacklo_epi32( rgb2, rgb3 ); |
| 2836 | __m128i b0 = _mm_unpackhi_epi32( rgb0, rgb1 ); |
| 2837 | __m128i b1 = _mm_unpackhi_epi32( rgb2, rgb3 ); |
| 2838 | |
| 2839 | // swap channels |
| 2840 | ch.b8 = _mm_unpacklo_epi64( rg0, rg1 ); |
| 2841 | ch.g8 = _mm_unpackhi_epi64( rg0, rg1 ); |
| 2842 | ch.r8 = _mm_unpacklo_epi64( b0, b1 ); |
| 2843 | #elif defined __ARM_NEON && defined __aarch64__ |
| 2844 | //load pixel data into 4 rows |
| 2845 | uint8x16_t px0 = vld1q_u8( src + 0 ); |
| 2846 | uint8x16_t px1 = vld1q_u8( src + 16 ); |
| 2847 | uint8x16_t px2 = vld1q_u8( src + 32 ); |
| 2848 | uint8x16_t px3 = vld1q_u8( src + 48 ); |
| 2849 | |
| 2850 | uint8x16x2_t px0z1 = vzipq_u8( px0, px1 ); |
| 2851 | uint8x16x2_t px2z3 = vzipq_u8( px2, px3 ); |
| 2852 | uint8x16x2_t px01 = vzipq_u8( px0z1.val[0], px0z1.val[1] ); |
| 2853 | uint8x16x2_t rgb01 = vzipq_u8( px01.val[0], px01.val[1] ); |
| 2854 | uint8x16x2_t px23 = vzipq_u8( px2z3.val[0], px2z3.val[1] ); |
| 2855 | uint8x16x2_t rgb23 = vzipq_u8( px23.val[0], px23.val[1] ); |
| 2856 | |
| 2857 | uint8x16_t rr = vreinterpretq_u8_u64( vzip1q_u64( vreinterpretq_u64_u8( rgb01.val[0] ), vreinterpretq_u64_u8( rgb23.val[0] ) ) ); |
| 2858 | uint8x16_t gg = vreinterpretq_u8_u64( vzip2q_u64( vreinterpretq_u64_u8( rgb01.val[0] ), vreinterpretq_u64_u8( rgb23.val[0] ) ) ); |
| 2859 | uint8x16_t bb = vreinterpretq_u8_u64( vzip1q_u64( vreinterpretq_u64_u8( rgb01.val[1] ), vreinterpretq_u64_u8( rgb23.val[1] ) ) ); |
| 2860 | |
| 2861 | uint8x16x2_t red = vzipq_u8( rr, uint8x16_t() ); |
| 2862 | uint8x16x2_t grn = vzipq_u8( gg, uint8x16_t() ); |
| 2863 | uint8x16x2_t blu = vzipq_u8( bb, uint8x16_t() ); |
| 2864 | ch.r = red; |
| 2865 | ch.b = blu; |
| 2866 | ch.g = grn; |
| 2867 | #endif |
| 2868 | return ch; |
| 2869 | } |
| 2870 | #endif |
| 2871 | |
| 2872 | #if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__) |
| 2873 | static etcpak_force_inline void CalculateLuma( Channels& ch, Luma& luma ) |
| 2874 | #else |
| 2875 | static etcpak_force_inline void CalculateLuma( const uint8_t* src, Luma& luma ) |
| 2876 | #endif |
| 2877 | { |
| 2878 | #ifdef __AVX2__ |
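// fixed-point luma: ( 14*b + 76*g + 38*r ) >> 7, i.e. roughly 0.11/0.59/0.30 weights scaled by 128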
| 2879 | __m256i b16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.b8 ), _mm256_set1_epi16( 14 ) ); |
| 2880 | __m256i g16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.g8 ), _mm256_set1_epi16( 76 ) ); |
| 2881 | __m256i r16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.r8 ), _mm256_set1_epi16( 38 ) ); |
| 2882 | |
| 2883 | __m256i luma_16bit = _mm256_add_epi16( _mm256_add_epi16( g16_luma, r16_luma ), b16_luma ); |
| 2884 | __m256i luma_8bit_m256i = _mm256_srli_epi16( luma_16bit, 7 ); |
| 2885 | __m128i luma_8bit_lo = _mm256_extractf128_si256( luma_8bit_m256i, 0 ); |
| 2886 | __m128i luma_8bit_hi = _mm256_extractf128_si256( luma_8bit_m256i, 1 ); |
| 2887 | |
| 2888 | static const __m128i interleaving_mask_lo = _mm_set_epi8( 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0 ); |
| 2889 | static const __m128i interleaving_mask_hi = _mm_set_epi8( 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1 ); |
| 2890 | __m128i luma_8bit_lo_moved = _mm_shuffle_epi8( luma_8bit_lo, interleaving_mask_lo ); |
| 2891 | __m128i luma_8bit_hi_moved = _mm_shuffle_epi8( luma_8bit_hi, interleaving_mask_hi ); |
| 2892 | __m128i luma_8bit = _mm_or_si128( luma_8bit_hi_moved, luma_8bit_lo_moved ); |
| 2893 | luma.luma8 = luma_8bit; |
| 2894 | |
| 2895 | // min/max calculation |
| 2896 | luma.min = hMin( luma_8bit, luma.minIdx ) * 0.00392156f; |
| 2897 | luma.max = hMax( luma_8bit, luma.maxIdx ) * 0.00392156f; |
| 2898 | #elif defined __ARM_NEON && defined __aarch64__ |
//apply the per-channel luma weights
| 2900 | uint16x8_t red0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.r.val[0] ), 14 ); |
| 2901 | uint16x8_t red1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.r.val[1] ), 14 ); |
| 2902 | uint16x8_t grn0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.g.val[0] ), 76 ); |
| 2903 | uint16x8_t grn1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.g.val[1] ), 76 ); |
| 2904 | uint16x8_t blu0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.b.val[0] ), 38 ); |
| 2905 | uint16x8_t blu1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.b.val[1] ), 38 ); |
| 2906 | |
| 2907 | //calculate luma for rows 0,1 and 2,3 |
| 2908 | uint16x8_t lum_r01 = vaddq_u16( vaddq_u16( red0, grn0 ), blu0 ); |
| 2909 | uint16x8_t lum_r23 = vaddq_u16( vaddq_u16( red1, grn1 ), blu1 ); |
| 2910 | |
| 2911 | //divide luma values with right shift and narrow results to 8bit |
| 2912 | uint8x8_t lum_r01_d = vshrn_n_u16( lum_r01, 7 ); |
uint8x8_t lum_r23_d = vshrn_n_u16( lum_r23, 7 );

luma.luma8 = vcombine_u8( lum_r01_d, lum_r23_d );
| 2916 | //find min and max luma value |
| 2917 | luma.min = hMin( luma.luma8, luma.minIdx ) * 0.00392156f; |
| 2918 | luma.max = hMax( luma.luma8, luma.maxIdx ) * 0.00392156f; |
| 2919 | #else |
| 2920 | for( int i = 0; i < 16; ++i ) |
| 2921 | { |
luma.val[i] = ( src[i * 4 + 2] * 76 + src[i * 4 + 1] * 150 + src[i * 4] * 28 ) / 254; // fixed-point luma; the 76/150/28 weights over 254 approximate 0.30/0.59/0.11
| 2923 | if( luma.min > luma.val[i] ) |
| 2924 | { |
| 2925 | luma.min = luma.val[i]; |
| 2926 | luma.minIdx = i; |
| 2927 | } |
| 2928 | if( luma.max < luma.val[i] ) |
| 2929 | { |
| 2930 | luma.max = luma.val[i]; |
| 2931 | luma.maxIdx = i; |
| 2932 | } |
| 2933 | } |
| 2934 | #endif |
| 2935 | } |
| 2936 | |
| 2937 | static etcpak_force_inline uint8_t SelectModeETC2( const Luma& luma ) |
| 2938 | { |
#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__) // these paths already produce luma in [0,1]
| 2940 | const float lumaRange = ( luma.max - luma.min ); |
| 2941 | #else |
| 2942 | const float lumaRange = ( luma.max - luma.min ) * ( 1.f / 255.f ); |
| 2943 | #endif |
| 2944 | // filters a very-low-contrast block |
| 2945 | if( lumaRange <= ecmd_threshold[0] ) |
| 2946 | { |
| 2947 | return ModePlanar; |
| 2948 | } |
| 2949 | // checks whether a pair of the corner pixels in a block has the min/max luma values; |
| 2950 | // if so, the ETC2 planar mode is enabled, and otherwise, the ETC1 mode is enabled |
| 2951 | else if( lumaRange <= ecmd_threshold[1] ) |
| 2952 | { |
| 2953 | #ifdef __AVX2__ |
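// corner_pair packs the diagonal index pairs (0,15), (3,12), (12,3), (15,0); comparing each
// 16-bit lane against the packed (maxIdx, minIdx) bytes flags the case where the min and max
// luma pixels sit on opposite corners of the block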
| 2954 | static const __m128i corner_pair = _mm_set_epi8( 1, 1, 1, 1, 1, 1, 1, 1, 0, 15, 3, 12, 12, 3, 15, 0 ); |
| 2955 | __m128i current_max_min = _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, luma.minIdx, luma.maxIdx, luma.minIdx, luma.maxIdx, luma.minIdx, luma.maxIdx, luma.minIdx, luma.maxIdx ); |
| 2956 | |
| 2957 | __m128i max_min_result = _mm_cmpeq_epi16( corner_pair, current_max_min ); |
| 2958 | |
| 2959 | int mask = _mm_movemask_epi8( max_min_result ); |
| 2960 | if( mask ) |
| 2961 | { |
| 2962 | return ModePlanar; |
| 2963 | } |
| 2964 | #else |
| 2965 | // check whether a pair of the corner pixels in a block has the min/max luma values; |
| 2966 | // if so, the ETC2 planar mode is enabled. |
| 2967 | if( ( luma.minIdx == 0 && luma.maxIdx == 15 ) || |
| 2968 | ( luma.minIdx == 15 && luma.maxIdx == 0 ) || |
| 2969 | ( luma.minIdx == 3 && luma.maxIdx == 12 ) || |
| 2970 | ( luma.minIdx == 12 && luma.maxIdx == 3 ) ) |
| 2971 | { |
| 2972 | return ModePlanar; |
| 2973 | } |
| 2974 | #endif |
| 2975 | } |
| 2976 | // filters a high-contrast block for checking both ETC1 mode and the ETC2 T/H mode |
| 2977 | else if( lumaRange >= ecmd_threshold[2] ) |
| 2978 | { |
| 2979 | return ModeTH; |
| 2980 | } |
| 2981 | return ModeUndecided; |
| 2982 | } |
| 2983 | |
| 2984 | static etcpak_force_inline uint64_t ProcessRGB_ETC2( const uint8_t* src, bool useHeuristics ) |
| 2985 | { |
| 2986 | #ifdef __AVX2__ |
| 2987 | uint64_t d = CheckSolid_AVX2( src ); |
| 2988 | if( d != 0 ) return d; |
| 2989 | #else |
| 2990 | uint64_t d = CheckSolid( src ); |
| 2991 | if (d != 0) return d; |
| 2992 | #endif |
| 2993 | |
| 2994 | uint8_t mode = ModeUndecided; |
| 2995 | Luma luma; |
| 2996 | #ifdef __AVX2__ |
| 2997 | Channels ch = GetChannels( src ); |
| 2998 | if( useHeuristics ) |
| 2999 | { |
| 3000 | CalculateLuma( ch, luma ); |
| 3001 | mode = SelectModeETC2( luma ); |
| 3002 | } |
| 3003 | |
| 3004 | auto plane = Planar_AVX2( ch, mode, useHeuristics ); |
| 3005 | if( useHeuristics && mode == ModePlanar ) return plane.plane; |
| 3006 | |
| 3007 | alignas( 32 ) v4i a[8]; |
| 3008 | __m128i err0 = PrepareAverages_AVX2( a, plane.sum4 ); |
| 3009 | |
| 3010 | // Get index of minimum error (err0) |
| 3011 | __m128i err1 = _mm_shuffle_epi32( err0, _MM_SHUFFLE( 2, 3, 0, 1 ) ); |
| 3012 | __m128i errMin0 = _mm_min_epu32(err0, err1); |
| 3013 | |
| 3014 | __m128i errMin1 = _mm_shuffle_epi32( errMin0, _MM_SHUFFLE( 1, 0, 3, 2 ) ); |
| 3015 | __m128i errMin2 = _mm_min_epu32( errMin1, errMin0 ); |
| 3016 | |
| 3017 | __m128i errMask = _mm_cmpeq_epi32( errMin2, err0 ); |
| 3018 | |
| 3019 | uint32_t mask = _mm_movemask_epi8( errMask ); |
| 3020 | |
| 3021 | size_t idx = _bit_scan_forward( mask ) >> 2; |
| 3022 | |
| 3023 | d = EncodeAverages_AVX2( a, idx ); |
| 3024 | |
| 3025 | alignas(32) uint32_t terr[2][8] = {}; |
| 3026 | alignas(32) uint32_t tsel[8]; |
| 3027 | |
| 3028 | if ((idx == 0) || (idx == 2)) |
| 3029 | { |
| 3030 | FindBestFit_4x2_AVX2( terr, tsel, a, idx * 2, src ); |
| 3031 | } |
| 3032 | else |
| 3033 | { |
| 3034 | FindBestFit_2x4_AVX2( terr, tsel, a, idx * 2, src ); |
| 3035 | } |
| 3036 | |
| 3037 | if( useHeuristics ) |
| 3038 | { |
| 3039 | if( mode == ModeTH ) |
| 3040 | { |
| 3041 | uint64_t result = 0; |
| 3042 | uint64_t error = 0; |
| 3043 | uint32_t compressed[4] = { 0, 0, 0, 0 }; |
| 3044 | bool tMode = false; |
| 3045 | |
| 3046 | error = compressBlockTH( (uint8_t*)src, luma, compressed[0], compressed[1], tMode, ch.r8, ch.g8, ch.b8 ); |
| 3047 | if( tMode ) |
| 3048 | { |
| 3049 | stuff59bits( compressed[0], compressed[1], compressed[2], compressed[3] ); |
| 3050 | } |
| 3051 | else |
| 3052 | { |
| 3053 | stuff58bits( compressed[0], compressed[1], compressed[2], compressed[3] ); |
| 3054 | } |
| 3055 | |
| 3056 | result = (uint32_t)_bswap( compressed[2] ); |
| 3057 | result |= static_cast<uint64_t>( _bswap( compressed[3] ) ) << 32; |
| 3058 | |
| 3059 | plane.plane = result; |
| 3060 | plane.error = error; |
| 3061 | } |
| 3062 | else |
| 3063 | { |
| 3064 | plane.plane = 0; |
| 3065 | plane.error = MaxError; |
| 3066 | } |
| 3067 | } |
| 3068 | |
| 3069 | return EncodeSelectors_AVX2( d, terr, tsel, ( idx % 2 ) == 1, plane.plane, plane.error ); |
| 3070 | #else |
| 3071 | if( useHeuristics ) |
| 3072 | { |
| 3073 | #if defined __ARM_NEON && defined __aarch64__ |
| 3074 | Channels ch = GetChannels( src ); |
| 3075 | CalculateLuma( ch, luma ); |
| 3076 | #else |
| 3077 | CalculateLuma( src, luma ); |
| 3078 | #endif |
| 3079 | mode = SelectModeETC2( luma ); |
| 3080 | } |
| 3081 | #ifdef __ARM_NEON |
| 3082 | auto result = Planar_NEON( src, mode, useHeuristics ); |
| 3083 | #else |
| 3084 | auto result = Planar( src, mode, useHeuristics ); |
| 3085 | #endif |
| 3086 | if( result.second == 0 ) return result.first; |
| 3087 | |
| 3088 | v4i a[8]; |
| 3089 | unsigned int err[4] = {}; |
| 3090 | PrepareAverages( a, src, err ); |
| 3091 | size_t idx = GetLeastError( err, 4 ); |
| 3092 | EncodeAverages( d, a, idx ); |
| 3093 | |
| 3094 | #if ( defined __SSE4_1__ || defined __ARM_NEON ) && !defined REFERENCE_IMPLEMENTATION |
| 3095 | uint32_t terr[2][8] = {}; |
| 3096 | #else |
| 3097 | uint64_t terr[2][8] = {}; |
| 3098 | #endif |
| 3099 | uint16_t tsel[16][8]; |
| 3100 | auto id = g_id[idx]; |
| 3101 | FindBestFit( terr, tsel, a, id, src ); |
| 3102 | |
| 3103 | if( useHeuristics ) |
| 3104 | { |
| 3105 | if( mode == ModeTH ) |
| 3106 | { |
| 3107 | uint32_t compressed[4] = { 0, 0, 0, 0 }; |
| 3108 | bool tMode = false; |
| 3109 | |
| 3110 | result.second = compressBlockTH( (uint8_t*)src, luma, compressed[0], compressed[1], tMode ); |
| 3111 | if( tMode ) |
| 3112 | { |
| 3113 | stuff59bits( compressed[0], compressed[1], compressed[2], compressed[3] ); |
| 3114 | } |
| 3115 | else |
| 3116 | { |
| 3117 | stuff58bits( compressed[0], compressed[1], compressed[2], compressed[3] ); |
| 3118 | } |
| 3119 | |
| 3120 | result.first = (uint32_t)_bswap( compressed[2] ); |
| 3121 | result.first |= static_cast<uint64_t>( _bswap( compressed[3] ) ) << 32; |
| 3122 | } |
| 3123 | else |
| 3124 | { |
| 3125 | result.first = 0; |
| 3126 | result.second = MaxError; |
| 3127 | } |
| 3128 | } |
| 3129 | |
| 3130 | return EncodeSelectors( d, terr, tsel, id, result.first, result.second ); |
| 3131 | #endif |
| 3132 | } |
| 3133 | |
| 3134 | #ifdef __SSE4_1__ |
| 3135 | template<int K> |
| 3136 | static etcpak_force_inline __m128i Widen( const __m128i src ) |
| 3137 | { |
| 3138 | static_assert( K >= 0 && K <= 7, "Index out of range" ); |
| 3139 | |
| 3140 | __m128i tmp; |
| 3141 | switch( K ) |
| 3142 | { |
| 3143 | case 0: |
| 3144 | tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 0, 0, 0, 0 ) ); |
| 3145 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) ); |
| 3146 | case 1: |
| 3147 | tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 1, 1, 1, 1 ) ); |
| 3148 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) ); |
| 3149 | case 2: |
| 3150 | tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
| 3151 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) ); |
| 3152 | case 3: |
| 3153 | tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 3, 3, 3, 3 ) ); |
| 3154 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) ); |
| 3155 | case 4: |
| 3156 | tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 0, 0, 0, 0 ) ); |
| 3157 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
| 3158 | case 5: |
| 3159 | tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 1, 1, 1, 1 ) ); |
| 3160 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
| 3161 | case 6: |
| 3162 | tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
| 3163 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
| 3164 | case 7: |
| 3165 | tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 3, 3, 3, 3 ) ); |
| 3166 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
| 3167 | } |
| 3168 | } |
| 3169 | |
| 3170 | static etcpak_force_inline int GetMulSel( int sel ) |
| 3171 | { |
| 3172 | switch( sel ) |
| 3173 | { |
| 3174 | case 0: |
| 3175 | return 0; |
| 3176 | case 1: |
| 3177 | case 2: |
| 3178 | case 3: |
| 3179 | return 1; |
| 3180 | case 4: |
| 3181 | return 2; |
| 3182 | case 5: |
| 3183 | case 6: |
| 3184 | case 7: |
| 3185 | return 3; |
| 3186 | case 8: |
| 3187 | case 9: |
| 3188 | case 10: |
| 3189 | case 11: |
| 3190 | case 12: |
| 3191 | case 13: |
| 3192 | return 4; |
| 3193 | case 14: |
| 3194 | case 15: |
| 3195 | return 5; |
| 3196 | } |
| 3197 | } |
| 3198 | |
| 3199 | #endif |
| 3200 | |
| 3201 | #ifdef __ARM_NEON |
| 3202 | |
| 3203 | static constexpr etcpak_force_inline int GetMulSel(int sel) |
| 3204 | { |
| 3205 | return ( sel < 1 ) ? 0 : ( sel < 4 ) ? 1 : ( sel < 5 ) ? 2 : ( sel < 8 ) ? 3 : ( sel < 14 ) ? 4 : 5; |
| 3206 | } |
| 3207 | |
| 3208 | static constexpr int ClampConstant( int x, int min, int max ) |
| 3209 | { |
| 3210 | return x < min ? min : x > max ? max : x; |
| 3211 | } |
| 3212 | |
| 3213 | template <int Index> |
| 3214 | etcpak_force_inline static uint16x8_t ErrorProbe_EAC_NEON( uint8x8_t recVal, uint8x16_t alphaBlock ) |
| 3215 | { |
| 3216 | uint8x8_t srcValWide; |
| 3217 | #ifndef __aarch64__ |
| 3218 | if( Index < 8 ) |
| 3219 | srcValWide = vdup_lane_u8( vget_low_u8( alphaBlock ), ClampConstant( Index, 0, 7 ) ); |
| 3220 | else |
| 3221 | srcValWide = vdup_lane_u8( vget_high_u8( alphaBlock ), ClampConstant( Index - 8, 0, 7 ) ); |
| 3222 | #else |
| 3223 | srcValWide = vdup_laneq_u8( alphaBlock, Index ); |
| 3224 | #endif |
| 3225 | |
| 3226 | uint8x8_t deltaVal = vabd_u8( srcValWide, recVal ); |
| 3227 | return vmull_u8( deltaVal, deltaVal ); |
| 3228 | } |
| 3229 | |
| 3230 | etcpak_force_inline static uint16_t MinError_EAC_NEON( uint16x8_t errProbe ) |
| 3231 | { |
| 3232 | #ifndef __aarch64__ |
| 3233 | uint16x4_t tmpErr = vpmin_u16( vget_low_u16( errProbe ), vget_high_u16( errProbe ) ); |
| 3234 | tmpErr = vpmin_u16( tmpErr, tmpErr ); |
| 3235 | return vpmin_u16( tmpErr, tmpErr )[0]; |
| 3236 | #else |
| 3237 | return vminvq_u16( errProbe ); |
| 3238 | #endif |
| 3239 | } |
| 3240 | |
| 3241 | template <int Index> |
| 3242 | etcpak_force_inline static uint64_t MinErrorIndex_EAC_NEON( uint8x8_t recVal, uint8x16_t alphaBlock ) |
| 3243 | { |
| 3244 | uint16x8_t errProbe = ErrorProbe_EAC_NEON<Index>( recVal, alphaBlock ); |
| 3245 | uint16x8_t minErrMask = vceqq_u16( errProbe, vdupq_n_u16( MinError_EAC_NEON( errProbe ) ) ); |
| 3246 | uint64_t idx = __builtin_ctzll( vget_lane_u64( vreinterpret_u64_u8( vqmovn_u16( minErrMask ) ), 0 ) ); |
| 3247 | idx >>= 3; |
| 3248 | idx <<= 45 - Index * 3; |
| 3249 | |
| 3250 | return idx; |
| 3251 | } |
| 3252 | |
| 3253 | template <int Index> |
| 3254 | etcpak_force_inline static int16x8_t WidenMultiplier_EAC_NEON( int16x8_t multipliers ) |
| 3255 | { |
| 3256 | constexpr int Lane = GetMulSel( Index ); |
| 3257 | #ifndef __aarch64__ |
| 3258 | if( Lane < 4 ) |
| 3259 | return vdupq_lane_s16( vget_low_s16( multipliers ), ClampConstant( Lane, 0, 3 ) ); |
| 3260 | else |
| 3261 | return vdupq_lane_s16( vget_high_s16( multipliers ), ClampConstant( Lane - 4, 0, 3 ) ); |
| 3262 | #else |
| 3263 | return vdupq_laneq_s16( multipliers, Lane ); |
| 3264 | #endif |
| 3265 | } |
| 3266 | |
| 3267 | #endif |
| 3268 | |
| 3269 | static etcpak_force_inline uint64_t ProcessAlpha_ETC2( const uint8_t* src ) |
| 3270 | { |
| 3271 | #if defined __SSE4_1__ |
| 3272 | // Check solid |
| 3273 | __m128i s = _mm_loadu_si128( (__m128i*)src ); |
| 3274 | __m128i solidCmp = _mm_set1_epi8( src[0] ); |
| 3275 | __m128i cmpRes = _mm_cmpeq_epi8( s, solidCmp ); |
| 3276 | if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) ) |
| 3277 | { |
| 3278 | return src[0]; |
| 3279 | } |
| 3280 | |
| 3281 | // Calculate min, max |
| 3282 | __m128i s1 = _mm_shuffle_epi32( s, _MM_SHUFFLE( 2, 3, 0, 1 ) ); |
| 3283 | __m128i max1 = _mm_max_epu8( s, s1 ); |
| 3284 | __m128i min1 = _mm_min_epu8( s, s1 ); |
| 3285 | __m128i smax2 = _mm_shuffle_epi32( max1, _MM_SHUFFLE( 0, 0, 2, 2 ) ); |
| 3286 | __m128i smin2 = _mm_shuffle_epi32( min1, _MM_SHUFFLE( 0, 0, 2, 2 ) ); |
| 3287 | __m128i max2 = _mm_max_epu8( max1, smax2 ); |
| 3288 | __m128i min2 = _mm_min_epu8( min1, smin2 ); |
| 3289 | __m128i smax3 = _mm_alignr_epi8( max2, max2, 2 ); |
| 3290 | __m128i smin3 = _mm_alignr_epi8( min2, min2, 2 ); |
| 3291 | __m128i max3 = _mm_max_epu8( max2, smax3 ); |
| 3292 | __m128i min3 = _mm_min_epu8( min2, smin3 ); |
| 3293 | __m128i smax4 = _mm_alignr_epi8( max3, max3, 1 ); |
| 3294 | __m128i smin4 = _mm_alignr_epi8( min3, min3, 1 ); |
| 3295 | __m128i max = _mm_max_epu8( max3, smax4 ); |
| 3296 | __m128i min = _mm_min_epu8( min3, smin4 ); |
| 3297 | __m128i max16 = _mm_unpacklo_epi8( max, _mm_setzero_si128() ); |
| 3298 | __m128i min16 = _mm_unpacklo_epi8( min, _mm_setzero_si128() ); |
| 3299 | |
| 3300 | // src range, mid |
| 3301 | __m128i srcRange = _mm_sub_epi16( max16, min16 ); |
| 3302 | __m128i srcRangeHalf = _mm_srli_epi16( srcRange, 1 ); |
| 3303 | __m128i srcMid = _mm_add_epi16( min16, srcRangeHalf ); |
| 3304 | |
// multiplier candidates: one 16-bit lane per g_alphaRange_SIMD entry, derived from the block's alpha range
| 3306 | __m128i mul1 = _mm_mulhi_epi16( srcRange, g_alphaRange_SIMD ); |
| 3307 | __m128i mul = _mm_add_epi16( mul1, _mm_set1_epi16( 1 ) ); |
| 3308 | |
| 3309 | // wide source |
| 3310 | __m128i s16_1 = _mm_shuffle_epi32( s, _MM_SHUFFLE( 3, 2, 3, 2 ) ); |
| 3311 | __m128i s16[2] = { _mm_unpacklo_epi8( s, _mm_setzero_si128() ), _mm_unpacklo_epi8( s16_1, _mm_setzero_si128() ) }; |
| 3312 | |
| 3313 | __m128i sr[16] = { |
| 3314 | Widen<0>( s16[0] ), |
| 3315 | Widen<1>( s16[0] ), |
| 3316 | Widen<2>( s16[0] ), |
| 3317 | Widen<3>( s16[0] ), |
| 3318 | Widen<4>( s16[0] ), |
| 3319 | Widen<5>( s16[0] ), |
| 3320 | Widen<6>( s16[0] ), |
| 3321 | Widen<7>( s16[0] ), |
| 3322 | Widen<0>( s16[1] ), |
| 3323 | Widen<1>( s16[1] ), |
| 3324 | Widen<2>( s16[1] ), |
| 3325 | Widen<3>( s16[1] ), |
| 3326 | Widen<4>( s16[1] ), |
| 3327 | Widen<5>( s16[1] ), |
| 3328 | Widen<6>( s16[1] ), |
| 3329 | Widen<7>( s16[1] ) |
| 3330 | }; |
| 3331 | |
| 3332 | #ifdef __AVX2__ |
| 3333 | __m256i srcRangeWide = _mm256_broadcastsi128_si256( srcRange ); |
| 3334 | __m256i srcMidWide = _mm256_broadcastsi128_si256( srcMid ); |
| 3335 | |
| 3336 | __m256i mulWide1 = _mm256_mulhi_epi16( srcRangeWide, g_alphaRange_AVX ); |
| 3337 | __m256i mulWide = _mm256_add_epi16( mulWide1, _mm256_set1_epi16( 1 ) ); |
| 3338 | |
| 3339 | __m256i modMul[8] = { |
| 3340 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[0] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[0] ) ) ), _mm256_setzero_si256() ), |
| 3341 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[1] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[1] ) ) ), _mm256_setzero_si256() ), |
| 3342 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[2] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[2] ) ) ), _mm256_setzero_si256() ), |
| 3343 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[3] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[3] ) ) ), _mm256_setzero_si256() ), |
| 3344 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[4] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[4] ) ) ), _mm256_setzero_si256() ), |
| 3345 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[5] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[5] ) ) ), _mm256_setzero_si256() ), |
| 3346 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[6] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[6] ) ) ), _mm256_setzero_si256() ), |
| 3347 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[7] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[7] ) ) ), _mm256_setzero_si256() ), |
| 3348 | }; |
| 3349 | |
| 3350 | // find selector |
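// one 16-bit lane per range/table candidate: for every pixel take the squared distance to the
// closest of the 8 reconstruction values in modMul[], and accumulate the per-candidate totals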
| 3351 | __m256i mulErr = _mm256_setzero_si256(); |
| 3352 | for( int j=0; j<16; j++ ) |
| 3353 | { |
| 3354 | __m256i s16Wide = _mm256_broadcastsi128_si256( sr[j] ); |
| 3355 | __m256i err1, err2; |
| 3356 | |
| 3357 | err1 = _mm256_sub_epi16( s16Wide, modMul[0] ); |
| 3358 | __m256i localErr = _mm256_mullo_epi16( err1, err1 ); |
| 3359 | |
| 3360 | err1 = _mm256_sub_epi16( s16Wide, modMul[1] ); |
| 3361 | err2 = _mm256_mullo_epi16( err1, err1 ); |
| 3362 | localErr = _mm256_min_epu16( localErr, err2 ); |
| 3363 | |
| 3364 | err1 = _mm256_sub_epi16( s16Wide, modMul[2] ); |
| 3365 | err2 = _mm256_mullo_epi16( err1, err1 ); |
| 3366 | localErr = _mm256_min_epu16( localErr, err2 ); |
| 3367 | |
| 3368 | err1 = _mm256_sub_epi16( s16Wide, modMul[3] ); |
| 3369 | err2 = _mm256_mullo_epi16( err1, err1 ); |
| 3370 | localErr = _mm256_min_epu16( localErr, err2 ); |
| 3371 | |
| 3372 | err1 = _mm256_sub_epi16( s16Wide, modMul[4] ); |
| 3373 | err2 = _mm256_mullo_epi16( err1, err1 ); |
| 3374 | localErr = _mm256_min_epu16( localErr, err2 ); |
| 3375 | |
| 3376 | err1 = _mm256_sub_epi16( s16Wide, modMul[5] ); |
| 3377 | err2 = _mm256_mullo_epi16( err1, err1 ); |
| 3378 | localErr = _mm256_min_epu16( localErr, err2 ); |
| 3379 | |
| 3380 | err1 = _mm256_sub_epi16( s16Wide, modMul[6] ); |
| 3381 | err2 = _mm256_mullo_epi16( err1, err1 ); |
| 3382 | localErr = _mm256_min_epu16( localErr, err2 ); |
| 3383 | |
| 3384 | err1 = _mm256_sub_epi16( s16Wide, modMul[7] ); |
| 3385 | err2 = _mm256_mullo_epi16( err1, err1 ); |
| 3386 | localErr = _mm256_min_epu16( localErr, err2 ); |
| 3387 | |
// note: _mm256_adds_epu16 saturates at 0xFFFF rather than wrapping; since we only compare totals to find the smallest error, clamping is harmless
| 3389 | mulErr = _mm256_adds_epu16( mulErr, localErr ); |
| 3390 | } |
| 3391 | uint64_t minPos1 = _mm_cvtsi128_si64( _mm_minpos_epu16( _mm256_castsi256_si128( mulErr ) ) ); |
| 3392 | uint64_t minPos2 = _mm_cvtsi128_si64( _mm_minpos_epu16( _mm256_extracti128_si256( mulErr, 1 ) ) ); |
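// each minpos result packs (min error, lane index); pick the half with the smaller error, offsetting the upper half by 8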
| 3393 | int sel = ( ( minPos1 & 0xFFFF ) < ( minPos2 & 0xFFFF ) ) ? ( minPos1 >> 16 ) : ( 8 + ( minPos2 >> 16 ) ); |
| 3394 | |
| 3395 | __m128i recVal16; |
| 3396 | switch( sel ) |
| 3397 | { |
| 3398 | case 0: |
| 3399 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ) ), _mm_setzero_si128() ); |
| 3400 | break; |
| 3401 | case 1: |
| 3402 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ) ), _mm_setzero_si128() ); |
| 3403 | break; |
| 3404 | case 2: |
| 3405 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ) ), _mm_setzero_si128() ); |
| 3406 | break; |
| 3407 | case 3: |
| 3408 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ) ), _mm_setzero_si128() ); |
| 3409 | break; |
| 3410 | case 4: |
| 3411 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ) ), _mm_setzero_si128() ); |
| 3412 | break; |
| 3413 | case 5: |
| 3414 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ) ), _mm_setzero_si128() ); |
| 3415 | break; |
| 3416 | case 6: |
| 3417 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ) ), _mm_setzero_si128() ); |
| 3418 | break; |
| 3419 | case 7: |
| 3420 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ) ), _mm_setzero_si128() ); |
| 3421 | break; |
| 3422 | case 8: |
| 3423 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ) ), _mm_setzero_si128() ); |
| 3424 | break; |
| 3425 | case 9: |
| 3426 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ) ), _mm_setzero_si128() ); |
| 3427 | break; |
| 3428 | case 10: |
| 3429 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ) ), _mm_setzero_si128() ); |
| 3430 | break; |
| 3431 | case 11: |
| 3432 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ) ), _mm_setzero_si128() ); |
| 3433 | break; |
| 3434 | case 12: |
| 3435 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ) ), _mm_setzero_si128() ); |
| 3436 | break; |
| 3437 | case 13: |
| 3438 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ) ), _mm_setzero_si128() ); |
| 3439 | break; |
| 3440 | case 14: |
| 3441 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ) ), _mm_setzero_si128() ); |
| 3442 | break; |
| 3443 | case 15: |
| 3444 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ) ), _mm_setzero_si128() ); |
| 3445 | break; |
| 3446 | default: |
| 3447 | assert( false ); |
| 3448 | break; |
| 3449 | } |
| 3450 | #else |
| 3451 | // wide multiplier |
| 3452 | __m128i rangeMul[16] = { |
| 3453 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ) ), _mm_setzero_si128() ), |
| 3454 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ) ), _mm_setzero_si128() ), |
| 3455 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ) ), _mm_setzero_si128() ), |
| 3456 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ) ), _mm_setzero_si128() ), |
| 3457 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ) ), _mm_setzero_si128() ), |
| 3458 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ) ), _mm_setzero_si128() ), |
| 3459 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ) ), _mm_setzero_si128() ), |
| 3460 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ) ), _mm_setzero_si128() ), |
| 3461 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ) ), _mm_setzero_si128() ), |
| 3462 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ) ), _mm_setzero_si128() ), |
| 3463 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ) ), _mm_setzero_si128() ), |
| 3464 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ) ), _mm_setzero_si128() ), |
| 3465 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ) ), _mm_setzero_si128() ), |
| 3466 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ) ), _mm_setzero_si128() ), |
| 3467 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ) ), _mm_setzero_si128() ), |
| 3468 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ) ), _mm_setzero_si128() ) |
| 3469 | }; |
| 3470 | |
| 3471 | // find selector |
| 3472 | int err = std::numeric_limits<int>::max(); |
| 3473 | int sel; |
| 3474 | for( int r=0; r<16; r++ ) |
| 3475 | { |
| 3476 | __m128i err1, err2, minerr; |
| 3477 | __m128i recVal16 = rangeMul[r]; |
| 3478 | int rangeErr; |
| 3479 | |
| 3480 | err1 = _mm_sub_epi16( sr[0], recVal16 ); |
| 3481 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3482 | minerr = _mm_minpos_epu16( err2 ); |
| 3483 | rangeErr = _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3484 | |
| 3485 | err1 = _mm_sub_epi16( sr[1], recVal16 ); |
| 3486 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3487 | minerr = _mm_minpos_epu16( err2 ); |
| 3488 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3489 | |
| 3490 | err1 = _mm_sub_epi16( sr[2], recVal16 ); |
| 3491 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3492 | minerr = _mm_minpos_epu16( err2 ); |
| 3493 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3494 | |
| 3495 | err1 = _mm_sub_epi16( sr[3], recVal16 ); |
| 3496 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3497 | minerr = _mm_minpos_epu16( err2 ); |
| 3498 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3499 | |
| 3500 | err1 = _mm_sub_epi16( sr[4], recVal16 ); |
| 3501 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3502 | minerr = _mm_minpos_epu16( err2 ); |
| 3503 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3504 | |
| 3505 | err1 = _mm_sub_epi16( sr[5], recVal16 ); |
| 3506 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3507 | minerr = _mm_minpos_epu16( err2 ); |
| 3508 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3509 | |
| 3510 | err1 = _mm_sub_epi16( sr[6], recVal16 ); |
| 3511 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3512 | minerr = _mm_minpos_epu16( err2 ); |
| 3513 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3514 | |
| 3515 | err1 = _mm_sub_epi16( sr[7], recVal16 ); |
| 3516 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3517 | minerr = _mm_minpos_epu16( err2 ); |
| 3518 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3519 | |
| 3520 | err1 = _mm_sub_epi16( sr[8], recVal16 ); |
| 3521 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3522 | minerr = _mm_minpos_epu16( err2 ); |
| 3523 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3524 | |
| 3525 | err1 = _mm_sub_epi16( sr[9], recVal16 ); |
| 3526 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3527 | minerr = _mm_minpos_epu16( err2 ); |
| 3528 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3529 | |
| 3530 | err1 = _mm_sub_epi16( sr[10], recVal16 ); |
| 3531 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3532 | minerr = _mm_minpos_epu16( err2 ); |
| 3533 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3534 | |
| 3535 | err1 = _mm_sub_epi16( sr[11], recVal16 ); |
| 3536 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3537 | minerr = _mm_minpos_epu16( err2 ); |
| 3538 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3539 | |
| 3540 | err1 = _mm_sub_epi16( sr[12], recVal16 ); |
| 3541 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3542 | minerr = _mm_minpos_epu16( err2 ); |
| 3543 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3544 | |
| 3545 | err1 = _mm_sub_epi16( sr[13], recVal16 ); |
| 3546 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3547 | minerr = _mm_minpos_epu16( err2 ); |
| 3548 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3549 | |
| 3550 | err1 = _mm_sub_epi16( sr[14], recVal16 ); |
| 3551 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3552 | minerr = _mm_minpos_epu16( err2 ); |
| 3553 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3554 | |
| 3555 | err1 = _mm_sub_epi16( sr[15], recVal16 ); |
| 3556 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3557 | minerr = _mm_minpos_epu16( err2 ); |
| 3558 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3559 | |
| 3560 | if( rangeErr < err ) |
| 3561 | { |
| 3562 | err = rangeErr; |
| 3563 | sel = r; |
| 3564 | if( err == 0 ) break; |
| 3565 | } |
| 3566 | } |
| 3567 | |
| 3568 | __m128i recVal16 = rangeMul[sel]; |
| 3569 | #endif |
| 3570 | |
| 3571 | // find indices |
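// per pixel, minpos against the chosen reconstruction vector yields the 3-bit selector,
// packed MSB-first (pixel 0 occupies bits 47..45, pixel 15 bits 2..0)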
| 3572 | __m128i err1, err2, minerr; |
| 3573 | uint64_t idx = 0, tmp; |
| 3574 | |
| 3575 | err1 = _mm_sub_epi16( sr[0], recVal16 ); |
| 3576 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3577 | minerr = _mm_minpos_epu16( err2 ); |
| 3578 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3579 | idx |= ( tmp >> 16 ) << 15*3; |
| 3580 | |
| 3581 | err1 = _mm_sub_epi16( sr[1], recVal16 ); |
| 3582 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3583 | minerr = _mm_minpos_epu16( err2 ); |
| 3584 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3585 | idx |= ( tmp >> 16 ) << 14*3; |
| 3586 | |
| 3587 | err1 = _mm_sub_epi16( sr[2], recVal16 ); |
| 3588 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3589 | minerr = _mm_minpos_epu16( err2 ); |
| 3590 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3591 | idx |= ( tmp >> 16 ) << 13*3; |
| 3592 | |
| 3593 | err1 = _mm_sub_epi16( sr[3], recVal16 ); |
| 3594 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3595 | minerr = _mm_minpos_epu16( err2 ); |
| 3596 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3597 | idx |= ( tmp >> 16 ) << 12*3; |
| 3598 | |
| 3599 | err1 = _mm_sub_epi16( sr[4], recVal16 ); |
| 3600 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3601 | minerr = _mm_minpos_epu16( err2 ); |
| 3602 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3603 | idx |= ( tmp >> 16 ) << 11*3; |
| 3604 | |
| 3605 | err1 = _mm_sub_epi16( sr[5], recVal16 ); |
| 3606 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3607 | minerr = _mm_minpos_epu16( err2 ); |
| 3608 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3609 | idx |= ( tmp >> 16 ) << 10*3; |
| 3610 | |
| 3611 | err1 = _mm_sub_epi16( sr[6], recVal16 ); |
| 3612 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3613 | minerr = _mm_minpos_epu16( err2 ); |
| 3614 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3615 | idx |= ( tmp >> 16 ) << 9*3; |
| 3616 | |
| 3617 | err1 = _mm_sub_epi16( sr[7], recVal16 ); |
| 3618 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3619 | minerr = _mm_minpos_epu16( err2 ); |
| 3620 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3621 | idx |= ( tmp >> 16 ) << 8*3; |
| 3622 | |
| 3623 | err1 = _mm_sub_epi16( sr[8], recVal16 ); |
| 3624 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3625 | minerr = _mm_minpos_epu16( err2 ); |
| 3626 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3627 | idx |= ( tmp >> 16 ) << 7*3; |
| 3628 | |
| 3629 | err1 = _mm_sub_epi16( sr[9], recVal16 ); |
| 3630 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3631 | minerr = _mm_minpos_epu16( err2 ); |
| 3632 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3633 | idx |= ( tmp >> 16 ) << 6*3; |
| 3634 | |
| 3635 | err1 = _mm_sub_epi16( sr[10], recVal16 ); |
| 3636 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3637 | minerr = _mm_minpos_epu16( err2 ); |
| 3638 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3639 | idx |= ( tmp >> 16 ) << 5*3; |
| 3640 | |
| 3641 | err1 = _mm_sub_epi16( sr[11], recVal16 ); |
| 3642 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3643 | minerr = _mm_minpos_epu16( err2 ); |
| 3644 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3645 | idx |= ( tmp >> 16 ) << 4*3; |
| 3646 | |
| 3647 | err1 = _mm_sub_epi16( sr[12], recVal16 ); |
| 3648 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3649 | minerr = _mm_minpos_epu16( err2 ); |
| 3650 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3651 | idx |= ( tmp >> 16 ) << 3*3; |
| 3652 | |
| 3653 | err1 = _mm_sub_epi16( sr[13], recVal16 ); |
| 3654 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3655 | minerr = _mm_minpos_epu16( err2 ); |
| 3656 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3657 | idx |= ( tmp >> 16 ) << 2*3; |
| 3658 | |
| 3659 | err1 = _mm_sub_epi16( sr[14], recVal16 ); |
| 3660 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3661 | minerr = _mm_minpos_epu16( err2 ); |
| 3662 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3663 | idx |= ( tmp >> 16 ) << 1*3; |
| 3664 | |
| 3665 | err1 = _mm_sub_epi16( sr[15], recVal16 ); |
| 3666 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3667 | minerr = _mm_minpos_epu16( err2 ); |
| 3668 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3669 | idx |= ( tmp >> 16 ) << 0*3; |
| 3670 | |
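| | // assemble the 64-bit EAC alpha block: base value (the midpoint) in bits 56..63, multiplier in bits 52..55, |
| | // table selector in bits 48..51, sixteen 3-bit pixel indices in bits 0..47; the block is stored big-endian, |
| | // hence the final byte swap |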
| 3671 | uint16_t rm[8]; |
| 3672 | _mm_storeu_si128( (__m128i*)rm, mul ); |
| 3673 | uint16_t sm = (uint16_t)_mm_cvtsi128_si64( srcMid ); |
| 3674 | |
| 3675 | uint64_t d = ( uint64_t( sm ) << 56 ) | |
| 3676 | ( uint64_t( rm[GetMulSel( sel )] ) << 52 ) | |
| 3677 | ( uint64_t( sel ) << 48 ) | |
| 3678 | idx; |
| 3679 | |
| 3680 | return _bswap64( d ); |
| 3681 | #elif defined __ARM_NEON |
| 3682 | |
| 3683 | int16x8_t srcMidWide, multipliers; |
| 3684 | int srcMid; |
| 3685 | uint8x16_t srcAlphaBlock = vld1q_u8( src ); |
| 3686 | { |
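| | // fast path for blocks with a single alpha value |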
| 3687 | uint8_t ref = src[0]; |
| 3688 | uint8x16_t a0 = vdupq_n_u8( ref ); |
| 3689 | uint8x16_t r = vceqq_u8( srcAlphaBlock, a0 ); |
| 3690 | int64x2_t m = vreinterpretq_s64_u8( r ); |
| 3691 | if( m[0] == -1 && m[1] == -1 ) |
| 3692 | return ref; |
| 3693 | |
| 3694 | // compute source alpha range, midpoint, and the per-table multipliers |
| 3695 | #ifdef __aarch64__ |
| 3696 | uint8_t min = vminvq_u8( srcAlphaBlock ); |
| 3697 | uint8_t max = vmaxvq_u8( srcAlphaBlock ); |
| 3698 | uint8_t srcRange = max - min; |
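| | // fixed-point multipliers: vqdmulhq gives (g_alphaRange * srcRange * 2) >> 16, so one more shift right |
| | // plus 1 matches the scalar formula ((srcRange * g_alphaRange[r]) >> 16) + 1 used in the fallback path |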
| 3699 | multipliers = vqaddq_s16( vshrq_n_s16( vqdmulhq_n_s16( g_alphaRange_NEON, srcRange ), 1 ), vdupq_n_s16( 1 ) ); |
| 3700 | srcMid = min + srcRange / 2; |
| 3701 | srcMidWide = vdupq_n_s16( srcMid ); |
| 3702 | #else |
| 3703 | uint8x8_t vmin = vpmin_u8( vget_low_u8( srcAlphaBlock ), vget_high_u8( srcAlphaBlock ) ); |
| 3704 | vmin = vpmin_u8( vmin, vmin ); |
| 3705 | vmin = vpmin_u8( vmin, vmin ); |
| 3706 | vmin = vpmin_u8( vmin, vmin ); |
| 3707 | uint8x8_t vmax = vpmax_u8( vget_low_u8( srcAlphaBlock ), vget_high_u8( srcAlphaBlock ) ); |
| 3708 | vmax = vpmax_u8( vmax, vmax ); |
| 3709 | vmax = vpmax_u8( vmax, vmax ); |
| 3710 | vmax = vpmax_u8( vmax, vmax ); |
| 3711 | |
| 3712 | int16x8_t srcRangeWide = vreinterpretq_s16_u16( vsubl_u8( vmax, vmin ) ); |
| 3713 | multipliers = vqaddq_s16( vshrq_n_s16( vqdmulhq_s16( g_alphaRange_NEON, srcRangeWide ), 1 ), vdupq_n_s16( 1 ) ); |
| 3714 | srcMidWide = vsraq_n_s16( vreinterpretq_s16_u16( vmovl_u8( vmin ) ), srcRangeWide, 1 ); |
| 3715 | srcMid = vgetq_lane_s16( srcMidWide, 0 ); |
| 3716 | #endif |
| 3717 | } |
| 3718 | |
| 3719 | // calculate reconstructed values |
| 3720 | #define EAC_APPLY_16X( m ) m( 0 ) m( 1 ) m( 2 ) m( 3 ) m( 4 ) m( 5 ) m( 6 ) m( 7 ) m( 8 ) m( 9 ) m( 10 ) m( 11 ) m( 12 ) m( 13 ) m( 14 ) m( 15 ) |
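| | // EAC_APPLY_16X expands its argument macro for indices 0..15, fully unrolling the per-row/per-pixel work below |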
| 3721 | |
| 3722 | #define EAC_RECONSTRUCT_VALUE( n ) vqmovun_s16( vmlaq_s16( srcMidWide, g_alpha_NEON[n], WidenMultiplier_EAC_NEON<n>( multipliers ) ) ), |
| 3723 | uint8x8_t recVals[16] = { EAC_APPLY_16X( EAC_RECONSTRUCT_VALUE ) }; |
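| | // recVals[r] holds the eight reconstructed (saturated to u8) alpha candidates for table row r |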
| 3724 | |
| 3725 | // find selector |
| 3726 | int err = std::numeric_limits<int>::max(); |
| 3727 | int sel = 0; |
| 3728 | for( int r = 0; r < 16; r++ ) |
| 3729 | { |
| 3730 | uint8x8_t recVal = recVals[r]; |
| 3731 | |
| 3732 | int rangeErr = 0; |
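| | // for every source pixel, accumulate the smallest squared error against this row's eight candidates |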
| 3733 | #define EAC_ACCUMULATE_ERROR( n ) rangeErr += MinError_EAC_NEON( ErrorProbe_EAC_NEON<n>( recVal, srcAlphaBlock ) ); |
| 3734 | EAC_APPLY_16X( EAC_ACCUMULATE_ERROR ) |
| 3735 | |
| 3736 | if( rangeErr < err ) |
| 3737 | { |
| 3738 | err = rangeErr; |
| 3739 | sel = r; |
| 3740 | if( err == 0 ) break; |
| 3741 | } |
| 3742 | } |
| 3743 | |
| 3744 | // combine results |
| 3745 | uint64_t d = ( uint64_t( srcMid ) << 56 ) | |
| 3746 | ( uint64_t( multipliers[GetMulSel( sel )] ) << 52 ) | |
| 3747 | ( uint64_t( sel ) << 48); |
| 3748 | |
| 3749 | // generate indices |
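| | // each MinErrorIndex_EAC_NEON<n> ORs pixel n's best 3-bit selector into its slot of the index field |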
| 3750 | uint8x8_t recVal = recVals[sel]; |
| 3751 | #define EAC_INSERT_INDEX(n) d |= MinErrorIndex_EAC_NEON<n>( recVal, srcAlphaBlock ); |
| 3752 | EAC_APPLY_16X( EAC_INSERT_INDEX ) |
| 3753 | |
| 3754 | return _bswap64( d ); |
| 3755 | |
| 3756 | #undef EAC_APPLY_16X |
| 3757 | #undef EAC_INSERT_INDEX |
| 3758 | #undef EAC_ACCUMULATE_ERROR |
| 3759 | #undef EAC_RECONSTRUCT_VALUE |
| 3760 | |
| 3761 | #else |
| 3762 | { |
| 3763 | bool solid = true; |
| 3764 | const uint8_t* ptr = src + 1; |
| 3765 | const uint8_t ref = *src; |
| 3766 | for( int i=1; i<16; i++ ) |
| 3767 | { |
| 3768 | if( ref != *ptr++ ) |
| 3769 | { |
| 3770 | solid = false; |
| 3771 | break; |
| 3772 | } |
| 3773 | } |
| 3774 | if( solid ) |
| 3775 | { |
| 3776 | return ref; |
| 3777 | } |
| 3778 | } |
| 3779 | |
| 3780 | uint8_t min = src[0]; |
| 3781 | uint8_t max = src[0]; |
| 3782 | for( int i=1; i<16; i++ ) |
| 3783 | { |
| 3784 | if( min > src[i] ) min = src[i]; |
| 3785 | else if( max < src[i] ) max = src[i]; |
| 3786 | } |
| 3787 | int srcRange = max - min; |
| 3788 | int srcMid = min + srcRange / 2; |
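| | // scalar fallback: try all 16 multiplier/table rows, reconstructing each pixel around the midpoint and |
| | // keeping the row with the lowest total squared error; buf[r][i] caches pixel i's best index for row r |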
| 3789 | |
| 3790 | uint8_t buf[16][16]; |
| 3791 | int err = std::numeric_limits<int>::max(); |
| 3792 | int sel = 0; |
| 3793 | int selmul = 0; |
| 3794 | for( int r=0; r<16; r++ ) |
| 3795 | { |
| 3796 | int mul = ( ( srcRange * g_alphaRange[r] ) >> 16 ) + 1; |
| 3797 | |
| 3798 | int rangeErr = 0; |
| 3799 | for( int i=0; i<16; i++ ) |
| 3800 | { |
| 3801 | const auto srcVal = src[i]; |
| 3802 | |
| 3803 | int idx = 0; |
| 3804 | const auto modVal = g_alpha[r][0] * mul; |
| 3805 | const auto recVal = clampu8( srcMid + modVal ); |
| 3806 | int localErr = sq( srcVal - recVal ); |
| 3807 | |
| 3808 | if( localErr != 0 ) |
| 3809 | { |
| 3810 | for( int j=1; j<8; j++ ) |
| 3811 | { |
| 3812 | const auto modVal = g_alpha[r][j] * mul; |
| 3813 | const auto recVal = clampu8( srcMid + modVal ); |
| 3814 | const auto errProbe = sq( srcVal - recVal ); |
| 3815 | if( errProbe < localErr ) |
| 3816 | { |
| 3817 | localErr = errProbe; |
| 3818 | idx = j; |
| 3819 | } |
| 3820 | } |
| 3821 | } |
| 3822 | |
| 3823 | buf[r][i] = idx; |
| 3824 | rangeErr += localErr; |
| 3825 | } |
| 3826 | |
| 3827 | if( rangeErr < err ) |
| 3828 | { |
| 3829 | err = rangeErr; |
| 3830 | sel = r; |
| 3831 | selmul = mul; |
| 3832 | if( err == 0 ) break; |
| 3833 | } |
| 3834 | } |
| 3835 | |
| 3836 | uint64_t d = ( uint64_t( srcMid ) << 56 ) | |
| 3837 | ( uint64_t( selmul ) << 52 ) | |
| 3838 | ( uint64_t( sel ) << 48 ); |
| 3839 | |
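| | // emit the sixteen 3-bit selectors MSB-first: pixel 0 at bits 45..47 down to pixel 15 at bits 0..2 |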
| 3840 | int offset = 45; |
| 3841 | auto ptr = buf[sel]; |
| 3842 | for( int i=0; i<16; i++ ) |
| 3843 | { |
| 3844 | d |= uint64_t( *ptr++ ) << offset; |
| 3845 | offset -= 3; |
| 3846 | } |
| 3847 | |
| 3848 | return _bswap64( d ); |
| 3849 | #endif |
| 3850 | } |
| 3851 | |
| 3852 | |
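| | // Compresses the alpha channel of an RGBA image with ETC1: each 4x4 block's alpha bytes are replicated |
| | // into gray RGB values and run through the regular ETC1 block encoder. |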
| 3853 | void CompressEtc1Alpha( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ) |
| 3854 | { |
| 3855 | int w = 0; |
| 3856 | uint32_t buf[4*4]; |
| 3857 | do |
| 3858 | { |
| 3859 | #ifdef __SSE4_1__ |
| 3860 | __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); |
| 3861 | __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); |
| 3862 | __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); |
| 3863 | __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); |
| 3864 | |
| 3865 | _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); |
| 3866 | |
| 3867 | __m128i c0 = _mm_castps_si128( px0 ); |
| 3868 | __m128i c1 = _mm_castps_si128( px1 ); |
| 3869 | __m128i c2 = _mm_castps_si128( px2 ); |
| 3870 | __m128i c3 = _mm_castps_si128( px3 ); |
| 3871 | |
| 3872 | __m128i mask = _mm_setr_epi32( 0x03030303, 0x07070707, 0x0b0b0b0b, 0x0f0f0f0f ); |
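| | // replicate each pixel's alpha byte across its 32-bit lane, turning the block into grayscale input for the encoder |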
| 3873 | __m128i p0 = _mm_shuffle_epi8( c0, mask ); |
| 3874 | __m128i p1 = _mm_shuffle_epi8( c1, mask ); |
| 3875 | __m128i p2 = _mm_shuffle_epi8( c2, mask ); |
| 3876 | __m128i p3 = _mm_shuffle_epi8( c3, mask ); |
| 3877 | |
| 3878 | _mm_store_si128( (__m128i*)(buf + 0), p0 ); |
| 3879 | _mm_store_si128( (__m128i*)(buf + 4), p1 ); |
| 3880 | _mm_store_si128( (__m128i*)(buf + 8), p2 ); |
| 3881 | _mm_store_si128( (__m128i*)(buf + 12), p3 ); |
| 3882 | |
| 3883 | src += 4; |
| 3884 | #else |
| 3885 | auto ptr = buf; |
| 3886 | for( int x=0; x<4; x++ ) |
| 3887 | { |
| 3888 | unsigned int a = *src >> 24; |
| 3889 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
| 3890 | src += width; |
| 3891 | a = *src >> 24; |
| 3892 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
| 3893 | src += width; |
| 3894 | a = *src >> 24; |
| 3895 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
| 3896 | src += width; |
| 3897 | a = *src >> 24; |
| 3898 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
| 3899 | src -= width * 3 - 1; |
| 3900 | } |
| 3901 | #endif |
| 3902 | if( ++w == width/4 ) |
| 3903 | { |
| 3904 | src += width * 3; |
| 3905 | w = 0; |
| 3906 | } |
| 3907 | *dst++ = ProcessRGB( (uint8_t*)buf ); |
| 3908 | } |
| 3909 | while( --blocks ); |
| 3910 | } |
| 3911 | |
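| | // Alpha-as-gray variant for ETC2: identical block gathering to CompressEtc1Alpha, but blocks are encoded with ProcessRGB_ETC2. |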
| 3912 | void CompressEtc2Alpha( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics ) |
| 3913 | { |
| 3914 | int w = 0; |
| 3915 | uint32_t buf[4*4]; |
| 3916 | do |
| 3917 | { |
| 3918 | #ifdef __SSE4_1__ |
| 3919 | __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); |
| 3920 | __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); |
| 3921 | __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); |
| 3922 | __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); |
| 3923 | |
| 3924 | _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); |
| 3925 | |
| 3926 | __m128i c0 = _mm_castps_si128( px0 ); |
| 3927 | __m128i c1 = _mm_castps_si128( px1 ); |
| 3928 | __m128i c2 = _mm_castps_si128( px2 ); |
| 3929 | __m128i c3 = _mm_castps_si128( px3 ); |
| 3930 | |
| 3931 | __m128i mask = _mm_setr_epi32( 0x03030303, 0x07070707, 0x0b0b0b0b, 0x0f0f0f0f ); |
| 3932 | __m128i p0 = _mm_shuffle_epi8( c0, mask ); |
| 3933 | __m128i p1 = _mm_shuffle_epi8( c1, mask ); |
| 3934 | __m128i p2 = _mm_shuffle_epi8( c2, mask ); |
| 3935 | __m128i p3 = _mm_shuffle_epi8( c3, mask ); |
| 3936 | |
| 3937 | _mm_store_si128( (__m128i*)(buf + 0), p0 ); |
| 3938 | _mm_store_si128( (__m128i*)(buf + 4), p1 ); |
| 3939 | _mm_store_si128( (__m128i*)(buf + 8), p2 ); |
| 3940 | _mm_store_si128( (__m128i*)(buf + 12), p3 ); |
| 3941 | |
| 3942 | src += 4; |
| 3943 | #else |
| 3944 | auto ptr = buf; |
| 3945 | for( int x=0; x<4; x++ ) |
| 3946 | { |
| 3947 | unsigned int a = *src >> 24; |
| 3948 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
| 3949 | src += width; |
| 3950 | a = *src >> 24; |
| 3951 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
| 3952 | src += width; |
| 3953 | a = *src >> 24; |
| 3954 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
| 3955 | src += width; |
| 3956 | a = *src >> 24; |
| 3957 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
| 3958 | src -= width * 3 - 1; |
| 3959 | } |
| 3960 | #endif |
| 3961 | if( ++w == width/4 ) |
| 3962 | { |
| 3963 | src += width * 3; |
| 3964 | w = 0; |
| 3965 | } |
| 3966 | *dst++ = ProcessRGB_ETC2( (uint8_t*)buf, useHeuristics ); |
| 3967 | } |
| 3968 | while( --blocks ); |
| 3969 | } |
| 3970 | |
| 3973 | |
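| | // The compressors below walk the image in 4x4 blocks, gathering each block column-major into a local |
| | // buffer (a transpose on the SSE4.1 path) before handing it to the per-block encoder; width is the image |
| | // width in pixels and is assumed to be a multiple of 4. |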
| 3974 | void CompressEtc1Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ) |
| 3975 | { |
| 3976 | int w = 0; |
| 3977 | uint32_t buf[4*4]; |
| 3978 | do |
| 3979 | { |
| 3980 | #ifdef __SSE4_1__ |
| 3981 | __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); |
| 3982 | __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); |
| 3983 | __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); |
| 3984 | __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); |
| 3985 | |
| 3986 | _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); |
| 3987 | |
| 3988 | _mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) ); |
| 3989 | _mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) ); |
| 3990 | _mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) ); |
| 3991 | _mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) ); |
| 3992 | |
| 3993 | src += 4; |
| 3994 | #else |
| 3995 | auto ptr = buf; |
| 3996 | for( int x=0; x<4; x++ ) |
| 3997 | { |
| 3998 | *ptr++ = *src; |
| 3999 | src += width; |
| 4000 | *ptr++ = *src; |
| 4001 | src += width; |
| 4002 | *ptr++ = *src; |
| 4003 | src += width; |
| 4004 | *ptr++ = *src; |
| 4005 | src -= width * 3 - 1; |
| 4006 | } |
| 4007 | #endif |
| 4008 | if( ++w == width/4 ) |
| 4009 | { |
| 4010 | src += width * 3; |
| 4011 | w = 0; |
| 4012 | } |
| 4013 | *dst++ = ProcessRGB( (uint8_t*)buf ); |
| 4014 | } |
| 4015 | while( --blocks ); |
| 4016 | } |
| 4017 | |
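| | // Same gathering as CompressEtc1Rgb, but each block is dithered in place (DitherAvx2 on AVX2, Dither otherwise) before ETC1 encoding. |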
| 4018 | void CompressEtc1RgbDither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ) |
| 4019 | { |
| 4020 | int w = 0; |
| 4021 | uint32_t buf[4*4]; |
| 4022 | do |
| 4023 | { |
| 4024 | #ifdef __SSE4_1__ |
| 4025 | __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); |
| 4026 | __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); |
| 4027 | __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); |
| 4028 | __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); |
| 4029 | |
| 4030 | _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); |
| 4031 | |
| 4032 | # ifdef __AVX2__ |
| 4033 | DitherAvx2( (uint8_t*)buf, _mm_castps_si128( px0 ), _mm_castps_si128( px1 ), _mm_castps_si128( px2 ), _mm_castps_si128( px3 ) ); |
| 4034 | # else |
| 4035 | _mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) ); |
| 4036 | _mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) ); |
| 4037 | _mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) ); |
| 4038 | _mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) ); |
| 4039 | |
| 4040 | Dither( (uint8_t*)buf ); |
| 4041 | # endif |
| 4042 | |
| 4043 | src += 4; |
| 4044 | #else |
| 4045 | auto ptr = buf; |
| 4046 | for( int x=0; x<4; x++ ) |
| 4047 | { |
| 4048 | *ptr++ = *src; |
| 4049 | src += width; |
| 4050 | *ptr++ = *src; |
| 4051 | src += width; |
| 4052 | *ptr++ = *src; |
| 4053 | src += width; |
| 4054 | *ptr++ = *src; |
| 4055 | src -= width * 3 - 1; |
| 4056 | } |
| 4057 | #endif |
| 4058 | if( ++w == width/4 ) |
| 4059 | { |
| 4060 | src += width * 3; |
| 4061 | w = 0; |
| 4062 | } |
| 4063 | *dst++ = ProcessRGB( (uint8_t*)buf ); |
| 4064 | } |
| 4065 | while( --blocks ); |
| 4066 | } |
| 4067 | |
| 4068 | void CompressEtc2Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics ) |
| 4069 | { |
| 4070 | int w = 0; |
| 4071 | uint32_t buf[4*4]; |
| 4072 | do |
| 4073 | { |
| 4074 | #ifdef __SSE4_1__ |
| 4075 | __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); |
| 4076 | __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); |
| 4077 | __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); |
| 4078 | __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); |
| 4079 | |
| 4080 | _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); |
| 4081 | |
| 4082 | _mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) ); |
| 4083 | _mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) ); |
| 4084 | _mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) ); |
| 4085 | _mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) ); |
| 4086 | |
| 4087 | src += 4; |
| 4088 | #else |
| 4089 | auto ptr = buf; |
| 4090 | for( int x=0; x<4; x++ ) |
| 4091 | { |
| 4092 | *ptr++ = *src; |
| 4093 | src += width; |
| 4094 | *ptr++ = *src; |
| 4095 | src += width; |
| 4096 | *ptr++ = *src; |
| 4097 | src += width; |
| 4098 | *ptr++ = *src; |
| 4099 | src -= width * 3 - 1; |
| 4100 | } |
| 4101 | #endif |
| 4102 | if( ++w == width/4 ) |
| 4103 | { |
| 4104 | src += width * 3; |
| 4105 | w = 0; |
| 4106 | } |
| 4107 | *dst++ = ProcessRGB_ETC2( (uint8_t*)buf, useHeuristics ); |
| 4108 | } |
| 4109 | while( --blocks ); |
| 4110 | } |
| 4111 | |
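| | // Interleaved RGBA output: every 4x4 block emits two 64-bit words, the EAC alpha block followed by the ETC2 color block. |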
| 4112 | void CompressEtc2Rgba( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics ) |
| 4113 | { |
| 4114 | int w = 0; |
| 4115 | uint32_t rgba[4*4]; |
| 4116 | uint8_t alpha[4*4]; |
| 4117 | do |
| 4118 | { |
| 4119 | #ifdef __SSE4_1__ |
| 4120 | __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); |
| 4121 | __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); |
| 4122 | __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); |
| 4123 | __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); |
| 4124 | |
| 4125 | _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); |
| 4126 | |
| 4127 | __m128i c0 = _mm_castps_si128( px0 ); |
| 4128 | __m128i c1 = _mm_castps_si128( px1 ); |
| 4129 | __m128i c2 = _mm_castps_si128( px2 ); |
| 4130 | __m128i c3 = _mm_castps_si128( px3 ); |
| 4131 | |
| 4132 | _mm_store_si128( (__m128i*)(rgba + 0), c0 ); |
| 4133 | _mm_store_si128( (__m128i*)(rgba + 4), c1 ); |
| 4134 | _mm_store_si128( (__m128i*)(rgba + 8), c2 ); |
| 4135 | _mm_store_si128( (__m128i*)(rgba + 12), c3 ); |
| 4136 | |
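| | // gather the alpha bytes: each shuffle extracts bytes 3,7,11,15 of one column register into a different |
| | // 32-bit lane (control bytes with the high bit set zero the rest), so OR-ing the four results forms the |
| | // 16-byte alpha block |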
| 4137 | __m128i mask = _mm_setr_epi32( 0x0f0b0703, -1, -1, -1 ); |
| 4138 | |
| 4139 | __m128i a0 = _mm_shuffle_epi8( c0, mask ); |
| 4140 | __m128i a1 = _mm_shuffle_epi8( c1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) ); |
| 4141 | __m128i a2 = _mm_shuffle_epi8( c2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) ); |
| 4142 | __m128i a3 = _mm_shuffle_epi8( c3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) ); |
| 4143 | |
| 4144 | __m128i s0 = _mm_or_si128( a0, a1 ); |
| 4145 | __m128i s1 = _mm_or_si128( a2, a3 ); |
| 4146 | __m128i s2 = _mm_or_si128( s0, s1 ); |
| 4147 | |
| 4148 | _mm_store_si128( (__m128i*)alpha, s2 ); |
| 4149 | |
| 4150 | src += 4; |
| 4151 | #else |
| 4152 | auto ptr = rgba; |
| 4153 | auto ptr8 = alpha; |
| 4154 | for( int x=0; x<4; x++ ) |
| 4155 | { |
| 4156 | auto v = *src; |
| 4157 | *ptr++ = v; |
| 4158 | *ptr8++ = v >> 24; |
| 4159 | src += width; |
| 4160 | v = *src; |
| 4161 | *ptr++ = v; |
| 4162 | *ptr8++ = v >> 24; |
| 4163 | src += width; |
| 4164 | v = *src; |
| 4165 | *ptr++ = v; |
| 4166 | *ptr8++ = v >> 24; |
| 4167 | src += width; |
| 4168 | v = *src; |
| 4169 | *ptr++ = v; |
| 4170 | *ptr8++ = v >> 24; |
| 4171 | src -= width * 3 - 1; |
| 4172 | } |
| 4173 | #endif |
| 4174 | if( ++w == width/4 ) |
| 4175 | { |
| 4176 | src += width * 3; |
| 4177 | w = 0; |
| 4178 | } |
| 4179 | *dst++ = ProcessAlpha_ETC2( alpha ); |
| 4180 | *dst++ = ProcessRGB_ETC2( (uint8_t*)rgba, useHeuristics ); |
| 4181 | } |
| 4182 | while( --blocks ); |
| 4183 | } |
| 4184 | |