1#include <array>
2#include <string.h>
3#include <limits>
4#ifdef __ARM_NEON
5# include <arm_neon.h>
6#endif
7
8#include "Dither.hpp"
9#include "ForceInline.hpp"
10#include "Math.hpp"
11#include "ProcessCommon.hpp"
12#include "ProcessRGB.hpp"
13#include "Tables.hpp"
14#include "Vector.hpp"
15#if defined __SSE4_1__ || defined __AVX2__ || defined _MSC_VER
16# ifdef _MSC_VER
17# include <intrin.h>
18# include <Windows.h>
19# define _bswap(x) _byteswap_ulong(x)
20# define _bswap64(x) _byteswap_uint64(x)
21# else
22# include <x86intrin.h>
23# endif
24#endif
25
26#ifndef _bswap
27# define _bswap(x) __builtin_bswap32(x)
28# define _bswap64(x) __builtin_bswap64(x)
29#endif
30
31static const uint32_t MaxError = 1065369600; // ((38+76+14) * 255)^2
32// common T-/H-mode table
33static uint8_t tableTH[8] = { 3, 6, 11, 16, 23, 32, 41, 64 };
34
35// thresholds for the early compression-mode decision scheme
36// default: 0.03, 0.09, and 0.38
37float ecmd_threshold[3] = { 0.03f, 0.09f, 0.38f };
38
39static const uint8_t ModeUndecided = 0;
40static const uint8_t ModePlanar = 0x1;
41static const uint8_t ModeTH = 0x2;
42
43const unsigned int R = 2;
44const unsigned int G = 1;
45const unsigned int B = 0;
46
47struct Luma
48{
49#ifdef __AVX2__
50 float max, min;
51 uint8_t minIdx = 255, maxIdx = 255;
52 __m128i luma8;
53#elif defined __ARM_NEON && defined __aarch64__
54 float max, min;
55 uint8_t minIdx = 255, maxIdx = 255;
56 uint8x16_t luma8;
57#else
58 uint8_t max = 0, min = 255, maxIdx = 0, minIdx = 0;
59 uint8_t val[16];
60#endif
61};
62
63#ifdef __AVX2__
64struct Plane
65{
66 uint64_t plane;
67 uint64_t error;
68 __m256i sum4;
69};
70#endif
71
72#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__)
73struct Channels
74{
75#ifdef __AVX2__
76 __m128i r8, g8, b8;
77#elif defined __ARM_NEON && defined __aarch64__
78 uint8x16x2_t r, g, b;
79#endif
80};
81#endif
82
83namespace
84{
85static etcpak_force_inline uint8_t clamp( uint8_t min, int16_t val, uint8_t max )
86{
87 return val < min ? min : ( val > max ? max : val );
88}
89
90static etcpak_force_inline uint8_t clampMin( uint8_t min, int16_t val )
91{
92 return val < min ? min : val;
93}
94
95static etcpak_force_inline uint8_t clampMax( int16_t val, uint8_t max )
96{
97 return val > max ? max : val;
98}
99
100// slightly faster than std::sort
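// Sorts arr1 in ascending order; arr2 is shifted in lockstep and receives the source index of each inserted value.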
101static void insertionSort( uint8_t* arr1, uint8_t* arr2 )
102{
103 for( uint8_t i = 1; i < 16; ++i )
104 {
105 uint8_t value = arr1[i];
106 uint8_t hole = i;
107
108 for( ; hole > 0 && value < arr1[hole - 1]; --hole )
109 {
110 arr1[hole] = arr1[hole - 1];
111 arr2[hole] = arr2[hole - 1];
112 }
113 arr1[hole] = value;
114 arr2[hole] = i;
115 }
116}
117
// Converts indices from |a0|a1|e0|e1|i0|i1|m0|m1|b0|b1|f0|f1|j0|j1|n0|n1|c0|c1|g0|g1|k0|k1|o0|o1|d0|d1|h0|h1|l0|l1|p0|p1|, previously used by the T- and H-modes,
// into |p0|o0|n0|m0|l0|k0|j0|i0|h0|g0|f0|e0|d0|c0|b0|a0|p1|o1|n1|m1|l1|k1|j1|i1|h1|g1|f1|e1|d1|c1|b1|a1|, which should be used for all modes.
120// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
121static etcpak_force_inline int indexConversion( int pixelIndices )
122{
123 int correctIndices = 0;
124 int LSB[4][4];
125 int MSB[4][4];
126 int shift = 0;
127 for( int y = 3; y >= 0; y-- )
128 {
129 for( int x = 3; x >= 0; x-- )
130 {
131 LSB[x][y] = ( pixelIndices >> shift ) & 1;
132 shift++;
133 MSB[x][y] = ( pixelIndices >> shift ) & 1;
134 shift++;
135 }
136 }
137 shift = 0;
138 for( int x = 0; x < 4; x++ )
139 {
140 for( int y = 0; y < 4; y++ )
141 {
142 correctIndices |= ( LSB[x][y] << shift );
143 correctIndices |= ( MSB[x][y] << ( 16 + shift ) );
144 shift++;
145 }
146 }
147 return correctIndices;
148}
149
150// Swapping two RGB-colors
151// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
152static etcpak_force_inline void swapColors( uint8_t( colors )[2][3] )
153{
154 uint8_t temp = colors[0][R];
155 colors[0][R] = colors[1][R];
156 colors[1][R] = temp;
157
158 temp = colors[0][G];
159 colors[0][G] = colors[1][G];
160 colors[1][G] = temp;
161
162 temp = colors[0][B];
163 colors[0][B] = colors[1][B];
164 colors[1][B] = temp;
165}
166
167
168// calculates quantized colors for T or H modes
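// Each component is reduced from 8 to 4 bits with rounding: 15 * ( c + 8 ) / 255 is equivalent to round( c / 17 ).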
169void compressColor( uint8_t( currColor )[2][3], uint8_t( quantColor )[2][3], bool t_mode )
170{
171 if( t_mode )
172 {
173 quantColor[0][R] = clampMax( 15 * ( currColor[0][R] + 8 ) / 255, 15 );
174 quantColor[0][G] = clampMax( 15 * ( currColor[0][G] + 8 ) / 255, 15 );
175 quantColor[0][B] = clampMax( 15 * ( currColor[0][B] + 8 ) / 255, 15 );
176 }
177 else // clamped to [1,14] to get a wider range
178 {
179 quantColor[0][R] = clamp( 1, 15 * ( currColor[0][R] + 8 ) / 255, 14 );
180 quantColor[0][G] = clamp( 1, 15 * ( currColor[0][G] + 8 ) / 255, 14 );
181 quantColor[0][B] = clamp( 1, 15 * ( currColor[0][B] + 8 ) / 255, 14 );
182 }
183
184 // clamped to [1,14] to get a wider range
185 quantColor[1][R] = clamp( 1, 15 * ( currColor[1][R] + 8 ) / 255, 14 );
186 quantColor[1][G] = clamp( 1, 15 * ( currColor[1][G] + 8 ) / 255, 14 );
187 quantColor[1][B] = clamp( 1, 15 * ( currColor[1][B] + 8 ) / 255, 14 );
188}
189
// The following three decoding functions come from ETCPACK v2.74 and are slightly modified.
191static etcpak_force_inline void decompressColor( uint8_t( colorsRGB444 )[2][3], uint8_t( colors )[2][3] )
192{
193 // The color should be retrieved as:
194 //
    // c = round(255/(2^r_bits-1))*comp_color
196 //
197 // This is similar to bit replication
198 //
    // Note -- this code only works for bit replication from 4 bits and up --- 3 bits would need
    // two copy operations.
201 colors[0][R] = ( colorsRGB444[0][R] << 4 ) | colorsRGB444[0][R];
202 colors[0][G] = ( colorsRGB444[0][G] << 4 ) | colorsRGB444[0][G];
203 colors[0][B] = ( colorsRGB444[0][B] << 4 ) | colorsRGB444[0][B];
204 colors[1][R] = ( colorsRGB444[1][R] << 4 ) | colorsRGB444[1][R];
205 colors[1][G] = ( colorsRGB444[1][G] << 4 ) | colorsRGB444[1][G];
206 colors[1][B] = ( colorsRGB444[1][B] << 4 ) | colorsRGB444[1][B];
207}
208
209// calculates the paint colors from the block colors
210// using a distance d and one of the H- or T-patterns.
211static void calculatePaintColors59T( uint8_t d, uint8_t( colors )[2][3], uint8_t( pColors )[4][3] )
212{
213 //////////////////////////////////////////////
214 //
215 // C3 C1 C4----C1---C2
216 // | | |
217 // | | |
218 // |-------| |
219 // | | |
220 // | | |
221 // C4 C2 C3
222 //
223 //////////////////////////////////////////////
224
225 // C4
226 pColors[3][R] = clampMin( 0, colors[1][R] - tableTH[d] );
227 pColors[3][G] = clampMin( 0, colors[1][G] - tableTH[d] );
228 pColors[3][B] = clampMin( 0, colors[1][B] - tableTH[d] );
229
230 // C3
231 pColors[0][R] = colors[0][R];
232 pColors[0][G] = colors[0][G];
233 pColors[0][B] = colors[0][B];
234 // C2
235 pColors[1][R] = clampMax( colors[1][R] + tableTH[d], 255 );
236 pColors[1][G] = clampMax( colors[1][G] + tableTH[d], 255 );
237 pColors[1][B] = clampMax( colors[1][B] + tableTH[d], 255 );
238 // C1
239 pColors[2][R] = colors[1][R];
240 pColors[2][G] = colors[1][G];
241 pColors[2][B] = colors[1][B];
242}
243
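// H-mode paint colors: both base colors are offset by +/-tableTH[d].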
244static void calculatePaintColors58H( uint8_t d, uint8_t( colors )[2][3], uint8_t( pColors )[4][3] )
245{
246 pColors[3][R] = clampMin( 0, colors[1][R] - tableTH[d] );
247 pColors[3][G] = clampMin( 0, colors[1][G] - tableTH[d] );
248 pColors[3][B] = clampMin( 0, colors[1][B] - tableTH[d] );
249
250 // C1
251 pColors[0][R] = clampMax( colors[0][R] + tableTH[d], 255 );
252 pColors[0][G] = clampMax( colors[0][G] + tableTH[d], 255 );
253 pColors[0][B] = clampMax( colors[0][B] + tableTH[d], 255 );
254 // C2
255 pColors[1][R] = clampMin( 0, colors[0][R] - tableTH[d] );
256 pColors[1][G] = clampMin( 0, colors[0][G] - tableTH[d] );
257 pColors[1][B] = clampMin( 0, colors[0][B] - tableTH[d] );
258 // C3
259 pColors[2][R] = clampMax( colors[1][R] + tableTH[d], 255 );
260 pColors[2][G] = clampMax( colors[1][G] + tableTH[d], 255 );
261 pColors[2][B] = clampMax( colors[1][B] + tableTH[d], 255 );
262}
263
264#if defined _MSC_VER && !defined __clang__
265static etcpak_force_inline unsigned long _bit_scan_forward( unsigned long mask )
266{
267 unsigned long ret;
268 _BitScanForward( &ret, mask );
269 return ret;
270}
271#endif
272
273typedef std::array<uint16_t, 4> v4i;
274
275#ifdef __AVX2__
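// Sums the color channels of the four 4x2/2x4 half-blocks of a 4x4 RGBA block (alpha is masked out).
// The per-channel 16-bit sums of the four half-blocks are returned in the same bottom/top/right/left order used by Average().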
276static etcpak_force_inline __m256i Sum4_AVX2( const uint8_t* data) noexcept
277{
278 __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0);
279 __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1);
280 __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2);
281 __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3);
282
283 __m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF));
284 __m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF));
285 __m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF));
286 __m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF));
287
288 __m256i t0 = _mm256_cvtepu8_epi16(dm0);
289 __m256i t1 = _mm256_cvtepu8_epi16(dm1);
290 __m256i t2 = _mm256_cvtepu8_epi16(dm2);
291 __m256i t3 = _mm256_cvtepu8_epi16(dm3);
292
293 __m256i sum0 = _mm256_add_epi16(t0, t1);
294 __m256i sum1 = _mm256_add_epi16(t2, t3);
295
296 __m256i s0 = _mm256_permute2x128_si256(sum0, sum1, (0) | (3 << 4)); // 0, 0, 3, 3
297 __m256i s1 = _mm256_permute2x128_si256(sum0, sum1, (1) | (2 << 4)); // 1, 1, 2, 2
298
299 __m256i s2 = _mm256_permute4x64_epi64(s0, _MM_SHUFFLE(1, 3, 0, 2));
300 __m256i s3 = _mm256_permute4x64_epi64(s0, _MM_SHUFFLE(0, 2, 1, 3));
301 __m256i s4 = _mm256_permute4x64_epi64(s1, _MM_SHUFFLE(3, 1, 0, 2));
302 __m256i s5 = _mm256_permute4x64_epi64(s1, _MM_SHUFFLE(2, 0, 1, 3));
303
304 __m256i sum5 = _mm256_add_epi16(s2, s3); // 3, 0, 3, 0
305 __m256i sum6 = _mm256_add_epi16(s4, s5); // 2, 1, 1, 2
    return _mm256_add_epi16(sum5, sum6); // 3+2, 0+1, 3+1, 0+2
307}
308
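// Converts the eight-pixel half-block sums into rounded averages: ( sum + 4 ) >> 3.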
309static etcpak_force_inline __m256i Average_AVX2( const __m256i data) noexcept
310{
311 __m256i a = _mm256_add_epi16(data, _mm256_set1_epi16(4));
312
313 return _mm256_srli_epi16(a, 3);
314}
315
316static etcpak_force_inline __m128i CalcErrorBlock_AVX2( const __m256i data, const v4i a[8]) noexcept
317{
318 //
319 __m256i a0 = _mm256_load_si256((__m256i*)a[0].data());
320 __m256i a1 = _mm256_load_si256((__m256i*)a[4].data());
321
322 // err = 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) );
323 __m256i a4 = _mm256_madd_epi16(a0, a0);
324 __m256i a5 = _mm256_madd_epi16(a1, a1);
325
326 __m256i a6 = _mm256_hadd_epi32(a4, a5);
327 __m256i a7 = _mm256_slli_epi32(a6, 3);
328
329 __m256i a8 = _mm256_add_epi32(a7, _mm256_set1_epi32(0x3FFFFFFF)); // Big value to prevent negative values, but small enough to prevent overflow
330
331 // average is not swapped
332 // err -= block[0] * 2 * average[0];
333 // err -= block[1] * 2 * average[1];
334 // err -= block[2] * 2 * average[2];
335 __m256i a2 = _mm256_slli_epi16(a0, 1);
336 __m256i a3 = _mm256_slli_epi16(a1, 1);
337 __m256i b0 = _mm256_madd_epi16(a2, data);
338 __m256i b1 = _mm256_madd_epi16(a3, data);
339
340 __m256i b2 = _mm256_hadd_epi32(b0, b1);
341 __m256i b3 = _mm256_sub_epi32(a8, b2);
342 __m256i b4 = _mm256_hadd_epi32(b3, b3);
343
344 __m256i b5 = _mm256_permutevar8x32_epi32(b4, _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0));
345
346 return _mm256_castsi256_si128(b5);
347}
348
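// Quantizes the four half-block averages: a[4..7] receive the 5-bit values used by differential mode
// (one color of each pair is re-expressed as a delta clamped to [-4,3] from the other), a[0..3] the
// 4-bit values used by individual mode; both are expanded back to the full 8-bit range.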
349static etcpak_force_inline void ProcessAverages_AVX2(const __m256i d, v4i a[8] ) noexcept
350{
351 __m256i t = _mm256_add_epi16(_mm256_mullo_epi16(d, _mm256_set1_epi16(31)), _mm256_set1_epi16(128));
352
353 __m256i c = _mm256_srli_epi16(_mm256_add_epi16(t, _mm256_srli_epi16(t, 8)), 8);
354
355 __m256i c1 = _mm256_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2));
356 __m256i diff = _mm256_sub_epi16(c, c1);
357 diff = _mm256_max_epi16(diff, _mm256_set1_epi16(-4));
358 diff = _mm256_min_epi16(diff, _mm256_set1_epi16(3));
359
360 __m256i co = _mm256_add_epi16(c1, diff);
361
362 c = _mm256_blend_epi16(co, c, 0xF0);
363
364 __m256i a0 = _mm256_or_si256(_mm256_slli_epi16(c, 3), _mm256_srli_epi16(c, 2));
365
366 _mm256_store_si256((__m256i*)a[4].data(), a0);
367
368 __m256i t0 = _mm256_add_epi16(_mm256_mullo_epi16(d, _mm256_set1_epi16(15)), _mm256_set1_epi16(128));
369 __m256i t1 = _mm256_srli_epi16(_mm256_add_epi16(t0, _mm256_srli_epi16(t0, 8)), 8);
370
371 __m256i t2 = _mm256_or_si256(t1, _mm256_slli_epi16(t1, 4));
372
373 _mm256_store_si256((__m256i*)a[0].data(), t2);
374}
375
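// Packs the quantized averages selected by idx into the low 32 bits of the block: two 4-bit colors per
// channel for individual mode, or a 5-bit base plus a 3-bit delta for differential mode; idx also
// supplies the diff/flip bits (shifted to bits 24..25).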
376static etcpak_force_inline uint64_t EncodeAverages_AVX2( const v4i a[8], size_t idx ) noexcept
377{
378 uint64_t d = ( idx << 24 );
379 size_t base = idx << 1;
380
381 __m128i a0 = _mm_load_si128((const __m128i*)a[base].data());
382
383 __m128i r0, r1;
384
385 if( ( idx & 0x2 ) == 0 )
386 {
387 r0 = _mm_srli_epi16(a0, 4);
388
389 __m128i a1 = _mm_unpackhi_epi64(r0, r0);
390 r1 = _mm_slli_epi16(a1, 4);
391 }
392 else
393 {
394 __m128i a1 = _mm_and_si128(a0, _mm_set1_epi16(-8));
395
396 r0 = _mm_unpackhi_epi64(a1, a1);
397 __m128i a2 = _mm_sub_epi16(a1, r0);
398 __m128i a3 = _mm_srai_epi16(a2, 3);
399 r1 = _mm_and_si128(a3, _mm_set1_epi16(0x07));
400 }
401
402 __m128i r2 = _mm_or_si128(r0, r1);
403 // do missing swap for average values
404 __m128i r3 = _mm_shufflelo_epi16(r2, _MM_SHUFFLE(3, 0, 1, 2));
405 __m128i r4 = _mm_packus_epi16(r3, _mm_setzero_si128());
406 d |= _mm_cvtsi128_si32(r4);
407
408 return d;
409}
410
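// If all 16 pixels of the block are identical, returns a differential-mode block that encodes the
// single color with 5-bit precision, zero deltas and zero table/selector bits; otherwise returns 0.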
411static etcpak_force_inline uint64_t CheckSolid_AVX2( const uint8_t* src ) noexcept
412{
413 __m256i d0 = _mm256_loadu_si256(((__m256i*)src) + 0);
414 __m256i d1 = _mm256_loadu_si256(((__m256i*)src) + 1);
415
416 __m256i c = _mm256_broadcastd_epi32(_mm256_castsi256_si128(d0));
417
418 __m256i c0 = _mm256_cmpeq_epi8(d0, c);
419 __m256i c1 = _mm256_cmpeq_epi8(d1, c);
420
421 __m256i m = _mm256_and_si256(c0, c1);
422
423 if (!_mm256_testc_si256(m, _mm256_set1_epi32(-1)))
424 {
425 return 0;
426 }
427
428 return 0x02000000 |
429 ( (unsigned int)( src[0] & 0xF8 ) << 16 ) |
430 ( (unsigned int)( src[1] & 0xF8 ) << 8 ) |
431 ( (unsigned int)( src[2] & 0xF8 ) );
432}
433
434static etcpak_force_inline __m128i PrepareAverages_AVX2( v4i a[8], const uint8_t* src) noexcept
435{
436 __m256i sum4 = Sum4_AVX2( src );
437
438 ProcessAverages_AVX2(Average_AVX2( sum4 ), a );
439
440 return CalcErrorBlock_AVX2( sum4, a);
441}
442
443static etcpak_force_inline __m128i PrepareAverages_AVX2( v4i a[8], const __m256i sum4) noexcept
444{
445 ProcessAverages_AVX2(Average_AVX2( sum4 ), a );
446
447 return CalcErrorBlock_AVX2( sum4, a);
448}
449
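// Evaluates both 4x2 half-blocks against all eight ETC1 intensity tables at once, accumulating the
// per-table squared errors in terr and the packed selector bits in tsel.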
450static etcpak_force_inline void FindBestFit_4x2_AVX2( uint32_t terr[2][8], uint32_t tsel[8], v4i a[8], const uint32_t offset, const uint8_t* data) noexcept
451{
452 __m256i sel0 = _mm256_setzero_si256();
453 __m256i sel1 = _mm256_setzero_si256();
454
455 for (unsigned int j = 0; j < 2; ++j)
456 {
457 unsigned int bid = offset + 1 - j;
458
459 __m256i squareErrorSum = _mm256_setzero_si256();
460
461 __m128i a0 = _mm_loadl_epi64((const __m128i*)a[bid].data());
462 __m256i a1 = _mm256_broadcastq_epi64(a0);
463
464 // Processing one full row each iteration
465 for (size_t i = 0; i < 8; i += 4)
466 {
467 __m128i rgb = _mm_loadu_si128((const __m128i*)(data + i * 4));
468
469 __m256i rgb16 = _mm256_cvtepu8_epi16(rgb);
470 __m256i d = _mm256_sub_epi16(a1, rgb16);
471
            // The scaling values are divided by two and rounded to keep the differences within the range of a signed int16.
            // This produces slightly different results, but is significantly faster.
474 __m256i pixel0 = _mm256_madd_epi16(d, _mm256_set_epi16(0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14));
475 __m256i pixel1 = _mm256_packs_epi32(pixel0, pixel0);
476 __m256i pixel2 = _mm256_hadd_epi16(pixel1, pixel1);
477 __m128i pixel3 = _mm256_castsi256_si128(pixel2);
478
479 __m128i pix0 = _mm_broadcastw_epi16(pixel3);
480 __m128i pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16));
481 __m256i pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
482
483 // Processing first two pixels of the row
484 {
485 __m256i pix = _mm256_abs_epi16(pixel);
486
487 // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
488 // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
489 __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0])));
490 __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1])));
491
492 __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1));
493 __m256i minError = _mm256_min_epi16(error0, error1);
494
                // Exploit the symmetry of the selector table and use the sign bit.
                // This produces slightly different results, but is significantly faster.
497 __m256i minIndex1 = _mm256_srli_epi16(pixel, 15);
498
499 // Interleaving values so madd instruction can be used
500 __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0));
501 __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2));
502
503 __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi);
504 // Squaring the minimum error to produce correct values when adding
505 __m256i squareError = _mm256_madd_epi16(minError2, minError2);
506
507 squareErrorSum = _mm256_add_epi32(squareErrorSum, squareError);
508
509 // Packing selector bits
510 __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i + j * 8));
511 __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i + j * 8));
512
513 sel0 = _mm256_or_si256(sel0, minIndexLo2);
514 sel1 = _mm256_or_si256(sel1, minIndexHi2);
515 }
516
517 pixel3 = _mm256_extracti128_si256(pixel2, 1);
518 pix0 = _mm_broadcastw_epi16(pixel3);
519 pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16));
520 pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
521
522 // Processing second two pixels of the row
523 {
524 __m256i pix = _mm256_abs_epi16(pixel);
525
526 // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
527 // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
528 __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0])));
529 __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1])));
530
531 __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1));
532 __m256i minError = _mm256_min_epi16(error0, error1);
533
                // Exploit the symmetry of the selector table and use the sign bit.
535 __m256i minIndex1 = _mm256_srli_epi16(pixel, 15);
536
537 // Interleaving values so madd instruction can be used
538 __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0));
539 __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2));
540
541 __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi);
542 // Squaring the minimum error to produce correct values when adding
543 __m256i squareError = _mm256_madd_epi16(minError2, minError2);
544
545 squareErrorSum = _mm256_add_epi32(squareErrorSum, squareError);
546
547 // Packing selector bits
548 __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i + j * 8));
549 __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i + j * 8));
550 __m256i minIndexLo3 = _mm256_slli_epi16(minIndexLo2, 2);
551 __m256i minIndexHi3 = _mm256_slli_epi16(minIndexHi2, 2);
552
553 sel0 = _mm256_or_si256(sel0, minIndexLo3);
554 sel1 = _mm256_or_si256(sel1, minIndexHi3);
555 }
556 }
557
558 data += 8 * 4;
559
560 _mm256_store_si256((__m256i*)terr[1 - j], squareErrorSum);
561 }
562
563 // Interleave selector bits
564 __m256i minIndexLo0 = _mm256_unpacklo_epi16(sel0, sel1);
565 __m256i minIndexHi0 = _mm256_unpackhi_epi16(sel0, sel1);
566
567 __m256i minIndexLo1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (0) | (2 << 4));
568 __m256i minIndexHi1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (1) | (3 << 4));
569
570 __m256i minIndexHi2 = _mm256_slli_epi32(minIndexHi1, 1);
571
572 __m256i sel = _mm256_or_si256(minIndexLo1, minIndexHi2);
573
574 _mm256_store_si256((__m256i*)tsel, sel);
575}
576
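// Same as FindBestFit_4x2_AVX2, but for the vertical 2x4 split; the two half-blocks are evaluated
// simultaneously, one in each 128-bit lane.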
577static etcpak_force_inline void FindBestFit_2x4_AVX2( uint32_t terr[2][8], uint32_t tsel[8], v4i a[8], const uint32_t offset, const uint8_t* data) noexcept
578{
579 __m256i sel0 = _mm256_setzero_si256();
580 __m256i sel1 = _mm256_setzero_si256();
581
582 __m256i squareErrorSum0 = _mm256_setzero_si256();
583 __m256i squareErrorSum1 = _mm256_setzero_si256();
584
585 __m128i a0 = _mm_loadl_epi64((const __m128i*)a[offset + 1].data());
586 __m128i a1 = _mm_loadl_epi64((const __m128i*)a[offset + 0].data());
587
588 __m128i a2 = _mm_broadcastq_epi64(a0);
589 __m128i a3 = _mm_broadcastq_epi64(a1);
590 __m256i a4 = _mm256_insertf128_si256(_mm256_castsi128_si256(a2), a3, 1);
591
592 // Processing one full row each iteration
593 for (size_t i = 0; i < 16; i += 4)
594 {
595 __m128i rgb = _mm_loadu_si128((const __m128i*)(data + i * 4));
596
597 __m256i rgb16 = _mm256_cvtepu8_epi16(rgb);
598 __m256i d = _mm256_sub_epi16(a4, rgb16);
599
        // The scaling values are divided by two and rounded to keep the differences within the range of a signed int16.
        // This produces slightly different results, but is significantly faster.
602 __m256i pixel0 = _mm256_madd_epi16(d, _mm256_set_epi16(0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14));
603 __m256i pixel1 = _mm256_packs_epi32(pixel0, pixel0);
604 __m256i pixel2 = _mm256_hadd_epi16(pixel1, pixel1);
605 __m128i pixel3 = _mm256_castsi256_si128(pixel2);
606
607 __m128i pix0 = _mm_broadcastw_epi16(pixel3);
608 __m128i pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16));
609 __m256i pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
610
611 // Processing first two pixels of the row
612 {
613 __m256i pix = _mm256_abs_epi16(pixel);
614
615 // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
616 // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
617 __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0])));
618 __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1])));
619
620 __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1));
621 __m256i minError = _mm256_min_epi16(error0, error1);
622
            // Exploit the symmetry of the selector table and use the sign bit.
624 __m256i minIndex1 = _mm256_srli_epi16(pixel, 15);
625
626 // Interleaving values so madd instruction can be used
627 __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0));
628 __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2));
629
630 __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi);
631 // Squaring the minimum error to produce correct values when adding
632 __m256i squareError = _mm256_madd_epi16(minError2, minError2);
633
634 squareErrorSum0 = _mm256_add_epi32(squareErrorSum0, squareError);
635
636 // Packing selector bits
637 __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i));
638 __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i));
639
640 sel0 = _mm256_or_si256(sel0, minIndexLo2);
641 sel1 = _mm256_or_si256(sel1, minIndexHi2);
642 }
643
644 pixel3 = _mm256_extracti128_si256(pixel2, 1);
645 pix0 = _mm_broadcastw_epi16(pixel3);
646 pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16));
647 pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
648
649 // Processing second two pixels of the row
650 {
651 __m256i pix = _mm256_abs_epi16(pixel);
652
653 // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
654 // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
655 __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0])));
656 __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1])));
657
658 __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1));
659 __m256i minError = _mm256_min_epi16(error0, error1);
660
            // Exploit the symmetry of the selector table and use the sign bit.
662 __m256i minIndex1 = _mm256_srli_epi16(pixel, 15);
663
664 // Interleaving values so madd instruction can be used
665 __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0));
666 __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2));
667
668 __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi);
669 // Squaring the minimum error to produce correct values when adding
670 __m256i squareError = _mm256_madd_epi16(minError2, minError2);
671
672 squareErrorSum1 = _mm256_add_epi32(squareErrorSum1, squareError);
673
674 // Packing selector bits
675 __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i));
676 __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i));
677 __m256i minIndexLo3 = _mm256_slli_epi16(minIndexLo2, 2);
678 __m256i minIndexHi3 = _mm256_slli_epi16(minIndexHi2, 2);
679
680 sel0 = _mm256_or_si256(sel0, minIndexLo3);
681 sel1 = _mm256_or_si256(sel1, minIndexHi3);
682 }
683 }
684
685 _mm256_store_si256((__m256i*)terr[1], squareErrorSum0);
686 _mm256_store_si256((__m256i*)terr[0], squareErrorSum1);
687
688 // Interleave selector bits
689 __m256i minIndexLo0 = _mm256_unpacklo_epi16(sel0, sel1);
690 __m256i minIndexHi0 = _mm256_unpackhi_epi16(sel0, sel1);
691
692 __m256i minIndexLo1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (0) | (2 << 4));
693 __m256i minIndexHi1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (1) | (3 << 4));
694
695 __m256i minIndexHi2 = _mm256_slli_epi32(minIndexHi1, 1);
696
697 __m256i sel = _mm256_or_si256(minIndexLo1, minIndexHi2);
698
699 _mm256_store_si256((__m256i*)tsel, sel);
700}
701
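// Picks, for each half-block, the intensity table with the smallest accumulated error, then packs the
// table indices and the byte-swapped selector bits into the 64-bit block.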
702static etcpak_force_inline uint64_t EncodeSelectors_AVX2( uint64_t d, const uint32_t terr[2][8], const uint32_t tsel[8], const bool rotate) noexcept
703{
704 size_t tidx[2];
705
706 // Get index of minimum error (terr[0] and terr[1])
707 __m256i err0 = _mm256_load_si256((const __m256i*)terr[0]);
708 __m256i err1 = _mm256_load_si256((const __m256i*)terr[1]);
709
710 __m256i errLo = _mm256_permute2x128_si256(err0, err1, (0) | (2 << 4));
711 __m256i errHi = _mm256_permute2x128_si256(err0, err1, (1) | (3 << 4));
712
713 __m256i errMin0 = _mm256_min_epu32(errLo, errHi);
714
715 __m256i errMin1 = _mm256_shuffle_epi32(errMin0, _MM_SHUFFLE(2, 3, 0, 1));
716 __m256i errMin2 = _mm256_min_epu32(errMin0, errMin1);
717
718 __m256i errMin3 = _mm256_shuffle_epi32(errMin2, _MM_SHUFFLE(1, 0, 3, 2));
719 __m256i errMin4 = _mm256_min_epu32(errMin3, errMin2);
720
721 __m256i errMin5 = _mm256_permute2x128_si256(errMin4, errMin4, (0) | (0 << 4));
722 __m256i errMin6 = _mm256_permute2x128_si256(errMin4, errMin4, (1) | (1 << 4));
723
724 __m256i errMask0 = _mm256_cmpeq_epi32(errMin5, err0);
725 __m256i errMask1 = _mm256_cmpeq_epi32(errMin6, err1);
726
727 uint32_t mask0 = _mm256_movemask_epi8(errMask0);
728 uint32_t mask1 = _mm256_movemask_epi8(errMask1);
729
730 tidx[0] = _bit_scan_forward(mask0) >> 2;
731 tidx[1] = _bit_scan_forward(mask1) >> 2;
732
733 d |= tidx[0] << 26;
734 d |= tidx[1] << 29;
735
736 unsigned int t0 = tsel[tidx[0]];
737 unsigned int t1 = tsel[tidx[1]];
738
739 if (!rotate)
740 {
741 t0 &= 0xFF00FF00;
742 t1 &= 0x00FF00FF;
743 }
744 else
745 {
746 t0 &= 0xCCCCCCCC;
747 t1 &= 0x33333333;
748 }
749
750 // Flip selectors from sign bit
751 unsigned int t2 = (t0 | t1) ^ 0xFFFF0000;
752
753 return d | static_cast<uint64_t>(_bswap(t2)) << 32;
754}
755
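// Quantizes the fitted origin/horizontal/vertical colors to the planar-mode RGB676 format
// (6-bit red, 7-bit green, 6-bit blue) with rounding.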
756static etcpak_force_inline __m128i r6g7b6_AVX2(__m128 cof, __m128 chf, __m128 cvf) noexcept
757{
758 __m128i co = _mm_cvttps_epi32(cof);
759 __m128i ch = _mm_cvttps_epi32(chf);
760 __m128i cv = _mm_cvttps_epi32(cvf);
761
762 __m128i coh = _mm_packus_epi32(co, ch);
763 __m128i cv0 = _mm_packus_epi32(cv, _mm_setzero_si128());
764
765 __m256i cohv0 = _mm256_inserti128_si256(_mm256_castsi128_si256(coh), cv0, 1);
766 __m256i cohv1 = _mm256_min_epu16(cohv0, _mm256_set1_epi16(1023));
767
768 __m256i cohv2 = _mm256_sub_epi16(cohv1, _mm256_set1_epi16(15));
769 __m256i cohv3 = _mm256_srai_epi16(cohv2, 1);
770
771 __m256i cohvrb0 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(11));
772 __m256i cohvrb1 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(4));
773 __m256i cohvg0 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(9));
774 __m256i cohvg1 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(6));
775
776 __m256i cohvrb2 = _mm256_srai_epi16(cohvrb0, 7);
777 __m256i cohvrb3 = _mm256_srai_epi16(cohvrb1, 7);
778 __m256i cohvg2 = _mm256_srai_epi16(cohvg0, 8);
779 __m256i cohvg3 = _mm256_srai_epi16(cohvg1, 8);
780
781 __m256i cohvrb4 = _mm256_sub_epi16(cohvrb0, cohvrb2);
782 __m256i cohvrb5 = _mm256_sub_epi16(cohvrb4, cohvrb3);
783 __m256i cohvg4 = _mm256_sub_epi16(cohvg0, cohvg2);
784 __m256i cohvg5 = _mm256_sub_epi16(cohvg4, cohvg3);
785
786 __m256i cohvrb6 = _mm256_srai_epi16(cohvrb5, 3);
787 __m256i cohvg6 = _mm256_srai_epi16(cohvg5, 2);
788
789 __m256i cohv4 = _mm256_blend_epi16(cohvg6, cohvrb6, 0x55);
790
791 __m128i cohv5 = _mm_packus_epi16(_mm256_castsi256_si128(cohv4), _mm256_extracti128_si256(cohv4, 1));
792 return _mm_shuffle_epi8(cohv5, _mm_setr_epi8(6, 5, 4, -1, 2, 1, 0, -1, 10, 9, 8, -1, -1, -1, -1, -1));
793}
794
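// Fits the ETC2 planar mode to the block: least-squares fit of the origin/horizontal/vertical colors,
// quantization to RGB676 and packing of the 57-bit planar encoding. When heuristics have already
// excluded planar mode, only the per-channel sums (sum4) needed by the caller are returned; the exact
// luma-weighted error is computed only when heuristics are disabled.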
795static etcpak_force_inline Plane Planar_AVX2( const Channels& ch, uint8_t& mode, bool useHeuristics )
796{
797 __m128i t0 = _mm_sad_epu8( ch.r8, _mm_setzero_si128() );
798 __m128i t1 = _mm_sad_epu8( ch.g8, _mm_setzero_si128() );
799 __m128i t2 = _mm_sad_epu8( ch.b8, _mm_setzero_si128() );
800
801 __m128i r8s = _mm_shuffle_epi8( ch.r8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) );
802 __m128i g8s = _mm_shuffle_epi8( ch.g8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) );
803 __m128i b8s = _mm_shuffle_epi8( ch.b8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) );
804
805 __m128i s0 = _mm_sad_epu8( r8s, _mm_setzero_si128() );
806 __m128i s1 = _mm_sad_epu8( g8s, _mm_setzero_si128() );
807 __m128i s2 = _mm_sad_epu8( b8s, _mm_setzero_si128() );
808
809 __m256i sr0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t0 ), s0, 1 );
810 __m256i sg0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t1 ), s1, 1 );
811 __m256i sb0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t2 ), s2, 1 );
812
813 __m256i sr1 = _mm256_slli_epi64( sr0, 32 );
814 __m256i sg1 = _mm256_slli_epi64( sg0, 16 );
815
816 __m256i srb = _mm256_or_si256( sr1, sb0 );
817 __m256i srgb = _mm256_or_si256( srb, sg1 );
818
819 if( mode != ModePlanar && useHeuristics )
820 {
821 Plane plane;
822 plane.sum4 = _mm256_permute4x64_epi64( srgb, _MM_SHUFFLE( 2, 3, 0, 1 ) );
823 return plane;
824 }
825
826 __m128i t3 = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( t0 ), _mm_castsi128_ps( t1 ), _MM_SHUFFLE( 2, 0, 2, 0 ) ) );
827 __m128i t4 = _mm_shuffle_epi32( t2, _MM_SHUFFLE( 3, 1, 2, 0 ) );
828 __m128i t5 = _mm_hadd_epi32( t3, t4 );
829 __m128i t6 = _mm_shuffle_epi32( t5, _MM_SHUFFLE( 1, 1, 1, 1 ) );
830 __m128i t7 = _mm_shuffle_epi32( t5, _MM_SHUFFLE( 2, 2, 2, 2 ) );
831
832 __m256i sr = _mm256_broadcastw_epi16( t5 );
833 __m256i sg = _mm256_broadcastw_epi16( t6 );
834 __m256i sb = _mm256_broadcastw_epi16( t7 );
835
836 __m256i r08 = _mm256_cvtepu8_epi16( ch.r8 );
837 __m256i g08 = _mm256_cvtepu8_epi16( ch.g8 );
838 __m256i b08 = _mm256_cvtepu8_epi16( ch.b8 );
839
840 __m256i r16 = _mm256_slli_epi16( r08, 4 );
841 __m256i g16 = _mm256_slli_epi16( g08, 4 );
842 __m256i b16 = _mm256_slli_epi16( b08, 4 );
843
844 __m256i difR0 = _mm256_sub_epi16( r16, sr );
845 __m256i difG0 = _mm256_sub_epi16( g16, sg );
846 __m256i difB0 = _mm256_sub_epi16( b16, sb );
847
848 __m256i difRyz = _mm256_madd_epi16( difR0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) );
849 __m256i difGyz = _mm256_madd_epi16( difG0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) );
850 __m256i difByz = _mm256_madd_epi16( difB0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) );
851
852 __m256i difRxz = _mm256_madd_epi16( difR0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) );
853 __m256i difGxz = _mm256_madd_epi16( difG0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) );
854 __m256i difBxz = _mm256_madd_epi16( difB0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) );
855
856 __m256i difRGyz = _mm256_hadd_epi32( difRyz, difGyz );
857 __m256i difByzxz = _mm256_hadd_epi32( difByz, difBxz );
858
859 __m256i difRGxz = _mm256_hadd_epi32( difRxz, difGxz );
860
861 __m128i sumRGyz = _mm_add_epi32( _mm256_castsi256_si128( difRGyz ), _mm256_extracti128_si256( difRGyz, 1 ) );
862 __m128i sumByzxz = _mm_add_epi32( _mm256_castsi256_si128( difByzxz ), _mm256_extracti128_si256( difByzxz, 1 ) );
863 __m128i sumRGxz = _mm_add_epi32( _mm256_castsi256_si128( difRGxz ), _mm256_extracti128_si256( difRGxz, 1 ) );
864
865 __m128i sumRGByz = _mm_hadd_epi32( sumRGyz, sumByzxz );
866 __m128i sumRGByzxz = _mm_hadd_epi32( sumRGxz, sumByzxz );
867
868 __m128i sumRGBxz = _mm_shuffle_epi32( sumRGByzxz, _MM_SHUFFLE( 2, 3, 1, 0 ) );
869
870 __m128 sumRGByzf = _mm_cvtepi32_ps( sumRGByz );
871 __m128 sumRGBxzf = _mm_cvtepi32_ps( sumRGBxz );
872
873 const float value = ( 255 * 255 * 8.0f + 85 * 85 * 8.0f ) * 16.0f;
874
875 __m128 scale = _mm_set1_ps( -4.0f / value );
876
877 __m128 af = _mm_mul_ps( sumRGBxzf, scale );
878 __m128 bf = _mm_mul_ps( sumRGByzf, scale );
879
880 __m128 df = _mm_mul_ps( _mm_cvtepi32_ps( t5 ), _mm_set1_ps( 4.0f / 16.0f ) );
881
882 // calculating the three colors RGBO, RGBH, and RGBV. RGB = df - af * x - bf * y;
883 __m128 cof0 = _mm_fnmadd_ps( af, _mm_set1_ps( -255.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( -255.0f ), df ) );
884 __m128 chf0 = _mm_fnmadd_ps( af, _mm_set1_ps( 425.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( -255.0f ), df ) );
885 __m128 cvf0 = _mm_fnmadd_ps( af, _mm_set1_ps( -255.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( 425.0f ), df ) );
886
887 // convert to r6g7b6
888 __m128i cohv = r6g7b6_AVX2( cof0, chf0, cvf0 );
889
890 uint64_t rgbho = _mm_extract_epi64( cohv, 0 );
891 uint32_t rgbv0 = _mm_extract_epi32( cohv, 2 );
892
893 // Error calculation
894 uint64_t error = 0;
895 if( !useHeuristics )
896 {
897 auto ro0 = ( rgbho >> 48 ) & 0x3F;
898 auto go0 = ( rgbho >> 40 ) & 0x7F;
899 auto bo0 = ( rgbho >> 32 ) & 0x3F;
900 auto ro1 = ( ro0 >> 4 ) | ( ro0 << 2 );
901 auto go1 = ( go0 >> 6 ) | ( go0 << 1 );
902 auto bo1 = ( bo0 >> 4 ) | ( bo0 << 2 );
903 auto ro2 = ( ro1 << 2 ) + 2;
904 auto go2 = ( go1 << 2 ) + 2;
905 auto bo2 = ( bo1 << 2 ) + 2;
906
907 __m256i ro3 = _mm256_set1_epi16( ro2 );
908 __m256i go3 = _mm256_set1_epi16( go2 );
909 __m256i bo3 = _mm256_set1_epi16( bo2 );
910
911 auto rh0 = ( rgbho >> 16 ) & 0x3F;
912 auto gh0 = ( rgbho >> 8 ) & 0x7F;
913 auto bh0 = ( rgbho >> 0 ) & 0x3F;
914 auto rh1 = ( rh0 >> 4 ) | ( rh0 << 2 );
915 auto gh1 = ( gh0 >> 6 ) | ( gh0 << 1 );
916 auto bh1 = ( bh0 >> 4 ) | ( bh0 << 2 );
917
918 auto rh2 = rh1 - ro1;
919 auto gh2 = gh1 - go1;
920 auto bh2 = bh1 - bo1;
921
922 __m256i rh3 = _mm256_set1_epi16( rh2 );
923 __m256i gh3 = _mm256_set1_epi16( gh2 );
924 __m256i bh3 = _mm256_set1_epi16( bh2 );
925
926 auto rv0 = ( rgbv0 >> 16 ) & 0x3F;
927 auto gv0 = ( rgbv0 >> 8 ) & 0x7F;
928 auto bv0 = ( rgbv0 >> 0 ) & 0x3F;
929 auto rv1 = ( rv0 >> 4 ) | ( rv0 << 2 );
930 auto gv1 = ( gv0 >> 6 ) | ( gv0 << 1 );
931 auto bv1 = ( bv0 >> 4 ) | ( bv0 << 2 );
932
933 auto rv2 = rv1 - ro1;
934 auto gv2 = gv1 - go1;
935 auto bv2 = bv1 - bo1;
936
937 __m256i rv3 = _mm256_set1_epi16( rv2 );
938 __m256i gv3 = _mm256_set1_epi16( gv2 );
939 __m256i bv3 = _mm256_set1_epi16( bv2 );
940
941 __m256i x = _mm256_set_epi16( 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 );
942
943 __m256i rh4 = _mm256_mullo_epi16( rh3, x );
944 __m256i gh4 = _mm256_mullo_epi16( gh3, x );
945 __m256i bh4 = _mm256_mullo_epi16( bh3, x );
946
947 __m256i y = _mm256_set_epi16( 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0 );
948
949 __m256i rv4 = _mm256_mullo_epi16( rv3, y );
950 __m256i gv4 = _mm256_mullo_epi16( gv3, y );
951 __m256i bv4 = _mm256_mullo_epi16( bv3, y );
952
953 __m256i rxy = _mm256_add_epi16( rh4, rv4 );
954 __m256i gxy = _mm256_add_epi16( gh4, gv4 );
955 __m256i bxy = _mm256_add_epi16( bh4, bv4 );
956
957 __m256i rp0 = _mm256_add_epi16( rxy, ro3 );
958 __m256i gp0 = _mm256_add_epi16( gxy, go3 );
959 __m256i bp0 = _mm256_add_epi16( bxy, bo3 );
960
961 __m256i rp1 = _mm256_srai_epi16( rp0, 2 );
962 __m256i gp1 = _mm256_srai_epi16( gp0, 2 );
963 __m256i bp1 = _mm256_srai_epi16( bp0, 2 );
964
965 __m256i rp2 = _mm256_max_epi16( _mm256_min_epi16( rp1, _mm256_set1_epi16( 255 ) ), _mm256_setzero_si256() );
966 __m256i gp2 = _mm256_max_epi16( _mm256_min_epi16( gp1, _mm256_set1_epi16( 255 ) ), _mm256_setzero_si256() );
967 __m256i bp2 = _mm256_max_epi16( _mm256_min_epi16( bp1, _mm256_set1_epi16( 255 ) ), _mm256_setzero_si256() );
968
969 __m256i rdif = _mm256_sub_epi16( r08, rp2 );
970 __m256i gdif = _mm256_sub_epi16( g08, gp2 );
971 __m256i bdif = _mm256_sub_epi16( b08, bp2 );
972
973 __m256i rerr = _mm256_mullo_epi16( rdif, _mm256_set1_epi16( 38 ) );
974 __m256i gerr = _mm256_mullo_epi16( gdif, _mm256_set1_epi16( 76 ) );
975 __m256i berr = _mm256_mullo_epi16( bdif, _mm256_set1_epi16( 14 ) );
976
977 __m256i sum0 = _mm256_add_epi16( rerr, gerr );
978 __m256i sum1 = _mm256_add_epi16( sum0, berr );
979
980 __m256i sum2 = _mm256_madd_epi16( sum1, sum1 );
981
982 __m128i sum3 = _mm_add_epi32( _mm256_castsi256_si128( sum2 ), _mm256_extracti128_si256( sum2, 1 ) );
983
984 uint32_t err0 = _mm_extract_epi32( sum3, 0 );
985 uint32_t err1 = _mm_extract_epi32( sum3, 1 );
986 uint32_t err2 = _mm_extract_epi32( sum3, 2 );
987 uint32_t err3 = _mm_extract_epi32( sum3, 3 );
988
989 error = err0 + err1 + err2 + err3;
990 }
992
993 uint32_t rgbv = ( rgbv0 & 0x3F ) | ( ( rgbv0 >> 2 ) & 0x1FC0 ) | ( ( rgbv0 >> 3 ) & 0x7E000 );
994 uint64_t rgbho0_ = ( rgbho & 0x3F0000003F ) | ( ( rgbho >> 2 ) & 0x1FC000001FC0 ) | ( ( rgbho >> 3 ) & 0x7E0000007E000 );
995 uint64_t rgbho0 = ( rgbho0_ & 0x7FFFF ) | ( ( rgbho0_ >> 13 ) & 0x3FFFF80000 );
996
997 uint32_t hi = rgbv | ((rgbho0 & 0x1FFF) << 19);
998 rgbho0 >>= 13;
999 uint32_t lo = ( rgbho0 & 0x1 ) | ( ( rgbho0 & 0x1FE ) << 1 ) | ( ( rgbho0 & 0x600 ) << 2 ) | ( ( rgbho0 & 0x3F800 ) << 5 ) | ( ( rgbho0 & 0x1FC0000 ) << 6 );
1000
1001 uint32_t idx = ( ( rgbho >> 33 ) & 0xF ) | ( ( rgbho >> 41 ) & 0x10 ) | ( ( rgbho >> 48 ) & 0x20 );
1002 lo |= g_flags[idx];
1003 uint64_t result = static_cast<uint32_t>(_bswap(lo));
1004 result |= static_cast<uint64_t>(static_cast<uint32_t>(_bswap(hi))) << 32;
1005
1006 Plane plane;
1007
1008 plane.plane = result;
1009 if( useHeuristics )
1010 {
1011 plane.error = 0;
1012 mode = ModePlanar;
1013 }
1014 else
1015 {
1016 plane.error = error;
1017 }
1018 plane.sum4 = _mm256_permute4x64_epi64(srgb, _MM_SHUFFLE(2, 3, 0, 1));
1019
1020 return plane;
1021}
1022
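// Variant used when an alternative encoding (value, typically the planar-mode result) with a known
// error is available: if the best achievable selector error is not lower, value is returned unchanged.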
1023static etcpak_force_inline uint64_t EncodeSelectors_AVX2( uint64_t d, const uint32_t terr[2][8], const uint32_t tsel[8], const bool rotate, const uint64_t value, const uint32_t error) noexcept
1024{
1025 size_t tidx[2];
1026
1027 // Get index of minimum error (terr[0] and terr[1])
1028 __m256i err0 = _mm256_load_si256((const __m256i*)terr[0]);
1029 __m256i err1 = _mm256_load_si256((const __m256i*)terr[1]);
1030
1031 __m256i errLo = _mm256_permute2x128_si256(err0, err1, (0) | (2 << 4));
1032 __m256i errHi = _mm256_permute2x128_si256(err0, err1, (1) | (3 << 4));
1033
1034 __m256i errMin0 = _mm256_min_epu32(errLo, errHi);
1035
1036 __m256i errMin1 = _mm256_shuffle_epi32(errMin0, _MM_SHUFFLE(2, 3, 0, 1));
1037 __m256i errMin2 = _mm256_min_epu32(errMin0, errMin1);
1038
1039 __m256i errMin3 = _mm256_shuffle_epi32(errMin2, _MM_SHUFFLE(1, 0, 3, 2));
1040 __m256i errMin4 = _mm256_min_epu32(errMin3, errMin2);
1041
1042 __m256i errMin5 = _mm256_permute2x128_si256(errMin4, errMin4, (0) | (0 << 4));
1043 __m256i errMin6 = _mm256_permute2x128_si256(errMin4, errMin4, (1) | (1 << 4));
1044
1045 __m256i errMask0 = _mm256_cmpeq_epi32(errMin5, err0);
1046 __m256i errMask1 = _mm256_cmpeq_epi32(errMin6, err1);
1047
1048 uint32_t mask0 = _mm256_movemask_epi8(errMask0);
1049 uint32_t mask1 = _mm256_movemask_epi8(errMask1);
1050
1051 tidx[0] = _bit_scan_forward(mask0) >> 2;
1052 tidx[1] = _bit_scan_forward(mask1) >> 2;
1053
1054 if ((terr[0][tidx[0]] + terr[1][tidx[1]]) >= error)
1055 {
1056 return value;
1057 }
1058
1059 d |= tidx[0] << 26;
1060 d |= tidx[1] << 29;
1061
1062 unsigned int t0 = tsel[tidx[0]];
1063 unsigned int t1 = tsel[tidx[1]];
1064
1065 if (!rotate)
1066 {
1067 t0 &= 0xFF00FF00;
1068 t1 &= 0x00FF00FF;
1069 }
1070 else
1071 {
1072 t0 &= 0xCCCCCCCC;
1073 t1 &= 0x33333333;
1074 }
1075
1076 // Flip selectors from sign bit
1077 unsigned int t2 = (t0 | t1) ^ 0xFFFF0000;
1078
1079 return d | static_cast<uint64_t>(_bswap(t2)) << 32;
1080}
1081
1082#endif
1083
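// Computes the average color of the four 4x2/2x4 half-blocks; a[0..3] hold the bottom, top, right
// and left halves as {r, g, b, unused}.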
1084static etcpak_force_inline void Average( const uint8_t* data, v4i* a )
1085{
1086#ifdef __SSE4_1__
1087 __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0);
1088 __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1);
1089 __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2);
1090 __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3);
1091
1092 __m128i d0l = _mm_unpacklo_epi8(d0, _mm_setzero_si128());
1093 __m128i d0h = _mm_unpackhi_epi8(d0, _mm_setzero_si128());
1094 __m128i d1l = _mm_unpacklo_epi8(d1, _mm_setzero_si128());
1095 __m128i d1h = _mm_unpackhi_epi8(d1, _mm_setzero_si128());
1096 __m128i d2l = _mm_unpacklo_epi8(d2, _mm_setzero_si128());
1097 __m128i d2h = _mm_unpackhi_epi8(d2, _mm_setzero_si128());
1098 __m128i d3l = _mm_unpacklo_epi8(d3, _mm_setzero_si128());
1099 __m128i d3h = _mm_unpackhi_epi8(d3, _mm_setzero_si128());
1100
1101 __m128i sum0 = _mm_add_epi16(d0l, d1l);
1102 __m128i sum1 = _mm_add_epi16(d0h, d1h);
1103 __m128i sum2 = _mm_add_epi16(d2l, d3l);
1104 __m128i sum3 = _mm_add_epi16(d2h, d3h);
1105
1106 __m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128());
1107 __m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128());
1108 __m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128());
1109 __m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128());
1110 __m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128());
1111 __m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128());
1112 __m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128());
1113 __m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128());
1114
1115 __m128i b0 = _mm_add_epi32(sum0l, sum0h);
1116 __m128i b1 = _mm_add_epi32(sum1l, sum1h);
1117 __m128i b2 = _mm_add_epi32(sum2l, sum2h);
1118 __m128i b3 = _mm_add_epi32(sum3l, sum3h);
1119
1120 __m128i a0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b2, b3), _mm_set1_epi32(4)), 3);
1121 __m128i a1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b1), _mm_set1_epi32(4)), 3);
1122 __m128i a2 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b1, b3), _mm_set1_epi32(4)), 3);
1123 __m128i a3 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b2), _mm_set1_epi32(4)), 3);
1124
1125 _mm_storeu_si128((__m128i*)&a[0], _mm_packus_epi32(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a1, _MM_SHUFFLE(3, 0, 1, 2))));
1126 _mm_storeu_si128((__m128i*)&a[2], _mm_packus_epi32(_mm_shuffle_epi32(a2, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a3, _MM_SHUFFLE(3, 0, 1, 2))));
1127#elif defined __ARM_NEON
1128 uint8x16x2_t t0 = vzipq_u8(vld1q_u8(data + 0), uint8x16_t());
1129 uint8x16x2_t t1 = vzipq_u8(vld1q_u8(data + 16), uint8x16_t());
1130 uint8x16x2_t t2 = vzipq_u8(vld1q_u8(data + 32), uint8x16_t());
1131 uint8x16x2_t t3 = vzipq_u8(vld1q_u8(data + 48), uint8x16_t());
1132
1133 uint16x8x2_t d0 = { vreinterpretq_u16_u8(t0.val[0]), vreinterpretq_u16_u8(t0.val[1]) };
1134 uint16x8x2_t d1 = { vreinterpretq_u16_u8(t1.val[0]), vreinterpretq_u16_u8(t1.val[1]) };
1135 uint16x8x2_t d2 = { vreinterpretq_u16_u8(t2.val[0]), vreinterpretq_u16_u8(t2.val[1]) };
1136 uint16x8x2_t d3 = { vreinterpretq_u16_u8(t3.val[0]), vreinterpretq_u16_u8(t3.val[1]) };
1137
1138 uint16x8x2_t s0 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[0] ), vreinterpretq_s16_u16( d1.val[0] ) ) ), uint16x8_t());
1139 uint16x8x2_t s1 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[1] ), vreinterpretq_s16_u16( d1.val[1] ) ) ), uint16x8_t());
1140 uint16x8x2_t s2 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[0] ), vreinterpretq_s16_u16( d3.val[0] ) ) ), uint16x8_t());
1141 uint16x8x2_t s3 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[1] ), vreinterpretq_s16_u16( d3.val[1] ) ) ), uint16x8_t());
1142
1143 uint32x4x2_t sum0 = { vreinterpretq_u32_u16(s0.val[0]), vreinterpretq_u32_u16(s0.val[1]) };
1144 uint32x4x2_t sum1 = { vreinterpretq_u32_u16(s1.val[0]), vreinterpretq_u32_u16(s1.val[1]) };
1145 uint32x4x2_t sum2 = { vreinterpretq_u32_u16(s2.val[0]), vreinterpretq_u32_u16(s2.val[1]) };
1146 uint32x4x2_t sum3 = { vreinterpretq_u32_u16(s3.val[0]), vreinterpretq_u32_u16(s3.val[1]) };
1147
1148 uint32x4_t b0 = vaddq_u32(sum0.val[0], sum0.val[1]);
1149 uint32x4_t b1 = vaddq_u32(sum1.val[0], sum1.val[1]);
1150 uint32x4_t b2 = vaddq_u32(sum2.val[0], sum2.val[1]);
1151 uint32x4_t b3 = vaddq_u32(sum3.val[0], sum3.val[1]);
1152
1153 uint32x4_t a0 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b2, b3), vdupq_n_u32(4)), 3);
1154 uint32x4_t a1 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b0, b1), vdupq_n_u32(4)), 3);
1155 uint32x4_t a2 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b1, b3), vdupq_n_u32(4)), 3);
1156 uint32x4_t a3 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b0, b2), vdupq_n_u32(4)), 3);
1157
1158 uint16x8_t o0 = vcombine_u16(vqmovun_s32(vreinterpretq_s32_u32( a0 )), vqmovun_s32(vreinterpretq_s32_u32( a1 )));
1159 uint16x8_t o1 = vcombine_u16(vqmovun_s32(vreinterpretq_s32_u32( a2 )), vqmovun_s32(vreinterpretq_s32_u32( a3 )));
1160
1161 a[0] = v4i{o0[2], o0[1], o0[0], 0};
1162 a[1] = v4i{o0[6], o0[5], o0[4], 0};
1163 a[2] = v4i{o1[2], o1[1], o1[0], 0};
1164 a[3] = v4i{o1[6], o1[5], o1[4], 0};
1165#else
1166 uint32_t r[4];
1167 uint32_t g[4];
1168 uint32_t b[4];
1169
1170 memset(r, 0, sizeof(r));
1171 memset(g, 0, sizeof(g));
1172 memset(b, 0, sizeof(b));
1173
1174 for( int j=0; j<4; j++ )
1175 {
1176 for( int i=0; i<4; i++ )
1177 {
1178 int index = (j & 2) + (i >> 1);
1179 b[index] += *data++;
1180 g[index] += *data++;
1181 r[index] += *data++;
1182 data++;
1183 }
1184 }
1185
1186 a[0] = v4i{ uint16_t( (r[2] + r[3] + 4) / 8 ), uint16_t( (g[2] + g[3] + 4) / 8 ), uint16_t( (b[2] + b[3] + 4) / 8 ), 0};
1187 a[1] = v4i{ uint16_t( (r[0] + r[1] + 4) / 8 ), uint16_t( (g[0] + g[1] + 4) / 8 ), uint16_t( (b[0] + b[1] + 4) / 8 ), 0};
1188 a[2] = v4i{ uint16_t( (r[1] + r[3] + 4) / 8 ), uint16_t( (g[1] + g[3] + 4) / 8 ), uint16_t( (b[1] + b[3] + 4) / 8 ), 0};
1189 a[3] = v4i{ uint16_t( (r[0] + r[2] + 4) / 8 ), uint16_t( (g[0] + g[2] + 4) / 8 ), uint16_t( (b[0] + b[2] + 4) / 8 ), 0};
1190#endif
1191}
1192
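// Sums the color channels of each half-block as {b, g, r, 0}, in the same bottom/top/right/left
// order as Average().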
1193static etcpak_force_inline void CalcErrorBlock( const uint8_t* data, unsigned int err[4][4] )
1194{
1195#ifdef __SSE4_1__
1196 __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0);
1197 __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1);
1198 __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2);
1199 __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3);
1200
1201 __m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF));
1202 __m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF));
1203 __m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF));
1204 __m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF));
1205
1206 __m128i d0l = _mm_unpacklo_epi8(dm0, _mm_setzero_si128());
1207 __m128i d0h = _mm_unpackhi_epi8(dm0, _mm_setzero_si128());
1208 __m128i d1l = _mm_unpacklo_epi8(dm1, _mm_setzero_si128());
1209 __m128i d1h = _mm_unpackhi_epi8(dm1, _mm_setzero_si128());
1210 __m128i d2l = _mm_unpacklo_epi8(dm2, _mm_setzero_si128());
1211 __m128i d2h = _mm_unpackhi_epi8(dm2, _mm_setzero_si128());
1212 __m128i d3l = _mm_unpacklo_epi8(dm3, _mm_setzero_si128());
1213 __m128i d3h = _mm_unpackhi_epi8(dm3, _mm_setzero_si128());
1214
1215 __m128i sum0 = _mm_add_epi16(d0l, d1l);
1216 __m128i sum1 = _mm_add_epi16(d0h, d1h);
1217 __m128i sum2 = _mm_add_epi16(d2l, d3l);
1218 __m128i sum3 = _mm_add_epi16(d2h, d3h);
1219
1220 __m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128());
1221 __m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128());
1222 __m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128());
1223 __m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128());
1224 __m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128());
1225 __m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128());
1226 __m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128());
1227 __m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128());
1228
1229 __m128i b0 = _mm_add_epi32(sum0l, sum0h);
1230 __m128i b1 = _mm_add_epi32(sum1l, sum1h);
1231 __m128i b2 = _mm_add_epi32(sum2l, sum2h);
1232 __m128i b3 = _mm_add_epi32(sum3l, sum3h);
1233
1234 __m128i a0 = _mm_add_epi32(b2, b3);
1235 __m128i a1 = _mm_add_epi32(b0, b1);
1236 __m128i a2 = _mm_add_epi32(b1, b3);
1237 __m128i a3 = _mm_add_epi32(b0, b2);
1238
1239 _mm_storeu_si128((__m128i*)&err[0], a0);
1240 _mm_storeu_si128((__m128i*)&err[1], a1);
1241 _mm_storeu_si128((__m128i*)&err[2], a2);
1242 _mm_storeu_si128((__m128i*)&err[3], a3);
1243#elif defined __ARM_NEON
1244 uint8x16x2_t t0 = vzipq_u8(vld1q_u8(data + 0), uint8x16_t());
1245 uint8x16x2_t t1 = vzipq_u8(vld1q_u8(data + 16), uint8x16_t());
1246 uint8x16x2_t t2 = vzipq_u8(vld1q_u8(data + 32), uint8x16_t());
1247 uint8x16x2_t t3 = vzipq_u8(vld1q_u8(data + 48), uint8x16_t());
1248
1249 uint16x8x2_t d0 = { vreinterpretq_u16_u8(t0.val[0]), vreinterpretq_u16_u8(t0.val[1]) };
1250 uint16x8x2_t d1 = { vreinterpretq_u16_u8(t1.val[0]), vreinterpretq_u16_u8(t1.val[1]) };
1251 uint16x8x2_t d2 = { vreinterpretq_u16_u8(t2.val[0]), vreinterpretq_u16_u8(t2.val[1]) };
1252 uint16x8x2_t d3 = { vreinterpretq_u16_u8(t3.val[0]), vreinterpretq_u16_u8(t3.val[1]) };
1253
1254 uint16x8x2_t s0 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[0] ), vreinterpretq_s16_u16( d1.val[0] ))), uint16x8_t());
1255 uint16x8x2_t s1 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[1] ), vreinterpretq_s16_u16( d1.val[1] ))), uint16x8_t());
1256 uint16x8x2_t s2 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[0] ), vreinterpretq_s16_u16( d3.val[0] ))), uint16x8_t());
1257 uint16x8x2_t s3 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[1] ), vreinterpretq_s16_u16( d3.val[1] ))), uint16x8_t());
1258
1259 uint32x4x2_t sum0 = { vreinterpretq_u32_u16(s0.val[0]), vreinterpretq_u32_u16(s0.val[1]) };
1260 uint32x4x2_t sum1 = { vreinterpretq_u32_u16(s1.val[0]), vreinterpretq_u32_u16(s1.val[1]) };
1261 uint32x4x2_t sum2 = { vreinterpretq_u32_u16(s2.val[0]), vreinterpretq_u32_u16(s2.val[1]) };
1262 uint32x4x2_t sum3 = { vreinterpretq_u32_u16(s3.val[0]), vreinterpretq_u32_u16(s3.val[1]) };
1263
1264 uint32x4_t b0 = vaddq_u32(sum0.val[0], sum0.val[1]);
1265 uint32x4_t b1 = vaddq_u32(sum1.val[0], sum1.val[1]);
1266 uint32x4_t b2 = vaddq_u32(sum2.val[0], sum2.val[1]);
1267 uint32x4_t b3 = vaddq_u32(sum3.val[0], sum3.val[1]);
1268
1269 uint32x4_t a0 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b2, b3) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) );
1270 uint32x4_t a1 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b0, b1) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) );
1271 uint32x4_t a2 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b1, b3) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) );
1272 uint32x4_t a3 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b0, b2) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) );
1273
1274 vst1q_u32(err[0], a0);
1275 vst1q_u32(err[1], a1);
1276 vst1q_u32(err[2], a2);
1277 vst1q_u32(err[3], a3);
1278#else
1279 unsigned int terr[4][4];
1280
1281 memset(terr, 0, 16 * sizeof(unsigned int));
1282
1283 for( int j=0; j<4; j++ )
1284 {
1285 for( int i=0; i<4; i++ )
1286 {
1287 int index = (j & 2) + (i >> 1);
1288 unsigned int d = *data++;
1289 terr[index][0] += d;
1290 d = *data++;
1291 terr[index][1] += d;
1292 d = *data++;
1293 terr[index][2] += d;
1294 data++;
1295 }
1296 }
1297
1298 for( int i=0; i<3; i++ )
1299 {
1300 err[0][i] = terr[2][i] + terr[3][i];
1301 err[1][i] = terr[0][i] + terr[1][i];
1302 err[2][i] = terr[1][i] + terr[3][i];
1303 err[3][i] = terr[0][i] + terr[2][i];
1304 }
1305 for( int i=0; i<4; i++ )
1306 {
1307 err[i][3] = 0;
1308 }
1309#endif
1310}
1311
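// Returns the sum of squared distances between a half-block's pixels and the candidate average, up to
// a constant: the per-pixel squared terms are omitted (they are identical for every candidate) and a
// large bias keeps the value positive, so only comparisons between candidates are meaningful.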
1312static etcpak_force_inline unsigned int CalcError( const unsigned int block[4], const v4i& average )
1313{
1314 unsigned int err = 0x3FFFFFFF; // Big value to prevent negative values, but small enough to prevent overflow
1315 err -= block[0] * 2 * average[2];
1316 err -= block[1] * 2 * average[1];
1317 err -= block[2] * 2 * average[0];
1318 err += 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) );
1319 return err;
1320}
1321
1322static etcpak_force_inline void ProcessAverages( v4i* a )
1323{
1324#ifdef __SSE4_1__
1325 for( int i=0; i<2; i++ )
1326 {
1327 __m128i d = _mm_loadu_si128((__m128i*)a[i*2].data());
1328
1329 __m128i t = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(31)), _mm_set1_epi16(128));
1330
1331 __m128i c = _mm_srli_epi16(_mm_add_epi16(t, _mm_srli_epi16(t, 8)), 8);
1332
1333 __m128i c1 = _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2));
1334 __m128i diff = _mm_sub_epi16(c, c1);
1335 diff = _mm_max_epi16(diff, _mm_set1_epi16(-4));
1336 diff = _mm_min_epi16(diff, _mm_set1_epi16(3));
1337
1338 __m128i co = _mm_add_epi16(c1, diff);
1339
1340 c = _mm_blend_epi16(co, c, 0xF0);
1341
1342 __m128i a0 = _mm_or_si128(_mm_slli_epi16(c, 3), _mm_srli_epi16(c, 2));
1343
1344 _mm_storeu_si128((__m128i*)a[4+i*2].data(), a0);
1345 }
1346
1347 for( int i=0; i<2; i++ )
1348 {
1349 __m128i d = _mm_loadu_si128((__m128i*)a[i*2].data());
1350
1351 __m128i t0 = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(15)), _mm_set1_epi16(128));
1352 __m128i t1 = _mm_srli_epi16(_mm_add_epi16(t0, _mm_srli_epi16(t0, 8)), 8);
1353
1354 __m128i t2 = _mm_or_si128(t1, _mm_slli_epi16(t1, 4));
1355
1356 _mm_storeu_si128((__m128i*)a[i*2].data(), t2);
1357 }
1358#elif defined __ARM_NEON
1359 for( int i=0; i<2; i++ )
1360 {
1361 int16x8_t d = vld1q_s16((int16_t*)&a[i*2]);
1362 int16x8_t t = vaddq_s16(vmulq_s16(d, vdupq_n_s16(31)), vdupq_n_s16(128));
1363 int16x8_t c = vshrq_n_s16(vaddq_s16(t, vshrq_n_s16(t, 8)), 8);
1364
1365 int16x8_t c1 = vcombine_s16(vget_high_s16(c), vget_high_s16(c));
1366 int16x8_t diff = vsubq_s16(c, c1);
1367 diff = vmaxq_s16(diff, vdupq_n_s16(-4));
1368 diff = vminq_s16(diff, vdupq_n_s16(3));
1369
1370 int16x8_t co = vaddq_s16(c1, diff);
1371
1372 c = vcombine_s16(vget_low_s16(co), vget_high_s16(c));
1373
1374 int16x8_t a0 = vorrq_s16(vshlq_n_s16(c, 3), vshrq_n_s16(c, 2));
1375
1376 vst1q_s16((int16_t*)&a[4+i*2], a0);
1377 }
1378
1379 for( int i=0; i<2; i++ )
1380 {
1381 int16x8_t d = vld1q_s16((int16_t*)&a[i*2]);
1382
1383 int16x8_t t0 = vaddq_s16(vmulq_s16(d, vdupq_n_s16(15)), vdupq_n_s16(128));
1384 int16x8_t t1 = vshrq_n_s16(vaddq_s16(t0, vshrq_n_s16(t0, 8)), 8);
1385
1386 int16x8_t t2 = vorrq_s16(t1, vshlq_n_s16(t1, 4));
1387
1388 vst1q_s16((int16_t*)&a[i*2], t2);
1389 }
1390#else
1391 for( int i=0; i<2; i++ )
1392 {
1393 for( int j=0; j<3; j++ )
1394 {
1395 int32_t c1 = mul8bit( a[i*2+1][j], 31 );
1396 int32_t c2 = mul8bit( a[i*2][j], 31 );
1397
1398 int32_t diff = c2 - c1;
1399 if( diff > 3 ) diff = 3;
1400 else if( diff < -4 ) diff = -4;
1401
1402 int32_t co = c1 + diff;
1403
1404 a[5+i*2][j] = ( c1 << 3 ) | ( c1 >> 2 );
1405 a[4+i*2][j] = ( co << 3 ) | ( co >> 2 );
1406 }
1407 }
1408
1409 for( int i=0; i<4; i++ )
1410 {
1411 a[i][0] = g_avg2[mul8bit( a[i][0], 15 )];
1412 a[i][1] = g_avg2[mul8bit( a[i][1], 15 )];
1413 a[i][2] = g_avg2[mul8bit( a[i][2], 15 )];
1414 }
1415#endif
1416}
1417
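// Packs the selected pair of average colors into the block header: 4+4 bit individual colors when
// bit 1 of idx is clear, or a 5-bit base color plus a signed 3-bit delta (differential mode) otherwise.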
1418static etcpak_force_inline void EncodeAverages( uint64_t& _d, const v4i* a, size_t idx )
1419{
1420 auto d = _d;
1421 d |= ( idx << 24 );
1422 size_t base = idx << 1;
1423
1424 if( ( idx & 0x2 ) == 0 )
1425 {
1426 for( int i=0; i<3; i++ )
1427 {
1428 d |= uint64_t( a[base+0][i] >> 4 ) << ( i*8 );
1429 d |= uint64_t( a[base+1][i] >> 4 ) << ( i*8 + 4 );
1430 }
1431 }
1432 else
1433 {
1434 for( int i=0; i<3; i++ )
1435 {
1436 d |= uint64_t( a[base+1][i] & 0xF8 ) << ( i*8 );
1437 int32_t c = ( ( a[base+0][i] & 0xF8 ) - ( a[base+1][i] & 0xF8 ) ) >> 3;
1438 c &= ~0xFFFFFFF8;
1439 d |= ((uint64_t)c) << ( i*8 );
1440 }
1441 }
1442 _d = d;
1443}
1444
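// If all 16 pixels of the block are identical, returns the ETC1 encoding of that solid color;
// otherwise returns 0.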
1445static etcpak_force_inline uint64_t CheckSolid( const uint8_t* src )
1446{
1447#ifdef __SSE4_1__
1448 __m128i d0 = _mm_loadu_si128(((__m128i*)src) + 0);
1449 __m128i d1 = _mm_loadu_si128(((__m128i*)src) + 1);
1450 __m128i d2 = _mm_loadu_si128(((__m128i*)src) + 2);
1451 __m128i d3 = _mm_loadu_si128(((__m128i*)src) + 3);
1452
1453 __m128i c = _mm_shuffle_epi32(d0, _MM_SHUFFLE(0, 0, 0, 0));
1454
1455 __m128i c0 = _mm_cmpeq_epi8(d0, c);
1456 __m128i c1 = _mm_cmpeq_epi8(d1, c);
1457 __m128i c2 = _mm_cmpeq_epi8(d2, c);
1458 __m128i c3 = _mm_cmpeq_epi8(d3, c);
1459
1460 __m128i m0 = _mm_and_si128(c0, c1);
1461 __m128i m1 = _mm_and_si128(c2, c3);
1462 __m128i m = _mm_and_si128(m0, m1);
1463
1464 if (!_mm_testc_si128(m, _mm_set1_epi32(-1)))
1465 {
1466 return 0;
1467 }
1468#elif defined __ARM_NEON
1469 int32x4_t d0 = vld1q_s32((int32_t*)src + 0);
1470 int32x4_t d1 = vld1q_s32((int32_t*)src + 4);
1471 int32x4_t d2 = vld1q_s32((int32_t*)src + 8);
1472 int32x4_t d3 = vld1q_s32((int32_t*)src + 12);
1473
1474 int32x4_t c = vdupq_n_s32(d0[0]);
1475
1476 int32x4_t c0 = vreinterpretq_s32_u32(vceqq_s32(d0, c));
1477 int32x4_t c1 = vreinterpretq_s32_u32(vceqq_s32(d1, c));
1478 int32x4_t c2 = vreinterpretq_s32_u32(vceqq_s32(d2, c));
1479 int32x4_t c3 = vreinterpretq_s32_u32(vceqq_s32(d3, c));
1480
1481 int32x4_t m0 = vandq_s32(c0, c1);
1482 int32x4_t m1 = vandq_s32(c2, c3);
1483 int64x2_t m = vreinterpretq_s64_s32(vandq_s32(m0, m1));
1484
1485 if (m[0] != -1 || m[1] != -1)
1486 {
1487 return 0;
1488 }
1489#else
1490 const uint8_t* ptr = src + 4;
1491 for( int i=1; i<16; i++ )
1492 {
1493 if( memcmp( src, ptr, 4 ) != 0 )
1494 {
1495 return 0;
1496 }
1497 ptr += 4;
1498 }
1499#endif
1500 return 0x02000000 |
1501 ( (unsigned int)( src[0] & 0xF8 ) << 16 ) |
1502 ( (unsigned int)( src[1] & 0xF8 ) << 8 ) |
1503 ( (unsigned int)( src[2] & 0xF8 ) );
1504}
1505
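// Computes the subblock averages (plus their differential-mode counterparts) and accumulates the
// approximate error of each of the four mode/flip combinations into err[].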
1506static etcpak_force_inline void PrepareAverages( v4i a[8], const uint8_t* src, unsigned int err[4] )
1507{
1508 Average( src, a );
1509 ProcessAverages( a );
1510
1511 unsigned int errblock[4][4];
1512 CalcErrorBlock( src, errblock );
1513
1514 for( int i=0; i<4; i++ )
1515 {
1516 err[i/2] += CalcError( errblock[i], a[i] );
1517 err[2+i/2] += CalcError( errblock[i], a[i+4] );
1518 }
1519}
1520
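// For every pixel, evaluates all eight ETC1 modifier tables against the pixel's luma-weighted
// distance from its subblock average, accumulating squared error per table (terr) and recording
// the best selector per table (tsel).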
1521static etcpak_force_inline void FindBestFit( uint64_t terr[2][8], uint16_t tsel[16][8], v4i a[8], const uint32_t* id, const uint8_t* data )
1522{
1523 for( size_t i=0; i<16; i++ )
1524 {
1525 uint16_t* sel = tsel[i];
1526 unsigned int bid = id[i];
1527 uint64_t* ter = terr[bid%2];
1528
1529 uint8_t b = *data++;
1530 uint8_t g = *data++;
1531 uint8_t r = *data++;
1532 data++;
1533
1534 int dr = a[bid][0] - r;
1535 int dg = a[bid][1] - g;
1536 int db = a[bid][2] - b;
1537
1538#ifdef __SSE4_1__
1539 // Reference implementation
1540
1541 __m128i pix = _mm_set1_epi32(dr * 77 + dg * 151 + db * 28);
1542 // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
1543 __m128i error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[0]));
1544 __m128i error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[1]));
1545 __m128i error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[0]));
1546 __m128i error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[1]));
1547
1548 __m128i index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1));
1549 __m128i minError0 = _mm_min_epi32(error0, error1);
1550
1551 __m128i index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2));
1552 __m128i minError1 = _mm_min_epi32(error2, error3);
1553
1554 __m128i minIndex0 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0));
1555 __m128i minError = _mm_min_epi32(minError0, minError1);
1556
1557 // Squaring the minimum error to produce correct values when adding
1558 __m128i minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0));
1559 __m128i squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow);
1560 squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0));
1561 _mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow);
1562 __m128i minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2));
1563 __m128i squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh);
1564 squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1));
1565 _mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh);
1566
1567 // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
1568 error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[2]));
1569 error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[3]));
1570 error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[2]));
1571 error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[3]));
1572
1573 index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1));
1574 minError0 = _mm_min_epi32(error0, error1);
1575
1576 index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2));
1577 minError1 = _mm_min_epi32(error2, error3);
1578
1579 __m128i minIndex1 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0));
1580 minError = _mm_min_epi32(minError0, minError1);
1581
1582 // Squaring the minimum error to produce correct values when adding
1583 minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0));
1584 squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow);
1585 squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 2));
1586 _mm_storeu_si128(((__m128i*)ter) + 2, squareErrorLow);
1587 minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2));
1588 squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh);
1589 squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 3));
1590 _mm_storeu_si128(((__m128i*)ter) + 3, squareErrorHigh);
1591 __m128i minIndex = _mm_packs_epi32(minIndex0, minIndex1);
1592 _mm_storeu_si128((__m128i*)sel, minIndex);
1593#elif defined __ARM_NEON
1594 int32x4_t pix = vdupq_n_s32(dr * 77 + dg * 151 + db * 28);
1595
1596 // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
1597 uint32x4_t error0 = vreinterpretq_u32_s32(vabsq_s32(vaddq_s32(pix, g_table256_NEON[0])));
1598 uint32x4_t error1 = vreinterpretq_u32_s32(vabsq_s32(vaddq_s32(pix, g_table256_NEON[1])));
1599 uint32x4_t error2 = vreinterpretq_u32_s32(vabsq_s32(vsubq_s32(pix, g_table256_NEON[0])));
1600 uint32x4_t error3 = vreinterpretq_u32_s32(vabsq_s32(vsubq_s32(pix, g_table256_NEON[1])));
1601
1602 uint32x4_t index0 = vandq_u32(vcltq_u32(error1, error0), vdupq_n_u32(1));
1603 uint32x4_t minError0 = vminq_u32(error0, error1);
1604
1605 uint32x4_t index1 = vreinterpretq_u32_s32(vsubq_s32(vdupq_n_s32(2), vreinterpretq_s32_u32(vcltq_u32(error3, error2))));
1606 uint32x4_t minError1 = vminq_u32(error2, error3);
1607
1608 uint32x4_t blendMask = vcltq_u32(minError1, minError0);
1609 uint32x4_t minIndex0 = vorrq_u32(vbicq_u32(index0, blendMask), vandq_u32(index1, blendMask));
1610 uint32x4_t minError = vminq_u32(minError0, minError1);
1611
1612 // Squaring the minimum error to produce correct values when adding
1613 uint32x4_t squareErrorLow = vmulq_u32(minError, minError);
1614 uint32x4_t squareErrorHigh = vshrq_n_u32(vreinterpretq_u32_s32(vqdmulhq_s32(vreinterpretq_s32_u32(minError), vreinterpretq_s32_u32(minError))), 1);
1615 uint32x4x2_t squareErrorZip = vzipq_u32(squareErrorLow, squareErrorHigh);
1616 uint64x2x2_t squareError = { vreinterpretq_u64_u32(squareErrorZip.val[0]), vreinterpretq_u64_u32(squareErrorZip.val[1]) };
1617 squareError.val[0] = vaddq_u64(squareError.val[0], vld1q_u64(ter + 0));
1618 squareError.val[1] = vaddq_u64(squareError.val[1], vld1q_u64(ter + 2));
1619 vst1q_u64(ter + 0, squareError.val[0]);
1620 vst1q_u64(ter + 2, squareError.val[1]);
1621
1622 // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
1623 error0 = vreinterpretq_u32_s32( vabsq_s32(vaddq_s32(pix, g_table256_NEON[2])));
1624 error1 = vreinterpretq_u32_s32( vabsq_s32(vaddq_s32(pix, g_table256_NEON[3])));
1625 error2 = vreinterpretq_u32_s32( vabsq_s32(vsubq_s32(pix, g_table256_NEON[2])));
1626 error3 = vreinterpretq_u32_s32( vabsq_s32(vsubq_s32(pix, g_table256_NEON[3])));
1627
1628 index0 = vandq_u32(vcltq_u32(error1, error0), vdupq_n_u32(1));
1629 minError0 = vminq_u32(error0, error1);
1630
1631 index1 = vreinterpretq_u32_s32( vsubq_s32(vdupq_n_s32(2), vreinterpretq_s32_u32(vcltq_u32(error3, error2))) );
1632 minError1 = vminq_u32(error2, error3);
1633
1634 blendMask = vcltq_u32(minError1, minError0);
1635 uint32x4_t minIndex1 = vorrq_u32(vbicq_u32(index0, blendMask), vandq_u32(index1, blendMask));
1636 minError = vminq_u32(minError0, minError1);
1637
1638 // Squaring the minimum error to produce correct values when adding
1639 squareErrorLow = vmulq_u32(minError, minError);
1640 squareErrorHigh = vshrq_n_u32(vreinterpretq_u32_s32( vqdmulhq_s32(vreinterpretq_s32_u32(minError), vreinterpretq_s32_u32(minError)) ), 1 );
1641 squareErrorZip = vzipq_u32(squareErrorLow, squareErrorHigh);
1642 squareError.val[0] = vaddq_u64(vreinterpretq_u64_u32( squareErrorZip.val[0] ), vld1q_u64(ter + 4));
1643 squareError.val[1] = vaddq_u64(vreinterpretq_u64_u32( squareErrorZip.val[1] ), vld1q_u64(ter + 6));
1644 vst1q_u64(ter + 4, squareError.val[0]);
1645 vst1q_u64(ter + 6, squareError.val[1]);
1646
1647 uint16x8_t minIndex = vcombine_u16(vqmovn_u32(minIndex0), vqmovn_u32(minIndex1));
1648 vst1q_u16(sel, minIndex);
1649#else
1650 int pix = dr * 77 + dg * 151 + db * 28;
1651
1652 for( int t=0; t<8; t++ )
1653 {
1654 const int64_t* tab = g_table256[t];
1655 unsigned int idx = 0;
1656 uint64_t err = sq( tab[0] + pix );
1657 for( int j=1; j<4; j++ )
1658 {
1659 uint64_t local = sq( tab[j] + pix );
1660 if( local < err )
1661 {
1662 err = local;
1663 idx = j;
1664 }
1665 }
1666 *sel++ = idx;
1667 *ter++ += err;
1668 }
1669#endif
1670 }
1671}
1672
1673#if defined __SSE4_1__ || defined __ARM_NEON
// Non-reference implementation, but faster. Produces the same results as the AVX2 version.
1675static etcpak_force_inline void FindBestFit( uint32_t terr[2][8], uint16_t tsel[16][8], v4i a[8], const uint32_t* id, const uint8_t* data )
1676{
1677 for( size_t i=0; i<16; i++ )
1678 {
1679 uint16_t* sel = tsel[i];
1680 unsigned int bid = id[i];
1681 uint32_t* ter = terr[bid%2];
1682
1683 uint8_t b = *data++;
1684 uint8_t g = *data++;
1685 uint8_t r = *data++;
1686 data++;
1687
1688 int dr = a[bid][0] - r;
1689 int dg = a[bid][1] - g;
1690 int db = a[bid][2] - b;
1691
1692#ifdef __SSE4_1__
1693 // The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16
        // This produces slightly different results, but is significantly faster
1695 __m128i pixel = _mm_set1_epi16(dr * 38 + dg * 76 + db * 14);
1696 __m128i pix = _mm_abs_epi16(pixel);
1697
1698 // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
1699 // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
1700 __m128i error0 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[0]));
1701 __m128i error1 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[1]));
1702
1703 __m128i index = _mm_and_si128(_mm_cmplt_epi16(error1, error0), _mm_set1_epi16(1));
1704 __m128i minError = _mm_min_epi16(error0, error1);
1705
        // Exploit the symmetry of the selector table and use the sign bit.
        // This produces slightly different results, but is needed to produce the same results as the AVX2 implementation.
1708 __m128i indexBit = _mm_andnot_si128(_mm_srli_epi16(pixel, 15), _mm_set1_epi8(-1));
1709 __m128i minIndex = _mm_or_si128(index, _mm_add_epi16(indexBit, indexBit));
1710
1711 // Squaring the minimum error to produce correct values when adding
1712 __m128i squareErrorLo = _mm_mullo_epi16(minError, minError);
1713 __m128i squareErrorHi = _mm_mulhi_epi16(minError, minError);
1714
1715 __m128i squareErrorLow = _mm_unpacklo_epi16(squareErrorLo, squareErrorHi);
1716 __m128i squareErrorHigh = _mm_unpackhi_epi16(squareErrorLo, squareErrorHi);
1717
1718 squareErrorLow = _mm_add_epi32(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0));
1719 _mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow);
1720 squareErrorHigh = _mm_add_epi32(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1));
1721 _mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh);
1722
1723 _mm_storeu_si128((__m128i*)sel, minIndex);
1724#elif defined __ARM_NEON
1725 int16x8_t pixel = vdupq_n_s16( dr * 38 + dg * 76 + db * 14 );
1726 int16x8_t pix = vabsq_s16( pixel );
1727
1728 int16x8_t error0 = vabsq_s16( vsubq_s16( pix, g_table128_NEON[0] ) );
1729 int16x8_t error1 = vabsq_s16( vsubq_s16( pix, g_table128_NEON[1] ) );
1730
1731 int16x8_t index = vandq_s16( vreinterpretq_s16_u16( vcltq_s16( error1, error0 ) ), vdupq_n_s16( 1 ) );
1732 int16x8_t minError = vminq_s16( error0, error1 );
1733
1734 int16x8_t indexBit = vandq_s16( vmvnq_s16( vshrq_n_s16( pixel, 15 ) ), vdupq_n_s16( -1 ) );
1735 int16x8_t minIndex = vorrq_s16( index, vaddq_s16( indexBit, indexBit ) );
1736
1737 int16x4_t minErrorLow = vget_low_s16( minError );
1738 int16x4_t minErrorHigh = vget_high_s16( minError );
1739
1740 int32x4_t squareErrorLow = vmull_s16( minErrorLow, minErrorLow );
1741 int32x4_t squareErrorHigh = vmull_s16( minErrorHigh, minErrorHigh );
1742
1743 int32x4_t squareErrorSumLow = vaddq_s32( squareErrorLow, vld1q_s32( (int32_t*)ter ) );
1744 int32x4_t squareErrorSumHigh = vaddq_s32( squareErrorHigh, vld1q_s32( (int32_t*)ter + 4 ) );
1745
1746 vst1q_s32( (int32_t*)ter, squareErrorSumLow );
1747 vst1q_s32( (int32_t*)ter + 4, squareErrorSumHigh );
1748
1749 vst1q_s16( (int16_t*)sel, minIndex );
1750#endif
1751 }
1752}
1753#endif
1754
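// Rounded quantization of a planar-fit color component to 6 or 7 bits; the input is first clamped
// to the 0..1023 working range.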
1755static etcpak_force_inline uint8_t convert6(float f)
1756{
1757 int i = (std::min(std::max(static_cast<int>(f), 0), 1023) - 15) >> 1;
1758 return (i + 11 - ((i + 11) >> 7) - ((i + 4) >> 7)) >> 3;
1759}
1760
1761static etcpak_force_inline uint8_t convert7(float f)
1762{
1763 int i = (std::min(std::max(static_cast<int>(f), 0), 1023) - 15) >> 1;
1764 return (i + 9 - ((i + 9) >> 8) - ((i + 6) >> 8)) >> 2;
1765}
1766
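// Planar-mode encoder: least-squares fits a color gradient over the 4x4 block, quantizes the three
// corner colors (O, H, V) to RGB676, optionally estimates the block error when mode heuristics are
// enabled, and packs the result into the planar bit layout.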
1767static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar( const uint8_t* src, const uint8_t mode, bool useHeuristics )
1768{
1769 int32_t r = 0;
1770 int32_t g = 0;
1771 int32_t b = 0;
1772
1773 for( int i = 0; i < 16; ++i )
1774 {
1775 b += src[i * 4 + 0];
1776 g += src[i * 4 + 1];
1777 r += src[i * 4 + 2];
1778 }
1779
1780 int32_t difRyz = 0;
1781 int32_t difGyz = 0;
1782 int32_t difByz = 0;
1783 int32_t difRxz = 0;
1784 int32_t difGxz = 0;
1785 int32_t difBxz = 0;
1786
1787 const int32_t scaling[] = { -255, -85, 85, 255 };
1788
1789 for (int i = 0; i < 16; ++i)
1790 {
1791 int32_t difB = (static_cast<int>(src[i * 4 + 0]) << 4) - b;
1792 int32_t difG = (static_cast<int>(src[i * 4 + 1]) << 4) - g;
1793 int32_t difR = (static_cast<int>(src[i * 4 + 2]) << 4) - r;
1794
1795 difRyz += difR * scaling[i % 4];
1796 difGyz += difG * scaling[i % 4];
1797 difByz += difB * scaling[i % 4];
1798
1799 difRxz += difR * scaling[i / 4];
1800 difGxz += difG * scaling[i / 4];
1801 difBxz += difB * scaling[i / 4];
1802 }
1803
1804 const float scale = -4.0f / ((255 * 255 * 8.0f + 85 * 85 * 8.0f) * 16.0f);
1805
1806 float aR = difRxz * scale;
1807 float aG = difGxz * scale;
1808 float aB = difBxz * scale;
1809
1810 float bR = difRyz * scale;
1811 float bG = difGyz * scale;
1812 float bB = difByz * scale;
1813
1814 float dR = r * (4.0f / 16.0f);
1815 float dG = g * (4.0f / 16.0f);
1816 float dB = b * (4.0f / 16.0f);
1817
1818 // calculating the three colors RGBO, RGBH, and RGBV. RGB = df - af * x - bf * y;
1819 float cofR = std::fma(aR, 255.0f, std::fma(bR, 255.0f, dR));
1820 float cofG = std::fma(aG, 255.0f, std::fma(bG, 255.0f, dG));
1821 float cofB = std::fma(aB, 255.0f, std::fma(bB, 255.0f, dB));
1822 float chfR = std::fma(aR, -425.0f, std::fma(bR, 255.0f, dR));
1823 float chfG = std::fma(aG, -425.0f, std::fma(bG, 255.0f, dG));
1824 float chfB = std::fma(aB, -425.0f, std::fma(bB, 255.0f, dB));
1825 float cvfR = std::fma(aR, 255.0f, std::fma(bR, -425.0f, dR));
1826 float cvfG = std::fma(aG, 255.0f, std::fma(bG, -425.0f, dG));
1827 float cvfB = std::fma(aB, 255.0f, std::fma(bB, -425.0f, dB));
1828
1829 // convert to r6g7b6
1830 int32_t coR = convert6(cofR);
1831 int32_t coG = convert7(cofG);
1832 int32_t coB = convert6(cofB);
1833 int32_t chR = convert6(chfR);
1834 int32_t chG = convert7(chfG);
1835 int32_t chB = convert6(chfB);
1836 int32_t cvR = convert6(cvfR);
1837 int32_t cvG = convert7(cvfG);
1838 int32_t cvB = convert6(cvfB);
1839
1840 // Error calculation
1841 uint64_t error = 0;
1842 if( ModePlanar != mode && useHeuristics )
1843 {
1844 auto ro0 = coR;
1845 auto go0 = coG;
1846 auto bo0 = coB;
1847 auto ro1 = ( ro0 >> 4 ) | ( ro0 << 2 );
1848 auto go1 = ( go0 >> 6 ) | ( go0 << 1 );
1849 auto bo1 = ( bo0 >> 4 ) | ( bo0 << 2 );
1850 auto ro2 = ( ro1 << 2 ) + 2;
1851 auto go2 = ( go1 << 2 ) + 2;
1852 auto bo2 = ( bo1 << 2 ) + 2;
1853
1854 auto rh0 = chR;
1855 auto gh0 = chG;
1856 auto bh0 = chB;
1857 auto rh1 = ( rh0 >> 4 ) | ( rh0 << 2 );
1858 auto gh1 = ( gh0 >> 6 ) | ( gh0 << 1 );
1859 auto bh1 = ( bh0 >> 4 ) | ( bh0 << 2 );
1860
1861 auto rh2 = rh1 - ro1;
1862 auto gh2 = gh1 - go1;
1863 auto bh2 = bh1 - bo1;
1864
1865 auto rv0 = cvR;
1866 auto gv0 = cvG;
1867 auto bv0 = cvB;
1868 auto rv1 = ( rv0 >> 4 ) | ( rv0 << 2 );
1869 auto gv1 = ( gv0 >> 6 ) | ( gv0 << 1 );
1870 auto bv1 = ( bv0 >> 4 ) | ( bv0 << 2 );
1871
1872 auto rv2 = rv1 - ro1;
1873 auto gv2 = gv1 - go1;
1874 auto bv2 = bv1 - bo1;
1875 for( int i = 0; i < 16; ++i )
1876 {
1877 int32_t cR = clampu8( ( rh2 * ( i / 4 ) + rv2 * ( i % 4 ) + ro2 ) >> 2 );
1878 int32_t cG = clampu8( ( gh2 * ( i / 4 ) + gv2 * ( i % 4 ) + go2 ) >> 2 );
1879 int32_t cB = clampu8( ( bh2 * ( i / 4 ) + bv2 * ( i % 4 ) + bo2 ) >> 2 );
1880
1881 int32_t difB = static_cast<int>( src[i * 4 + 0] ) - cB;
1882 int32_t difG = static_cast<int>( src[i * 4 + 1] ) - cG;
1883 int32_t difR = static_cast<int>( src[i * 4 + 2] ) - cR;
1884
1885 int32_t dif = difR * 38 + difG * 76 + difB * 14;
1886
1887 error += dif * dif;
1888 }
1889 }
1890
1891 /**/
1892 uint32_t rgbv = cvB | ( cvG << 6 ) | ( cvR << 13 );
1893 uint32_t rgbh = chB | ( chG << 6 ) | ( chR << 13 );
1894 uint32_t hi = rgbv | ( ( rgbh & 0x1FFF ) << 19 );
1895 uint32_t lo = ( chR & 0x1 ) | 0x2 | ( ( chR << 1 ) & 0x7C );
1896 lo |= ( ( coB & 0x07 ) << 7 ) | ( ( coB & 0x18 ) << 8 ) | ( ( coB & 0x20 ) << 11 );
1897 lo |= ( ( coG & 0x3F ) << 17 ) | ( ( coG & 0x40 ) << 18 );
1898 lo |= coR << 25;
1899
1900 const auto idx = ( coR & 0x20 ) | ( ( coG & 0x20 ) >> 1 ) | ( ( coB & 0x1E ) >> 1 );
1901
1902 lo |= g_flags[idx];
1903
1904 uint64_t result = static_cast<uint32_t>( _bswap( lo ) );
1905 result |= static_cast<uint64_t>( static_cast<uint32_t>( _bswap( hi ) ) ) << 32;
1906
1907 return std::make_pair( result, error );
1908}
1909
1910#ifdef __ARM_NEON
1911
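// NEON helpers for Planar_NEON: DifXZ reduces the differences weighted per group of four pixels
// (scaling[i / 4] in the scalar path), DifYZ weights them within each group (scaling[i % 4]), and
// SumWide broadcasts the per-channel sum of the block.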
1912static etcpak_force_inline int32x2_t Planar_NEON_DifXZ( int16x8_t dif_lo, int16x8_t dif_hi )
1913{
1914 int32x4_t dif0 = vmull_n_s16( vget_low_s16( dif_lo ), -255 );
1915 int32x4_t dif1 = vmull_n_s16( vget_high_s16( dif_lo ), -85 );
1916 int32x4_t dif2 = vmull_n_s16( vget_low_s16( dif_hi ), 85 );
1917 int32x4_t dif3 = vmull_n_s16( vget_high_s16( dif_hi ), 255 );
1918 int32x4_t dif4 = vaddq_s32( vaddq_s32( dif0, dif1 ), vaddq_s32( dif2, dif3 ) );
1919
1920#ifndef __aarch64__
1921 int32x2_t dif5 = vpadd_s32( vget_low_s32( dif4 ), vget_high_s32( dif4 ) );
1922 return vpadd_s32( dif5, dif5 );
1923#else
1924 return vdup_n_s32( vaddvq_s32( dif4 ) );
1925#endif
1926}
1927
1928static etcpak_force_inline int32x2_t Planar_NEON_DifYZ( int16x8_t dif_lo, int16x8_t dif_hi )
1929{
1930 int16x4_t scaling = { -255, -85, 85, 255 };
1931 int32x4_t dif0 = vmull_s16( vget_low_s16( dif_lo ), scaling );
1932 int32x4_t dif1 = vmull_s16( vget_high_s16( dif_lo ), scaling );
1933 int32x4_t dif2 = vmull_s16( vget_low_s16( dif_hi ), scaling );
1934 int32x4_t dif3 = vmull_s16( vget_high_s16( dif_hi ), scaling );
1935 int32x4_t dif4 = vaddq_s32( vaddq_s32( dif0, dif1 ), vaddq_s32( dif2, dif3 ) );
1936
1937#ifndef __aarch64__
1938 int32x2_t dif5 = vpadd_s32( vget_low_s32( dif4 ), vget_high_s32( dif4 ) );
1939 return vpadd_s32( dif5, dif5 );
1940#else
1941 return vdup_n_s32( vaddvq_s32( dif4 ) );
1942#endif
1943}
1944
1945static etcpak_force_inline int16x8_t Planar_NEON_SumWide( uint8x16_t src )
1946{
1947 uint16x8_t accu8 = vpaddlq_u8( src );
1948#ifndef __aarch64__
1949 uint16x4_t accu4 = vpadd_u16( vget_low_u16( accu8 ), vget_high_u16( accu8 ) );
1950 uint16x4_t accu2 = vpadd_u16( accu4, accu4 );
1951 uint16x4_t accu1 = vpadd_u16( accu2, accu2 );
1952 return vreinterpretq_s16_u16( vcombine_u16( accu1, accu1 ) );
1953#else
1954 return vdupq_n_s16( vaddvq_u16( accu8 ) );
1955#endif
1956}
1957
1958static etcpak_force_inline int16x8_t convert6_NEON( int32x4_t lo, int32x4_t hi )
1959{
1960 uint16x8_t x = vcombine_u16( vqmovun_s32( lo ), vqmovun_s32( hi ) );
1961 int16x8_t i = vreinterpretq_s16_u16( vshrq_n_u16( vqshlq_n_u16( x, 6 ), 6) ); // clamp 0-1023
1962 i = vhsubq_s16( i, vdupq_n_s16( 15 ) );
1963
1964 int16x8_t ip11 = vaddq_s16( i, vdupq_n_s16( 11 ) );
1965 int16x8_t ip4 = vaddq_s16( i, vdupq_n_s16( 4 ) );
1966
1967 return vshrq_n_s16( vsubq_s16( vsubq_s16( ip11, vshrq_n_s16( ip11, 7 ) ), vshrq_n_s16( ip4, 7) ), 3 );
1968}
1969
1970static etcpak_force_inline int16x4_t convert7_NEON( int32x4_t x )
1971{
1972 int16x4_t i = vreinterpret_s16_u16( vshr_n_u16( vqshl_n_u16( vqmovun_s32( x ), 6 ), 6 ) ); // clamp 0-1023
1973 i = vhsub_s16( i, vdup_n_s16( 15 ) );
1974
1975 int16x4_t p9 = vadd_s16( i, vdup_n_s16( 9 ) );
1976 int16x4_t p6 = vadd_s16( i, vdup_n_s16( 6 ) );
1977 return vshr_n_s16( vsub_s16( vsub_s16( p9, vshr_n_s16( p9, 8 ) ), vshr_n_s16( p6, 8 ) ), 2 );
1978}
1979
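// NEON counterpart of Planar(): computes the planar fit, quantization, and optional error estimate
// with vector arithmetic.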
1980static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar_NEON( const uint8_t* src, const uint8_t mode, bool useHeuristics )
1981{
1982 uint8x16x4_t srcBlock = vld4q_u8( src );
1983
1984 int16x8_t bSumWide = Planar_NEON_SumWide( srcBlock.val[0] );
1985 int16x8_t gSumWide = Planar_NEON_SumWide( srcBlock.val[1] );
1986 int16x8_t rSumWide = Planar_NEON_SumWide( srcBlock.val[2] );
1987
1988 int16x8_t dif_R_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[2] ), 4) ), rSumWide );
1989 int16x8_t dif_R_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[2] ), 4) ), rSumWide );
1990
1991 int16x8_t dif_G_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[1] ), 4 ) ), gSumWide );
1992 int16x8_t dif_G_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[1] ), 4 ) ), gSumWide );
1993
1994 int16x8_t dif_B_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[0] ), 4) ), bSumWide );
1995 int16x8_t dif_B_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[0] ), 4) ), bSumWide );
1996
1997 int32x2x2_t dif_xz_z = vzip_s32( vzip_s32( Planar_NEON_DifXZ( dif_B_lo, dif_B_hi ), Planar_NEON_DifXZ( dif_R_lo, dif_R_hi ) ).val[0], Planar_NEON_DifXZ( dif_G_lo, dif_G_hi ) );
1998 int32x4_t dif_xz = vcombine_s32( dif_xz_z.val[0], dif_xz_z.val[1] );
1999 int32x2x2_t dif_yz_z = vzip_s32( vzip_s32( Planar_NEON_DifYZ( dif_B_lo, dif_B_hi ), Planar_NEON_DifYZ( dif_R_lo, dif_R_hi ) ).val[0], Planar_NEON_DifYZ( dif_G_lo, dif_G_hi ) );
2000 int32x4_t dif_yz = vcombine_s32( dif_yz_z.val[0], dif_yz_z.val[1] );
2001
2002 const float fscale = -4.0f / ( (255 * 255 * 8.0f + 85 * 85 * 8.0f ) * 16.0f );
2003 float32x4_t fa = vmulq_n_f32( vcvtq_f32_s32( dif_xz ), fscale );
2004 float32x4_t fb = vmulq_n_f32( vcvtq_f32_s32( dif_yz ), fscale );
2005 int16x4_t bgrgSum = vzip_s16( vzip_s16( vget_low_s16( bSumWide ), vget_low_s16( rSumWide ) ).val[0], vget_low_s16( gSumWide ) ).val[0];
2006 float32x4_t fd = vmulq_n_f32( vcvtq_f32_s32( vmovl_s16( bgrgSum ) ), 4.0f / 16.0f);
2007
2008 float32x4_t cof = vmlaq_n_f32( vmlaq_n_f32( fd, fb, 255.0f ), fa, 255.0f );
2009 float32x4_t chf = vmlaq_n_f32( vmlaq_n_f32( fd, fb, 255.0f ), fa, -425.0f );
2010 float32x4_t cvf = vmlaq_n_f32( vmlaq_n_f32( fd, fb, -425.0f ), fa, 255.0f );
2011
2012 int32x4_t coi = vcvtq_s32_f32( cof );
2013 int32x4_t chi = vcvtq_s32_f32( chf );
2014 int32x4_t cvi = vcvtq_s32_f32( cvf );
2015
2016 int32x4x2_t tr_hv = vtrnq_s32( chi, cvi );
2017 int32x4x2_t tr_o = vtrnq_s32( coi, coi );
2018
2019 int16x8_t c_hvoo_br_6 = convert6_NEON( tr_hv.val[0], tr_o.val[0] );
2020 int16x4_t c_hvox_g_7 = convert7_NEON( vcombine_s32( vget_low_s32( tr_hv.val[1] ), vget_low_s32( tr_o.val[1] ) ) );
2021 int16x8_t c_hvoo_br_8 = vorrq_s16( vshrq_n_s16( c_hvoo_br_6, 4 ), vshlq_n_s16( c_hvoo_br_6, 2 ) );
2022 int16x4_t c_hvox_g_8 = vorr_s16( vshr_n_s16( c_hvox_g_7, 6 ), vshl_n_s16( c_hvox_g_7, 1 ) );
2023
2024 uint64_t error = 0;
2025 if( mode != ModePlanar && useHeuristics )
2026 {
2027 int16x4_t rec_gxbr_o = vext_s16( c_hvox_g_8, vget_high_s16( c_hvoo_br_8 ), 3 );
2028
2029 rec_gxbr_o = vadd_s16( vshl_n_s16( rec_gxbr_o, 2 ), vdup_n_s16( 2 ) );
2030 int16x8_t rec_ro_wide = vdupq_lane_s16( rec_gxbr_o, 3 );
2031 int16x8_t rec_go_wide = vdupq_lane_s16( rec_gxbr_o, 0 );
2032 int16x8_t rec_bo_wide = vdupq_lane_s16( rec_gxbr_o, 1 );
2033
2034 int16x4_t br_hv2 = vsub_s16( vget_low_s16( c_hvoo_br_8 ), vget_high_s16( c_hvoo_br_8 ) );
2035 int16x4_t gg_hv2 = vsub_s16( c_hvox_g_8, vdup_lane_s16( c_hvox_g_8, 2 ) );
2036
2037 int16x8_t scaleh_lo = { 0, 0, 0, 0, 1, 1, 1, 1 };
2038 int16x8_t scaleh_hi = { 2, 2, 2, 2, 3, 3, 3, 3 };
2039 int16x8_t scalev = { 0, 1, 2, 3, 0, 1, 2, 3 };
2040
2041 int16x8_t rec_r_1 = vmlaq_lane_s16( rec_ro_wide, scalev, br_hv2, 3 );
2042 int16x8_t rec_r_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_r_1, scaleh_lo, br_hv2, 2 ), 2 ) ) );
2043 int16x8_t rec_r_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_r_1, scaleh_hi, br_hv2, 2 ), 2 ) ) );
2044
2045 int16x8_t rec_b_1 = vmlaq_lane_s16( rec_bo_wide, scalev, br_hv2, 1 );
2046 int16x8_t rec_b_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_b_1, scaleh_lo, br_hv2, 0 ), 2 ) ) );
2047 int16x8_t rec_b_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_b_1, scaleh_hi, br_hv2, 0 ), 2 ) ) );
2048
2049 int16x8_t rec_g_1 = vmlaq_lane_s16( rec_go_wide, scalev, gg_hv2, 1 );
2050 int16x8_t rec_g_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_g_1, scaleh_lo, gg_hv2, 0 ), 2 ) ) );
2051 int16x8_t rec_g_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_g_1, scaleh_hi, gg_hv2, 0 ), 2 ) ) );
2052
2053 int16x8_t dif_r_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[2] ) ) ), rec_r_lo );
2054 int16x8_t dif_r_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[2] ) ) ), rec_r_hi );
2055
2056 int16x8_t dif_g_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[1] ) ) ), rec_g_lo );
2057 int16x8_t dif_g_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[1] ) ) ), rec_g_hi );
2058
2059 int16x8_t dif_b_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[0] ) ) ), rec_b_lo );
2060 int16x8_t dif_b_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[0] ) ) ), rec_b_hi );
2061
2062 int16x8_t dif_lo = vmlaq_n_s16( vmlaq_n_s16( vmulq_n_s16( dif_r_lo, 38 ), dif_g_lo, 76 ), dif_b_lo, 14 );
2063 int16x8_t dif_hi = vmlaq_n_s16( vmlaq_n_s16( vmulq_n_s16( dif_r_hi, 38 ), dif_g_hi, 76 ), dif_b_hi, 14 );
2064
2065 int16x4_t tmpDif = vget_low_s16( dif_lo );
2066 int32x4_t difsq_0 = vmull_s16( tmpDif, tmpDif );
2067 tmpDif = vget_high_s16( dif_lo );
2068 int32x4_t difsq_1 = vmull_s16( tmpDif, tmpDif );
2069 tmpDif = vget_low_s16( dif_hi );
2070 int32x4_t difsq_2 = vmull_s16( tmpDif, tmpDif );
2071 tmpDif = vget_high_s16( dif_hi );
2072 int32x4_t difsq_3 = vmull_s16( tmpDif, tmpDif );
2073
2074 uint32x4_t difsq_5 = vaddq_u32( vreinterpretq_u32_s32( difsq_0 ), vreinterpretq_u32_s32( difsq_1 ) );
2075 uint32x4_t difsq_6 = vaddq_u32( vreinterpretq_u32_s32( difsq_2 ), vreinterpretq_u32_s32( difsq_3 ) );
2076
2077 uint64x2_t difsq_7 = vaddl_u32( vget_low_u32( difsq_5 ), vget_high_u32( difsq_5 ) );
2078 uint64x2_t difsq_8 = vaddl_u32( vget_low_u32( difsq_6 ), vget_high_u32( difsq_6 ) );
2079
2080 uint64x2_t difsq_9 = vaddq_u64( difsq_7, difsq_8 );
2081
2082#ifdef __aarch64__
2083 error = vaddvq_u64( difsq_9 );
2084#else
2085 error = vgetq_lane_u64( difsq_9, 0 ) + vgetq_lane_u64( difsq_9, 1 );
2086#endif
2087 }
2088
2089 int32_t coR = c_hvoo_br_6[6];
2090 int32_t coG = c_hvox_g_7[2];
2091 int32_t coB = c_hvoo_br_6[4];
2092
2093 int32_t chR = c_hvoo_br_6[2];
2094 int32_t chG = c_hvox_g_7[0];
2095 int32_t chB = c_hvoo_br_6[0];
2096
2097 int32_t cvR = c_hvoo_br_6[3];
2098 int32_t cvG = c_hvox_g_7[1];
2099 int32_t cvB = c_hvoo_br_6[1];
2100
2101 uint32_t rgbv = cvB | ( cvG << 6 ) | ( cvR << 13 );
2102 uint32_t rgbh = chB | ( chG << 6 ) | ( chR << 13 );
2103 uint32_t hi = rgbv | ( ( rgbh & 0x1FFF ) << 19 );
2104 uint32_t lo = ( chR & 0x1 ) | 0x2 | ( ( chR << 1 ) & 0x7C );
2105 lo |= ( ( coB & 0x07 ) << 7 ) | ( ( coB & 0x18 ) << 8 ) | ( ( coB & 0x20 ) << 11 );
    lo |= ( ( coG & 0x3F ) << 17 ) | ( ( coG & 0x40 ) << 18 );
2107 lo |= coR << 25;
2108
2109 const auto idx = ( coR & 0x20 ) | ( ( coG & 0x20 ) >> 1 ) | ( ( coB & 0x1E ) >> 1 );
2110
2111 lo |= g_flags[idx];
2112
2113 uint64_t result = static_cast<uint32_t>( _bswap(lo) );
2114 result |= static_cast<uint64_t>( static_cast<uint32_t>( _bswap( hi ) ) ) << 32;
2115
2116 return std::make_pair( result, error );
2117}
2118
2119#endif
2120
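// Evaluates the T-/H-mode palette for each candidate distance (starting at startDist and stopping
// early once the previously tested distance brought no improvement), using luma-weighted squared
// pixel errors. Returns the lowest block error and fills in the chosen distance and the 2-bit
// palette index of every pixel.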
2121#ifdef __AVX2__
2122uint32_t calculateErrorTH( bool tMode, uint8_t( colorsRGB444 )[2][3], uint8_t& dist, uint32_t& pixIndices, uint8_t startDist, __m128i r8, __m128i g8, __m128i b8 )
2123#else
2124uint32_t calculateErrorTH( bool tMode, uint8_t* src, uint8_t( colorsRGB444 )[2][3], uint8_t& dist, uint32_t& pixIndices, uint8_t startDist )
2125#endif
2126{
2127 uint32_t blockErr = 0, bestBlockErr = MaxError;
2128
2129 uint32_t pixColors;
2130 uint8_t possibleColors[4][3];
2131 uint8_t colors[2][3];
2132
2133 decompressColor( colorsRGB444, colors );
2134
2135#ifdef __AVX2__
2136 __m128i reverseMask = _mm_set_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 );
2137#endif
2138
2139 // test distances
2140 for( uint8_t d = startDist; d < 8; ++d )
2141 {
2142 if( d >= 2 && dist == d - 2 ) break;
2143
2144 blockErr = 0;
2145 pixColors = 0;
2146
2147 if( tMode )
2148 {
2149 calculatePaintColors59T( d, colors, possibleColors );
2150 }
2151 else
2152 {
2153 calculatePaintColors58H( d, colors, possibleColors );
2154 }
2155
2156#ifdef __AVX2__
2157 // RGB ordering
2158 __m128i b8Rev = _mm_shuffle_epi8( b8, reverseMask );
2159 __m128i g8Rev = _mm_shuffle_epi8( g8, reverseMask );
2160 __m128i r8Rev = _mm_shuffle_epi8( r8, reverseMask );
2161
        // extends 3x128-bit RGB into 3x256-bit RGB for error comparisons
2163 static const __m128i zero = _mm_setzero_si128();
2164 __m128i b8Lo = _mm_unpacklo_epi8( b8Rev, zero );
2165 __m128i g8Lo = _mm_unpacklo_epi8( g8Rev, zero );
2166 __m128i r8Lo = _mm_unpacklo_epi8( r8Rev, zero );
2167 __m128i b8Hi = _mm_unpackhi_epi8( b8Rev, zero );
2168 __m128i g8Hi = _mm_unpackhi_epi8( g8Rev, zero );
2169 __m128i r8Hi = _mm_unpackhi_epi8( r8Rev, zero );
2170
2171 __m256i b8 = _mm256_set_m128i( b8Hi, b8Lo );
2172 __m256i g8 = _mm256_set_m128i( g8Hi, g8Lo );
2173 __m256i r8 = _mm256_set_m128i( r8Hi, r8Lo );
2174
        // calculates differences between the pixel colors and the palette colors
2176 __m256i diffb = _mm256_abs_epi16( _mm256_sub_epi16( b8, _mm256_set1_epi16( possibleColors[0][B] ) ) );
2177 __m256i diffg = _mm256_abs_epi16( _mm256_sub_epi16( g8, _mm256_set1_epi16( possibleColors[0][G] ) ) );
2178 __m256i diffr = _mm256_abs_epi16( _mm256_sub_epi16( r8, _mm256_set1_epi16( possibleColors[0][R] ) ) );
2179
2180 // luma-based error calculations
2181 static const __m256i bWeight = _mm256_set1_epi16( 14 );
2182 static const __m256i gWeight = _mm256_set1_epi16( 76 );
2183 static const __m256i rWeight = _mm256_set1_epi16( 38 );
2184
2185 diffb = _mm256_mullo_epi16( diffb, bWeight );
2186 diffg = _mm256_mullo_epi16( diffg, gWeight );
2187 diffr = _mm256_mullo_epi16( diffr, rWeight );
2188
2189 // obtains the error with the current palette color
2190 __m256i lowestPixErr = _mm256_add_epi16( _mm256_add_epi16( diffb, diffg ), diffr );
2191
        // error calculations with the remaining three palette colors
2193 static const uint32_t masks[4] = { 0, 0x55555555, 0xAAAAAAAA, 0xFFFFFFFF };
2194 for( uint8_t c = 1; c < 4; c++ )
2195 {
2196 __m256i diffb = _mm256_abs_epi16( _mm256_sub_epi16( b8, _mm256_set1_epi16( possibleColors[c][B] ) ) );
2197 __m256i diffg = _mm256_abs_epi16( _mm256_sub_epi16( g8, _mm256_set1_epi16( possibleColors[c][G] ) ) );
2198 __m256i diffr = _mm256_abs_epi16( _mm256_sub_epi16( r8, _mm256_set1_epi16( possibleColors[c][R] ) ) );
2199
2200 diffb = _mm256_mullo_epi16( diffb, bWeight );
2201 diffg = _mm256_mullo_epi16( diffg, gWeight );
2202 diffr = _mm256_mullo_epi16( diffr, rWeight );
2203
2204 // error comparison with the previous best color
2205 __m256i pixErrors = _mm256_add_epi16( _mm256_add_epi16( diffb, diffg ), diffr );
2206 __m256i minErr = _mm256_min_epu16( lowestPixErr, pixErrors );
2207 __m256i cmpRes = _mm256_cmpeq_epi16( pixErrors, minErr );
2208 lowestPixErr = minErr;
2209
2210 // update pixel colors
2211 uint32_t updPixColors = _mm256_movemask_epi8( cmpRes );
2212 uint32_t prevPixColors = pixColors & ~updPixColors;
2213 uint32_t mskPixColors = masks[c] & updPixColors;
2214 pixColors = prevPixColors | mskPixColors;
2215 }
2216
2217 // accumulate the block error
2218 alignas( 32 ) uint16_t pixErr16[16] = { 0, };
2219 _mm256_storeu_si256( (__m256i*)pixErr16, lowestPixErr );
2220 for( uint8_t p = 0; p < 16; p++ )
2221 {
2222 blockErr += (int)( pixErr16[p] ) * pixErr16[p];
2223 }
2224#else
2225 for( size_t y = 0; y < 4; ++y )
2226 {
2227 for( size_t x = 0; x < 4; ++x )
2228 {
2229 uint32_t bestPixErr = MaxError;
2230 pixColors <<= 2; // Make room for next value
2231
2232 // Loop possible block colors
2233 for( uint8_t c = 0; c < 4; ++c )
2234 {
2235 int diff[3];
2236 diff[R] = src[4 * ( x * 4 + y ) + R] - possibleColors[c][R];
2237 diff[G] = src[4 * ( x * 4 + y ) + G] - possibleColors[c][G];
2238 diff[B] = src[4 * ( x * 4 + y ) + B] - possibleColors[c][B];
2239
2240 const uint32_t err = 38 * abs( diff[R] ) + 76 * abs( diff[G] ) + 14 * abs( diff[B] );
2241 uint32_t pixErr = err * err;
2242
2243 // Choose best error
2244 if( pixErr < bestPixErr )
2245 {
2246 bestPixErr = pixErr;
2247 pixColors ^= ( pixColors & 3 ); // Reset the two first bits
2248 pixColors |= c;
2249 }
2250 }
2251 blockErr += bestPixErr;
2252 }
2253 }
2254#endif
2255
2256 if( blockErr < bestBlockErr )
2257 {
2258 bestBlockErr = blockErr;
2259 dist = d;
2260 pixIndices = pixColors;
2261 }
2262 }
2263
2264 return bestBlockErr;
2265}
2266
2267
2268// main T-/H-mode compression function
2269#ifdef __AVX2__
2270uint32_t compressBlockTH( uint8_t* src, Luma& l, uint32_t& compressed1, uint32_t& compressed2, bool& tMode, __m128i r8, __m128i g8, __m128i b8 )
2271#else
2272uint32_t compressBlockTH( uint8_t *src, Luma& l, uint32_t& compressed1, uint32_t& compressed2, bool &tMode )
2273#endif
2274{
2275#ifdef __AVX2__
2276 alignas( 8 ) uint8_t luma[16] = { 0, };
2277 _mm_storeu_si128 ( (__m128i* )luma, l.luma8 );
2278#elif defined __ARM_NEON && defined __aarch64__
2279 alignas( 8 ) uint8_t luma[16] = { 0 };
2280 vst1q_u8( luma, l.luma8 );
2281#else
2282 uint8_t* luma = l.val;
2283#endif
2284
2285 uint8_t pixIdx[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
2286
2287 // 1) sorts the pairs of (luma, pix_idx)
2288 insertionSort( luma, pixIdx );
2289
    // 2) finds the split index that minimizes the sum of the left and right luma ranges
2291 uint8_t minSumRangeIdx = 0;
2292 uint16_t minSumRangeValue;
2293 uint16_t sum;
2294 static const uint8_t diffBonus[15] = {8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 8};
2295 const int16_t temp = luma[15] - luma[0];
2296
2297 minSumRangeValue = luma[15] - luma[1] + diffBonus[0];
2298 for( uint8_t i = 1; i < 14; i++ )
2299 {
2300 sum = temp - luma[i+1] + luma[i] + diffBonus[i];
2301 if( minSumRangeValue > sum )
2302 {
2303 minSumRangeValue = sum;
2304 minSumRangeIdx = i;
2305 }
2306 }
2307
2308 sum = luma[14] - luma[0] + diffBonus[14];
2309 if( minSumRangeValue > sum )
2310 {
2311 minSumRangeValue = sum;
2312 minSumRangeIdx = 14;
2313 }
2314 uint8_t lRange, rRange;
2315
2316 lRange = luma[minSumRangeIdx] - luma[0];
2317 rRange = luma[15] - luma[minSumRangeIdx + 1];
2318
2319 // 3) sets a proper mode
2320 bool swap = false;
2321 if( lRange >= rRange )
2322 {
2323 if( lRange >= rRange * 2 )
2324 {
2325 swap = true;
2326 tMode = true;
2327 }
2328 }
2329 else
2330 {
2331 if( lRange * 2 <= rRange ) tMode = true;
2332 }
2333 // 4) calculates the two base colors
2334 uint8_t rangeIdx[4] = { pixIdx[0], pixIdx[minSumRangeIdx], pixIdx[minSumRangeIdx + 1], pixIdx[15] };
2335
2336 uint16_t r[4], g[4], b[4];
2337 for( uint8_t i = 0; i < 4; ++i )
2338 {
2339 uint8_t idx = rangeIdx[i] * 4;
2340 b[i] = src[idx];
2341 g[i] = src[idx + 1];
2342 r[i] = src[idx + 2];
2343 }
2344
2345 uint8_t mid_rgb[2][3];
2346 if( swap )
2347 {
2348 mid_rgb[1][B] = ( b[0] + b[1] ) / 2;
2349 mid_rgb[1][G] = ( g[0] + g[1] ) / 2;
2350 mid_rgb[1][R] = ( r[0] + r[1] ) / 2;
2351
2352 uint16_t sum_rgb[3] = { 0, 0, 0 };
2353 for( uint8_t i = minSumRangeIdx + 1; i < 16; i++ )
2354 {
2355 uint8_t idx = pixIdx[i] * 4;
2356 sum_rgb[B] += src[idx];
2357 sum_rgb[G] += src[idx + 1];
2358 sum_rgb[R] += src[idx + 2];
2359 }
2360 const uint8_t temp = 15 - minSumRangeIdx;
2361 mid_rgb[0][B] = sum_rgb[B] / temp;
2362 mid_rgb[0][G] = sum_rgb[G] / temp;
2363 mid_rgb[0][R] = sum_rgb[R] / temp;
2364 }
2365 else
2366 {
2367 mid_rgb[0][B] = (b[0] + b[1]) / 2;
2368 mid_rgb[0][G] = (g[0] + g[1]) / 2;
2369 mid_rgb[0][R] = (r[0] + r[1]) / 2;
2370 if( tMode )
2371 {
2372 uint16_t sum_rgb[3] = { 0, 0, 0 };
2373 for( uint8_t i = minSumRangeIdx + 1; i < 16; i++ )
2374 {
2375 uint8_t idx = pixIdx[i] * 4;
2376 sum_rgb[B] += src[idx];
2377 sum_rgb[G] += src[idx + 1];
2378 sum_rgb[R] += src[idx + 2];
2379 }
2380 const uint8_t temp = 15 - minSumRangeIdx;
2381 mid_rgb[1][B] = sum_rgb[B] / temp;
2382 mid_rgb[1][G] = sum_rgb[G] / temp;
2383 mid_rgb[1][R] = sum_rgb[R] / temp;
2384 }
2385 else
2386 {
2387 mid_rgb[1][B] = (b[2] + b[3]) / 2;
2388 mid_rgb[1][G] = (g[2] + g[3]) / 2;
2389 mid_rgb[1][R] = (r[2] + r[3]) / 2;
2390 }
2391 }
2392
2393 // 5) sets the start distance index
2394 uint32_t startDistCandidate;
2395 uint32_t avgDist;
2396 if( tMode )
2397 {
2398 if( swap )
2399 {
2400 avgDist = ( b[1] - b[0] + g[1] - g[0] + r[1] - r[0] ) / 6;
2401 }
2402 else
2403 {
2404 avgDist = ( b[3] - b[2] + g[3] - g[2] + r[3] - r[2] ) / 6;
2405 }
2406 }
2407 else
2408 {
2409 avgDist = ( b[1] - b[0] + g[1] - g[0] + r[1] - r[0] + b[3] - b[2] + g[3] - g[2] + r[3] - r[2] ) / 12;
2410 }
2411
2412 if( avgDist <= 16)
2413 {
2414 startDistCandidate = 0;
2415 }
2416 else if( avgDist <= 23 )
2417 {
2418 startDistCandidate = 1;
2419 }
2420 else if( avgDist <= 32 )
2421 {
2422 startDistCandidate = 2;
2423 }
2424 else if( avgDist <= 41 )
2425 {
2426 startDistCandidate = 3;
2427 }
2428 else
2429 {
2430 startDistCandidate = 4;
2431 }
2432
2433 uint32_t bestErr = MaxError;
2434 uint32_t bestPixIndices;
2435 uint8_t bestDist = 10;
2436 uint8_t colorsRGB444[2][3];
2437 compressColor( mid_rgb, colorsRGB444, tMode );
2438 compressed1 = 0;
2439
2440 // 6) finds the best candidate with the lowest error
2441#ifdef __AVX2__
2442 // Vectorized ver
2443 bestErr = calculateErrorTH( tMode, colorsRGB444, bestDist, bestPixIndices, startDistCandidate, r8, g8, b8 );
2444#else
2445 // Scalar ver
2446 bestErr = calculateErrorTH( tMode, src, colorsRGB444, bestDist, bestPixIndices, startDistCandidate );
2447#endif
2448
2449 // 7) outputs the final T or H block
2450 if( tMode )
2451 {
        // Put the compression parameters into the compressed block
2453 compressed1 |= ( colorsRGB444[0][R] & 0xf ) << 23;
2454 compressed1 |= ( colorsRGB444[0][G] & 0xf ) << 19;
2455 compressed1 |= ( colorsRGB444[0][B] ) << 15;
2456 compressed1 |= ( colorsRGB444[1][R] ) << 11;
2457 compressed1 |= ( colorsRGB444[1][G] ) << 7;
2458 compressed1 |= ( colorsRGB444[1][B] ) << 3;
2459 compressed1 |= bestDist & 0x7;
2460 }
2461 else
2462 {
2463 int bestRGB444ColPacked[2];
2464 bestRGB444ColPacked[0] = (colorsRGB444[0][R] << 8) + (colorsRGB444[0][G] << 4) + colorsRGB444[0][B];
2465 bestRGB444ColPacked[1] = (colorsRGB444[1][R] << 8) + (colorsRGB444[1][G] << 4) + colorsRGB444[1][B];
2466 if( ( bestRGB444ColPacked[0] >= bestRGB444ColPacked[1] ) ^ ( ( bestDist & 1 ) == 1 ) )
2467 {
2468 swapColors( colorsRGB444 );
            // Reshuffle pixel indices to exchange C1 with C3, and C2 with C4
2470 bestPixIndices = ( 0x55555555 & bestPixIndices ) | ( 0xaaaaaaaa & ( ~bestPixIndices ) );
2471 }
2472
        // Put the compression parameters into the compressed block
2474 compressed1 |= ( colorsRGB444[0][R] & 0xf ) << 22;
2475 compressed1 |= ( colorsRGB444[0][G] & 0xf ) << 18;
2476 compressed1 |= ( colorsRGB444[0][B] & 0xf ) << 14;
2477 compressed1 |= ( colorsRGB444[1][R] & 0xf ) << 10;
2478 compressed1 |= ( colorsRGB444[1][G] & 0xf ) << 6;
2479 compressed1 |= ( colorsRGB444[1][B] & 0xf ) << 2;
2480 compressed1 |= ( bestDist >> 1 ) & 0x3;
2481 }
2482
2483 bestPixIndices = indexConversion( bestPixIndices );
2484 compressed2 = 0;
2485 compressed2 = ( compressed2 & ~( ( 0x2 << 31 ) - 1 ) ) | ( bestPixIndices & ( ( 2 << 31 ) - 1 ) );
2486
2487 return bestErr;
2488}
2489//#endif
2490
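// Picks the lowest-error modifier table for each subblock; if their combined error does not beat
// the supplied `error`, the previously computed candidate encoding `value` is returned unchanged,
// otherwise the table indices and per-pixel selector bits are packed into the block.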
2491template<class T, class S>
2492static etcpak_force_inline uint64_t EncodeSelectors( uint64_t d, const T terr[2][8], const S tsel[16][8], const uint32_t* id, const uint64_t value, const uint64_t error)
2493{
2494 size_t tidx[2];
2495 tidx[0] = GetLeastError( terr[0], 8 );
2496 tidx[1] = GetLeastError( terr[1], 8 );
2497
2498 if ((terr[0][tidx[0]] + terr[1][tidx[1]]) >= error)
2499 {
2500 return value;
2501 }
2502
2503 d |= tidx[0] << 26;
2504 d |= tidx[1] << 29;
2505 for( int i=0; i<16; i++ )
2506 {
2507 uint64_t t = tsel[i][tidx[id[i]%2]];
2508 d |= ( t & 0x1 ) << ( i + 32 );
2509 d |= ( t & 0x2 ) << ( i + 47 );
2510 }
2511
2512 return FixByteOrder(d);
2513}
2514
2515}
2516
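// ETC1 encoding of a single 4x4 block: early-out for solid colors, then average computation,
// selection of the best mode/flip combination, and the per-pixel selector search.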
2517static etcpak_force_inline uint64_t ProcessRGB( const uint8_t* src )
2518{
2519#ifdef __AVX2__
2520 uint64_t d = CheckSolid_AVX2( src );
2521 if( d != 0 ) return d;
2522
2523 alignas(32) v4i a[8];
2524
2525 __m128i err0 = PrepareAverages_AVX2( a, src );
2526
2527 // Get index of minimum error (err0)
2528 __m128i err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(2, 3, 0, 1));
2529 __m128i errMin0 = _mm_min_epu32(err0, err1);
2530
2531 __m128i errMin1 = _mm_shuffle_epi32(errMin0, _MM_SHUFFLE(1, 0, 3, 2));
2532 __m128i errMin2 = _mm_min_epu32(errMin1, errMin0);
2533
2534 __m128i errMask = _mm_cmpeq_epi32(errMin2, err0);
2535
2536 uint32_t mask = _mm_movemask_epi8(errMask);
2537
2538 uint32_t idx = _bit_scan_forward(mask) >> 2;
2539
2540 d |= EncodeAverages_AVX2( a, idx );
2541
2542 alignas(32) uint32_t terr[2][8] = {};
2543 alignas(32) uint32_t tsel[8];
2544
2545 if ((idx == 0) || (idx == 2))
2546 {
2547 FindBestFit_4x2_AVX2( terr, tsel, a, idx * 2, src );
2548 }
2549 else
2550 {
2551 FindBestFit_2x4_AVX2( terr, tsel, a, idx * 2, src );
2552 }
2553
2554 return EncodeSelectors_AVX2( d, terr, tsel, (idx % 2) == 1 );
2555#else
2556 uint64_t d = CheckSolid( src );
2557 if( d != 0 ) return d;
2558
2559 v4i a[8];
2560 unsigned int err[4] = {};
2561 PrepareAverages( a, src, err );
2562 size_t idx = GetLeastError( err, 4 );
2563 EncodeAverages( d, a, idx );
2564
2565#if ( defined __SSE4_1__ || defined __ARM_NEON ) && !defined REFERENCE_IMPLEMENTATION
2566 uint32_t terr[2][8] = {};
2567#else
2568 uint64_t terr[2][8] = {};
2569#endif
2570 uint16_t tsel[16][8];
2571 auto id = g_id[idx];
2572 FindBestFit( terr, tsel, a, id, src );
2573
2574 return FixByteOrder( EncodeSelectors( d, terr, tsel, id ) );
2575#endif
2576}
2577
2578#ifdef __AVX2__
2579// horizontal min/max functions. https://stackoverflow.com/questions/22256525/horizontal-minimum-and-maximum-using-sse
// If GCC reports an error here, change the -march value in CFLAGS to one specific to your CPU (e.g., skylake).
2581static inline int16_t hMax( __m128i buffer, uint8_t& idx )
2582{
2583 __m128i tmp1 = _mm_sub_epi8( _mm_set1_epi8( (char)( 255 ) ), buffer );
2584 __m128i tmp2 = _mm_min_epu8( tmp1, _mm_srli_epi16( tmp1, 8 ) );
2585 __m128i tmp3 = _mm_minpos_epu16( tmp2 );
2586 uint8_t result = 255 - (uint8_t)_mm_cvtsi128_si32( tmp3 );
2587 __m128i mask = _mm_cmpeq_epi8( buffer, _mm_set1_epi8( result ) );
2588 idx = _tzcnt_u32( _mm_movemask_epi8( mask ) );
2589
2590 return result;
2591}
2592#elif defined __ARM_NEON && defined __aarch64__
2593static inline int16_t hMax( uint8x16_t buffer, uint8_t& idx )
2594{
2595 const uint8_t max = vmaxvq_u8( buffer );
2596 const uint16x8_t vmax = vdupq_n_u16( max );
2597 uint8x16x2_t buff_wide = vzipq_u8( buffer, uint8x16_t() );
2598 uint16x8_t lowbuf16 = vreinterpretq_u16_u8( buff_wide.val[0] );
2599 uint16x8_t hibuf16 = vreinterpretq_u16_u8( buff_wide.val[1] );
2600 uint16x8_t low_eqmask = vceqq_u16( lowbuf16, vmax );
2601 uint16x8_t hi_eqmask = vceqq_u16( hibuf16, vmax );
2602
2603 static const uint16_t mask_lsb[] = {
2604 0x1, 0x2, 0x4, 0x8,
2605 0x10, 0x20, 0x40, 0x80 };
2606
2607 static const uint16_t mask_msb[] = {
2608 0x100, 0x200, 0x400, 0x800,
2609 0x1000, 0x2000, 0x4000, 0x8000 };
2610
2611 uint16x8_t vmask_lsb = vld1q_u16( mask_lsb );
2612 uint16x8_t vmask_msb = vld1q_u16( mask_msb );
2613 uint16x8_t pos_lsb = vandq_u16( vmask_lsb, low_eqmask );
2614 uint16x8_t pos_msb = vandq_u16( vmask_msb, hi_eqmask );
2615 pos_lsb = vpaddq_u16( pos_lsb, pos_lsb );
2616 pos_lsb = vpaddq_u16( pos_lsb, pos_lsb );
2617 pos_lsb = vpaddq_u16( pos_lsb, pos_lsb );
2618 uint64_t idx_lane1 = vgetq_lane_u64( vreinterpretq_u64_u16( pos_lsb ), 0 );
2619 pos_msb = vpaddq_u16( pos_msb, pos_msb );
2620 pos_msb = vpaddq_u16( pos_msb, pos_msb );
2621 pos_msb = vpaddq_u16( pos_msb, pos_msb );
2622 uint32_t idx_lane2 = vgetq_lane_u32( vreinterpretq_u32_u16( pos_msb ), 0 );
2623 idx = idx_lane1 != 0 ? __builtin_ctz( idx_lane1 ) : __builtin_ctz( idx_lane2 );
2624
2625 return max;
2626}
2627#endif
2628
2629#ifdef __AVX2__
2630static inline int16_t hMin( __m128i buffer, uint8_t& idx )
2631{
2632 __m128i tmp2 = _mm_min_epu8( buffer, _mm_srli_epi16( buffer, 8 ) );
2633 __m128i tmp3 = _mm_minpos_epu16( tmp2 );
2634 uint8_t result = (uint8_t)_mm_cvtsi128_si32( tmp3 );
2635 __m128i mask = _mm_cmpeq_epi8( buffer, _mm_set1_epi8( result ) );
2636 idx = _tzcnt_u32( _mm_movemask_epi8( mask ) );
2637 return result;
2638}
2639#elif defined __ARM_NEON && defined __aarch64__
2640static inline int16_t hMin( uint8x16_t buffer, uint8_t& idx )
2641{
2642 const uint8_t min = vminvq_u8( buffer );
2643 const uint16x8_t vmin = vdupq_n_u16( min );
2644 uint8x16x2_t buff_wide = vzipq_u8( buffer, uint8x16_t() );
2645 uint16x8_t lowbuf16 = vreinterpretq_u16_u8( buff_wide.val[0] );
2646 uint16x8_t hibuf16 = vreinterpretq_u16_u8( buff_wide.val[1] );
2647 uint16x8_t low_eqmask = vceqq_u16( lowbuf16, vmin );
2648 uint16x8_t hi_eqmask = vceqq_u16( hibuf16, vmin );
2649
2650 static const uint16_t mask_lsb[] = {
2651 0x1, 0x2, 0x4, 0x8,
2652 0x10, 0x20, 0x40, 0x80 };
2653
2654 static const uint16_t mask_msb[] = {
2655 0x100, 0x200, 0x400, 0x800,
2656 0x1000, 0x2000, 0x4000, 0x8000 };
2657
2658 uint16x8_t vmask_lsb = vld1q_u16( mask_lsb );
2659 uint16x8_t vmask_msb = vld1q_u16( mask_msb );
2660 uint16x8_t pos_lsb = vandq_u16( vmask_lsb, low_eqmask );
2661 uint16x8_t pos_msb = vandq_u16( vmask_msb, hi_eqmask );
2662 pos_lsb = vpaddq_u16( pos_lsb, pos_lsb );
2663 pos_lsb = vpaddq_u16( pos_lsb, pos_lsb );
2664 pos_lsb = vpaddq_u16( pos_lsb, pos_lsb );
2665 uint64_t idx_lane1 = vgetq_lane_u64( vreinterpretq_u64_u16( pos_lsb ), 0 );
2666 pos_msb = vpaddq_u16( pos_msb, pos_msb );
2667 pos_msb = vpaddq_u16( pos_msb, pos_msb );
2668 pos_msb = vpaddq_u16( pos_msb, pos_msb );
2669 uint32_t idx_lane2 = vgetq_lane_u32( vreinterpretq_u32_u16( pos_msb ), 0 );
2670 idx = idx_lane1 != 0 ? __builtin_ctz( idx_lane1 ) : __builtin_ctz( idx_lane2 );
2671
2672 return min;
2673}
2674#endif
2675
2676// During search it is not convenient to store the bits the way they are stored in the
2677// file format. Hence, after search, it is converted to this format.
2678// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
2679static inline void stuff59bits( unsigned int thumbT59W1, unsigned int thumbT59W2, unsigned int& thumbTW1, unsigned int& thumbTW2 )
2680{
2681 // Put bits in twotimer configuration for 59 (red overflows)
2682 //
2683 // Go from this bit layout:
2684 //
2685 // |63 62 61 60 59|58 57 56 55|54 53 52 51|50 49 48 47|46 45 44 43|42 41 40 39|38 37 36 35|34 33 32|
2686 // |----empty-----|---red 0---|--green 0--|--blue 0---|---red 1---|--green 1--|--blue 1---|--dist--|
2687 //
2688 // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
2689 // |----------------------------------------index bits---------------------------------------------|
2690 //
2691 //
2692 // To this:
2693 //
2694 // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
2695 // -----------------------------------------------------------------------------------------------
2696 // |// // //|R0a |//|R0b |G0 |B0 |R1 |G1 |B1 |da |df|db|
2697 // -----------------------------------------------------------------------------------------------
2698 //
2699 // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
2700 // |----------------------------------------index bits---------------------------------------------|
2701 //
2702 // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
2703 // -----------------------------------------------------------------------------------------------
2704 // | base col1 | dcol 2 | base col1 | dcol 2 | base col 1 | dcol 2 | table | table |df|fp|
2705 // | R1' (5 bits) | dR2 | G1' (5 bits) | dG2 | B1' (5 bits) | dB2 | cw 1 | cw 2 |bt|bt|
2706 // ------------------------------------------------------------------------------------------------
2707
2708 uint8_t R0a;
2709 uint8_t bit, a, b, c, d, bits;
2710
2711 R0a = ( thumbT59W1 >> 25 ) & 0x3;
2712
2713 // Fix middle part
2714 thumbTW1 = thumbT59W1 << 1;
2715 // Fix R0a (top two bits of R0)
2716 thumbTW1 = ( thumbTW1 & ~( 0x3 << 27 ) ) | ( ( R0a & 0x3 ) << 27 );
2717 // Fix db (lowest bit of d)
2718 thumbTW1 = ( thumbTW1 & ~0x1 ) | ( thumbT59W1 & 0x1 );
2719
2720 // Make sure that red overflows:
2721 a = ( thumbTW1 >> 28 ) & 0x1;
2722 b = ( thumbTW1 >> 27 ) & 0x1;
2723 c = ( thumbTW1 >> 25 ) & 0x1;
2724 d = ( thumbTW1 >> 24 ) & 0x1;
2725
    // The following abcd bit sequences should be padded with ones: 0111, 1010, 1011, 1101, 1110, 1111
2727 // The following logical expression checks for the presence of any of those:
2728 bit = ( a & c ) | ( !a & b & c & d ) | ( a & b & !c & d );
2729 bits = 0xf * bit;
2730 thumbTW1 = ( thumbTW1 & ~( 0x7 << 29 ) ) | ( bits & 0x7 ) << 29;
2731 thumbTW1 = ( thumbTW1 & ~( 0x1 << 26 ) ) | ( !bit & 0x1 ) << 26;
2732
2733 // Set diffbit
2734 thumbTW1 = ( thumbTW1 & ~0x2 ) | 0x2;
2735 thumbTW2 = thumbT59W2;
2736}
2737
2738// During search it is not convenient to store the bits the way they are stored in the
2739// file format. Hence, after search, it is converted to this format.
2740// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
2741static inline void stuff58bits( unsigned int thumbH58W1, unsigned int thumbH58W2, unsigned int& thumbHW1, unsigned int& thumbHW2 )
2742{
2743 // Put bits in twotimer configuration for 58 (red doesn't overflow, green does)
2744 //
2745 // Go from this bit layout:
2746 //
2747 //
2748 // |63 62 61 60 59 58|57 56 55 54|53 52 51 50|49 48 47 46|45 44 43 42|41 40 39 38|37 36 35 34|33 32|
2749 // |-------empty-----|---red 0---|--green 0--|--blue 0---|---red 1---|--green 1--|--blue 1---|d2 d1|
2750 //
2751 // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
2752 // |---------------------------------------index bits----------------------------------------------|
2753 //
2754 // To this:
2755 //
2756 // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
2757 // -----------------------------------------------------------------------------------------------
2758 // |//|R0 |G0 |// // //|G0|B0|//|B0b |R1 |G1 |B0 |d2|df|d1|
2759 // -----------------------------------------------------------------------------------------------
2760 //
2761 // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
2762 // |---------------------------------------index bits----------------------------------------------|
2763 //
2764 // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
2765 // -----------------------------------------------------------------------------------------------
2766 // | base col1 | dcol 2 | base col1 | dcol 2 | base col 1 | dcol 2 | table | table |df|fp|
2767 // | R1' (5 bits) | dR2 | G1' (5 bits) | dG2 | B1' (5 bits) | dB2 | cw 1 | cw 2 |bt|bt|
2768 // -----------------------------------------------------------------------------------------------
2769 //
2770 //
2771 // Thus, what we are really doing is going from this bit layout:
2772 //
2773 //
2774 // |63 62 61 60 59 58|57 56 55 54 53 52 51|50 49|48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33|32 |
2775 // |-------empty-----|part0---------------|part1|part2------------------------------------------|part3|
2776 //
2777 // To this:
2778 //
2779 // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
2780 // --------------------------------------------------------------------------------------------------|
2781 // |//|part0 |// // //|part1|//|part2 |df|part3|
2782 // --------------------------------------------------------------------------------------------------|
2783
2784 unsigned int part0, part1, part2, part3;
2785 uint8_t bit, a, b, c, d, bits;
2786
2787 // move parts
2788 part0 = ( thumbH58W1 >> 19 ) & 0x7f;
2789 part1 = ( thumbH58W1 >> 17 ) & 0x3;
2790 part2 = ( thumbH58W1 >> 1 ) & 0xffff;
2791 part3 = thumbH58W1 & 0x1;
2792 thumbHW1 = 0;
2793 thumbHW1 = ( thumbHW1 & ~( 0x7f << 24 ) ) | ( ( part0 & 0x7f ) << 24 );
2794 thumbHW1 = ( thumbHW1 & ~( 0x3 << 19 ) ) | ( ( part1 & 0x3 ) << 19 );
2795 thumbHW1 = ( thumbHW1 & ~( 0xffff << 2 ) ) | ( ( part2 & 0xffff ) << 2 );
2796 thumbHW1 = ( thumbHW1 & ~0x1 ) | ( part3 & 0x1 );
2797
2798 // Make sure that red does not overflow:
2799 bit = ( thumbHW1 >> 30 ) & 0x1;
2800 thumbHW1 = ( thumbHW1 & ~( 0x1 << 31 ) ) | ( ( !bit & 0x1 ) << 31 );
2801
2802 // Make sure that green overflows:
2803 a = ( thumbHW1 >> 20 ) & 0x1;
2804 b = ( thumbHW1 >> 19 ) & 0x1;
2805 c = ( thumbHW1 >> 17 ) & 0x1;
2806 d = ( thumbHW1 >> 16 ) & 0x1;
    // The following abcd bit sequences should be padded with ones: 0111, 1010, 1011, 1101, 1110, 1111
2808 // The following logical expression checks for the presence of any of those:
2809 bit = ( a & c ) | ( !a & b & c & d ) | ( a & b & !c & d );
2810 bits = 0xf * bit;
2811 thumbHW1 = ( thumbHW1 & ~( 0x7 << 21 ) ) | ( ( bits & 0x7 ) << 21 );
2812 thumbHW1 = ( thumbHW1 & ~( 0x1 << 18 ) ) | ( ( !bit & 0x1 ) << 18 );
2813
2814 // Set diffbit
2815 thumbHW1 = ( thumbHW1 & ~0x2 ) | 0x2;
2816 thumbHW2 = thumbH58W2;
2817}
2818
2819#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__)
2820static etcpak_force_inline Channels GetChannels( const uint8_t* src )
2821{
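 // Deinterleave the 16 pixels of a 4x4 block into per-channel vectors: byte 0 of every
 // pixel goes into one vector, byte 1 into a second and byte 2 into a third; byte 3
 // (alpha) is dropped. A rough scalar sketch of the gather (c0/c1/c2 are hypothetical):
 //   for( int i=0; i<16; i++ ) { c0[i] = src[i*4+0]; c1[i] = src[i*4+1]; c2[i] = src[i*4+2]; }
 // Note that the member naming differs between the two paths below; what matters is that
 // CalculateLuma() applies the matching weight to each vector.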
2822 Channels ch;
2823#ifdef __AVX2__
2824 __m128i d0 = _mm_loadu_si128( ( (__m128i*)src ) + 0 );
2825 __m128i d1 = _mm_loadu_si128( ( (__m128i*)src ) + 1 );
2826 __m128i d2 = _mm_loadu_si128( ( (__m128i*)src ) + 2 );
2827 __m128i d3 = _mm_loadu_si128( ( (__m128i*)src ) + 3 );
2828
2829 __m128i rgb0 = _mm_shuffle_epi8( d0, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) );
2830 __m128i rgb1 = _mm_shuffle_epi8( d1, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) );
2831 __m128i rgb2 = _mm_shuffle_epi8( d2, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) );
2832 __m128i rgb3 = _mm_shuffle_epi8( d3, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) );
2833
2834 __m128i rg0 = _mm_unpacklo_epi32( rgb0, rgb1 );
2835 __m128i rg1 = _mm_unpacklo_epi32( rgb2, rgb3 );
2836 __m128i b0 = _mm_unpackhi_epi32( rgb0, rgb1 );
2837 __m128i b1 = _mm_unpackhi_epi32( rgb2, rgb3 );
2838
2839 // swap channels
2840 ch.b8 = _mm_unpacklo_epi64( rg0, rg1 );
2841 ch.g8 = _mm_unpackhi_epi64( rg0, rg1 );
2842 ch.r8 = _mm_unpacklo_epi64( b0, b1 );
2843#elif defined __ARM_NEON && defined __aarch64__
2844 //load pixel data into 4 rows
2845 uint8x16_t px0 = vld1q_u8( src + 0 );
2846 uint8x16_t px1 = vld1q_u8( src + 16 );
2847 uint8x16_t px2 = vld1q_u8( src + 32 );
2848 uint8x16_t px3 = vld1q_u8( src + 48 );
2849
2850 uint8x16x2_t px0z1 = vzipq_u8( px0, px1 );
2851 uint8x16x2_t px2z3 = vzipq_u8( px2, px3 );
2852 uint8x16x2_t px01 = vzipq_u8( px0z1.val[0], px0z1.val[1] );
2853 uint8x16x2_t rgb01 = vzipq_u8( px01.val[0], px01.val[1] );
2854 uint8x16x2_t px23 = vzipq_u8( px2z3.val[0], px2z3.val[1] );
2855 uint8x16x2_t rgb23 = vzipq_u8( px23.val[0], px23.val[1] );
2856
2857 uint8x16_t rr = vreinterpretq_u8_u64( vzip1q_u64( vreinterpretq_u64_u8( rgb01.val[0] ), vreinterpretq_u64_u8( rgb23.val[0] ) ) );
2858 uint8x16_t gg = vreinterpretq_u8_u64( vzip2q_u64( vreinterpretq_u64_u8( rgb01.val[0] ), vreinterpretq_u64_u8( rgb23.val[0] ) ) );
2859 uint8x16_t bb = vreinterpretq_u8_u64( vzip1q_u64( vreinterpretq_u64_u8( rgb01.val[1] ), vreinterpretq_u64_u8( rgb23.val[1] ) ) );
2860
2861 uint8x16x2_t red = vzipq_u8( rr, uint8x16_t() );
2862 uint8x16x2_t grn = vzipq_u8( gg, uint8x16_t() );
2863 uint8x16x2_t blu = vzipq_u8( bb, uint8x16_t() );
2864 ch.r = red;
2865 ch.b = blu;
2866 ch.g = grn;
2867#endif
2868 return ch;
2869}
2870#endif
2871
2872#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__)
2873static etcpak_force_inline void CalculateLuma( Channels& ch, Luma& luma )
2874#else
2875static etcpak_force_inline void CalculateLuma( const uint8_t* src, Luma& luma )
2876#endif
2877{
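 // Each path below produces one 8-bit luma value per pixel as a weighted sum of the
 // channels: the SIMD paths use weights (14, 76, 38)/128 and the scalar path uses
 // (28, 150, 76)/254, both of which approximate the usual ~0.11/0.59/0.30 luma coefficients.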
2878#ifdef __AVX2__
2879 __m256i b16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.b8 ), _mm256_set1_epi16( 14 ) );
2880 __m256i g16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.g8 ), _mm256_set1_epi16( 76 ) );
2881 __m256i r16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.r8 ), _mm256_set1_epi16( 38 ) );
2882
2883 __m256i luma_16bit = _mm256_add_epi16( _mm256_add_epi16( g16_luma, r16_luma ), b16_luma );
2884 __m256i luma_8bit_m256i = _mm256_srli_epi16( luma_16bit, 7 );
2885 __m128i luma_8bit_lo = _mm256_extractf128_si256( luma_8bit_m256i, 0 );
2886 __m128i luma_8bit_hi = _mm256_extractf128_si256( luma_8bit_m256i, 1 );
2887
2888 static const __m128i interleaving_mask_lo = _mm_set_epi8( 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0 );
2889 static const __m128i interleaving_mask_hi = _mm_set_epi8( 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1 );
2890 __m128i luma_8bit_lo_moved = _mm_shuffle_epi8( luma_8bit_lo, interleaving_mask_lo );
2891 __m128i luma_8bit_hi_moved = _mm_shuffle_epi8( luma_8bit_hi, interleaving_mask_hi );
2892 __m128i luma_8bit = _mm_or_si128( luma_8bit_hi_moved, luma_8bit_lo_moved );
2893 luma.luma8 = luma_8bit;
2894
2895 // min/max calculation
2896 luma.min = hMin( luma_8bit, luma.minIdx ) * 0.00392156f;
2897 luma.max = hMax( luma_8bit, luma.maxIdx ) * 0.00392156f;
2898#elif defined __ARM_NEON && defined __aarch64__
2899 //apply the luma weights to each widened channel
2900 uint16x8_t red0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.r.val[0] ), 14 );
2901 uint16x8_t red1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.r.val[1] ), 14 );
2902 uint16x8_t grn0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.g.val[0] ), 76 );
2903 uint16x8_t grn1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.g.val[1] ), 76 );
2904 uint16x8_t blu0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.b.val[0] ), 38 );
2905 uint16x8_t blu1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.b.val[1] ), 38 );
2906
2907 //calculate luma for rows 0,1 and 2,3
2908 uint16x8_t lum_r01 = vaddq_u16( vaddq_u16( red0, grn0 ), blu0 );
2909 uint16x8_t lum_r23 = vaddq_u16( vaddq_u16( red1, grn1 ), blu1 );
2910
2911 //divide the luma values by 128 (right shift) and narrow the results to 8 bits
2912 uint8x8_t lum_r01_d = vshrn_n_u16( lum_r01, 7 );
2913 uint8x8_t lum_r23_d = vshrn_n_u16( lum_r23, 7 );
2914
2915 luma.luma8 = vcombine_u8( lum_r01_d, lum_r23_d );
2916 //find min and max luma value
2917 luma.min = hMin( luma.luma8, luma.minIdx ) * 0.00392156f;
2918 luma.max = hMax( luma.luma8, luma.maxIdx ) * 0.00392156f;
2919#else
2920 for( int i = 0; i < 16; ++i )
2921 {
2922 luma.val[i] = ( src[i * 4 + 2] * 76 + src[i * 4 + 1] * 150 + src[i * 4] * 28 ) / 254; // luma calculation
2923 if( luma.min > luma.val[i] )
2924 {
2925 luma.min = luma.val[i];
2926 luma.minIdx = i;
2927 }
2928 if( luma.max < luma.val[i] )
2929 {
2930 luma.max = luma.val[i];
2931 luma.maxIdx = i;
2932 }
2933 }
2934#endif
2935}
2936
2937static etcpak_force_inline uint8_t SelectModeETC2( const Luma& luma )
2938{
2939#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__)
2940 const float lumaRange = ( luma.max - luma.min );
2941#else
2942 const float lumaRange = ( luma.max - luma.min ) * ( 1.f / 255.f );
2943#endif
2944 // filters a very-low-contrast block
2945 if( lumaRange <= ecmd_threshold[0] )
2946 {
2947 return ModePlanar;
2948 }
2949 // checks whether a pair of the corner pixels in a block has the min/max luma values;
2950 // if so, the ETC2 planar mode is enabled, and otherwise, the ETC1 mode is enabled
2951 else if( lumaRange <= ecmd_threshold[1] )
2952 {
2953#ifdef __AVX2__
2954 static const __m128i corner_pair = _mm_set_epi8( 1, 1, 1, 1, 1, 1, 1, 1, 0, 15, 3, 12, 12, 3, 15, 0 );
2955 __m128i current_max_min = _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, luma.minIdx, luma.maxIdx, luma.minIdx, luma.maxIdx, luma.minIdx, luma.maxIdx, luma.minIdx, luma.maxIdx );
2956
2957 __m128i max_min_result = _mm_cmpeq_epi16( corner_pair, current_max_min );
2958
2959 int mask = _mm_movemask_epi8( max_min_result );
2960 if( mask )
2961 {
2962 return ModePlanar;
2963 }
2964#else
2965 // check whether a pair of the corner pixels in a block has the min/max luma values;
2966 // if so, the ETC2 planar mode is enabled.
2967 if( ( luma.minIdx == 0 && luma.maxIdx == 15 ) ||
2968 ( luma.minIdx == 15 && luma.maxIdx == 0 ) ||
2969 ( luma.minIdx == 3 && luma.maxIdx == 12 ) ||
2970 ( luma.minIdx == 12 && luma.maxIdx == 3 ) )
2971 {
2972 return ModePlanar;
2973 }
2974#endif
2975 }
2976 // filters a high-contrast block for checking both ETC1 mode and the ETC2 T/H mode
2977 else if( lumaRange >= ecmd_threshold[2] )
2978 {
2979 return ModeTH;
2980 }
2981 return ModeUndecided;
2982}
2983
2984static etcpak_force_inline uint64_t ProcessRGB_ETC2( const uint8_t* src, bool useHeuristics )
2985{
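 // Overall flow: bail out early on solid-color blocks, optionally run the luma heuristic to
 // pre-select a mode, try the planar fit, then the ETC1 subblock fit and (if the heuristic
 // picked T/H) the T/H encoder; the final EncodeSelectors* call receives both the ETC1 fit
 // and the planar/T/H candidate together with its error.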
2986#ifdef __AVX2__
2987 uint64_t d = CheckSolid_AVX2( src );
2988 if( d != 0 ) return d;
2989#else
2990 uint64_t d = CheckSolid( src );
2991 if( d != 0 ) return d;
2992#endif
2993
2994 uint8_t mode = ModeUndecided;
2995 Luma luma;
2996#ifdef __AVX2__
2997 Channels ch = GetChannels( src );
2998 if( useHeuristics )
2999 {
3000 CalculateLuma( ch, luma );
3001 mode = SelectModeETC2( luma );
3002 }
3003
3004 auto plane = Planar_AVX2( ch, mode, useHeuristics );
3005 if( useHeuristics && mode == ModePlanar ) return plane.plane;
3006
3007 alignas( 32 ) v4i a[8];
3008 __m128i err0 = PrepareAverages_AVX2( a, plane.sum4 );
3009
3010 // Get index of minimum error (err0)
3011 __m128i err1 = _mm_shuffle_epi32( err0, _MM_SHUFFLE( 2, 3, 0, 1 ) );
3012 __m128i errMin0 = _mm_min_epu32( err0, err1 );
3013
3014 __m128i errMin1 = _mm_shuffle_epi32( errMin0, _MM_SHUFFLE( 1, 0, 3, 2 ) );
3015 __m128i errMin2 = _mm_min_epu32( errMin1, errMin0 );
3016
3017 __m128i errMask = _mm_cmpeq_epi32( errMin2, err0 );
3018
3019 uint32_t mask = _mm_movemask_epi8( errMask );
3020
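 // errMask has every byte of the winning 32-bit lane set, so the position of the first set
 // bit in the byte mask divided by 4 is the index of the minimum error.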
3021 size_t idx = _bit_scan_forward( mask ) >> 2;
3022
3023 d = EncodeAverages_AVX2( a, idx );
3024
3025 alignas( 32 ) uint32_t terr[2][8] = {};
3026 alignas( 32 ) uint32_t tsel[8];
3027
3028 if( ( idx == 0 ) || ( idx == 2 ) )
3029 {
3030 FindBestFit_4x2_AVX2( terr, tsel, a, idx * 2, src );
3031 }
3032 else
3033 {
3034 FindBestFit_2x4_AVX2( terr, tsel, a, idx * 2, src );
3035 }
3036
3037 if( useHeuristics )
3038 {
3039 if( mode == ModeTH )
3040 {
3041 uint64_t result = 0;
3042 uint64_t error = 0;
3043 uint32_t compressed[4] = { 0, 0, 0, 0 };
3044 bool tMode = false;
3045
3046 error = compressBlockTH( (uint8_t*)src, luma, compressed[0], compressed[1], tMode, ch.r8, ch.g8, ch.b8 );
3047 if( tMode )
3048 {
3049 stuff59bits( compressed[0], compressed[1], compressed[2], compressed[3] );
3050 }
3051 else
3052 {
3053 stuff58bits( compressed[0], compressed[1], compressed[2], compressed[3] );
3054 }
3055
3056 result = (uint32_t)_bswap( compressed[2] );
3057 result |= static_cast<uint64_t>( _bswap( compressed[3] ) ) << 32;
3058
3059 plane.plane = result;
3060 plane.error = error;
3061 }
3062 else
3063 {
3064 plane.plane = 0;
3065 plane.error = MaxError;
3066 }
3067 }
3068
3069 return EncodeSelectors_AVX2( d, terr, tsel, ( idx % 2 ) == 1, plane.plane, plane.error );
3070#else
3071 if( useHeuristics )
3072 {
3073#if defined __ARM_NEON && defined __aarch64__
3074 Channels ch = GetChannels( src );
3075 CalculateLuma( ch, luma );
3076#else
3077 CalculateLuma( src, luma );
3078#endif
3079 mode = SelectModeETC2( luma );
3080 }
3081#ifdef __ARM_NEON
3082 auto result = Planar_NEON( src, mode, useHeuristics );
3083#else
3084 auto result = Planar( src, mode, useHeuristics );
3085#endif
3086 if( result.second == 0 ) return result.first;
3087
3088 v4i a[8];
3089 unsigned int err[4] = {};
3090 PrepareAverages( a, src, err );
3091 size_t idx = GetLeastError( err, 4 );
3092 EncodeAverages( d, a, idx );
3093
3094#if ( defined __SSE4_1__ || defined __ARM_NEON ) && !defined REFERENCE_IMPLEMENTATION
3095 uint32_t terr[2][8] = {};
3096#else
3097 uint64_t terr[2][8] = {};
3098#endif
3099 uint16_t tsel[16][8];
3100 auto id = g_id[idx];
3101 FindBestFit( terr, tsel, a, id, src );
3102
3103 if( useHeuristics )
3104 {
3105 if( mode == ModeTH )
3106 {
3107 uint32_t compressed[4] = { 0, 0, 0, 0 };
3108 bool tMode = false;
3109
3110 result.second = compressBlockTH( (uint8_t*)src, luma, compressed[0], compressed[1], tMode );
3111 if( tMode )
3112 {
3113 stuff59bits( compressed[0], compressed[1], compressed[2], compressed[3] );
3114 }
3115 else
3116 {
3117 stuff58bits( compressed[0], compressed[1], compressed[2], compressed[3] );
3118 }
3119
3120 result.first = (uint32_t)_bswap( compressed[2] );
3121 result.first |= static_cast<uint64_t>( _bswap( compressed[3] ) ) << 32;
3122 }
3123 else
3124 {
3125 result.first = 0;
3126 result.second = MaxError;
3127 }
3128 }
3129
3130 return EncodeSelectors( d, terr, tsel, id, result.first, result.second );
3131#endif
3132}
3133
3134#ifdef __SSE4_1__
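// Broadcasts 16-bit lane K of src to all eight lanes, e.g. Widen<3>( x ) returns a vector in
// which every lane equals lane 3 of x.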
3135template<int K>
3136static etcpak_force_inline __m128i Widen( const __m128i src )
3137{
3138 static_assert( K >= 0 && K <= 7, "Index out of range" );
3139
3140 __m128i tmp;
3141 switch( K )
3142 {
3143 case 0:
3144 tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 0, 0, 0, 0 ) );
3145 return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) );
3146 case 1:
3147 tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 1, 1, 1, 1 ) );
3148 return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) );
3149 case 2:
3150 tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 2, 2, 2, 2 ) );
3151 return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) );
3152 case 3:
3153 tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 3, 3, 3, 3 ) );
3154 return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) );
3155 case 4:
3156 tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 0, 0, 0, 0 ) );
3157 return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) );
3158 case 5:
3159 tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 1, 1, 1, 1 ) );
3160 return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) );
3161 case 6:
3162 tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 2, 2, 2, 2 ) );
3163 return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) );
3164 case 7:
3165 tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 3, 3, 3, 3 ) );
3166 return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) );
3167 }
3168}
3169
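// Maps an EAC modifier-table selector (0-15) to the lane of the multiplier vector computed
// for it; several selectors share a multiplier, so only six lanes are used.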
3170static etcpak_force_inline int GetMulSel( int sel )
3171{
3172 switch( sel )
3173 {
3174 case 0:
3175 return 0;
3176 case 1:
3177 case 2:
3178 case 3:
3179 return 1;
3180 case 4:
3181 return 2;
3182 case 5:
3183 case 6:
3184 case 7:
3185 return 3;
3186 case 8:
3187 case 9:
3188 case 10:
3189 case 11:
3190 case 12:
3191 case 13:
3192 return 4;
3193 case 14:
3194 case 15:
3195 return 5;
3196 }
3197}
3198
3199#endif
3200
3201#ifdef __ARM_NEON
3202
3203static constexpr etcpak_force_inline int GetMulSel(int sel)
3204{
3205 return ( sel < 1 ) ? 0 : ( sel < 4 ) ? 1 : ( sel < 5 ) ? 2 : ( sel < 8 ) ? 3 : ( sel < 14 ) ? 4 : 5;
3206}
3207
3208static constexpr int ClampConstant( int x, int min, int max )
3209{
3210 return x < min ? min : x > max ? max : x;
3211}
3212
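// Broadcasts the source alpha of pixel Index and returns the squared difference against each
// of the eight reconstructed values in recVal.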
3213template <int Index>
3214etcpak_force_inline static uint16x8_t ErrorProbe_EAC_NEON( uint8x8_t recVal, uint8x16_t alphaBlock )
3215{
3216 uint8x8_t srcValWide;
3217#ifndef __aarch64__
3218 if( Index < 8 )
3219 srcValWide = vdup_lane_u8( vget_low_u8( alphaBlock ), ClampConstant( Index, 0, 7 ) );
3220 else
3221 srcValWide = vdup_lane_u8( vget_high_u8( alphaBlock ), ClampConstant( Index - 8, 0, 7 ) );
3222#else
3223 srcValWide = vdup_laneq_u8( alphaBlock, Index );
3224#endif
3225
3226 uint8x8_t deltaVal = vabd_u8( srcValWide, recVal );
3227 return vmull_u8( deltaVal, deltaVal );
3228}
3229
3230etcpak_force_inline static uint16_t MinError_EAC_NEON( uint16x8_t errProbe )
3231{
3232#ifndef __aarch64__
3233 uint16x4_t tmpErr = vpmin_u16( vget_low_u16( errProbe ), vget_high_u16( errProbe ) );
3234 tmpErr = vpmin_u16( tmpErr, tmpErr );
3235 return vpmin_u16( tmpErr, tmpErr )[0];
3236#else
3237 return vminvq_u16( errProbe );
3238#endif
3239}
3240
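// Finds which of the eight reconstructed values is closest to the source alpha of pixel Index
// and returns that 3-bit selector shifted into its slot of the 48-bit index field
// (pixel 0 occupies bits 45-47, pixel 15 bits 0-2).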
3241template <int Index>
3242etcpak_force_inline static uint64_t MinErrorIndex_EAC_NEON( uint8x8_t recVal, uint8x16_t alphaBlock )
3243{
3244 uint16x8_t errProbe = ErrorProbe_EAC_NEON<Index>( recVal, alphaBlock );
3245 uint16x8_t minErrMask = vceqq_u16( errProbe, vdupq_n_u16( MinError_EAC_NEON( errProbe ) ) );
3246 uint64_t idx = __builtin_ctzll( vget_lane_u64( vreinterpret_u64_u8( vqmovn_u16( minErrMask ) ), 0 ) );
3247 idx >>= 3;
3248 idx <<= 45 - Index * 3;
3249
3250 return idx;
3251}
3252
3253template <int Index>
3254etcpak_force_inline static int16x8_t WidenMultiplier_EAC_NEON( int16x8_t multipliers )
3255{
3256 constexpr int Lane = GetMulSel( Index );
3257#ifndef __aarch64__
3258 if( Lane < 4 )
3259 return vdupq_lane_s16( vget_low_s16( multipliers ), ClampConstant( Lane, 0, 3 ) );
3260 else
3261 return vdupq_lane_s16( vget_high_s16( multipliers ), ClampConstant( Lane - 4, 0, 3 ) );
3262#else
3263 return vdupq_laneq_s16( multipliers, Lane );
3264#endif
3265}
3266
3267#endif
3268
3269static etcpak_force_inline uint64_t ProcessAlpha_ETC2( const uint8_t* src )
3270{
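 // EAC alpha block: after an early-out for solid blocks, derive the block midpoint and a
 // per-table multiplier from the value range, reconstruct candidate values for each of the
 // 16 modifier tables, keep the table with the lowest total squared error, and emit the base
 // value, multiplier, table index and sixteen 3-bit selectors.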
3271#if defined __SSE4_1__
3272 // Check solid
3273 __m128i s = _mm_loadu_si128( (__m128i*)src );
3274 __m128i solidCmp = _mm_set1_epi8( src[0] );
3275 __m128i cmpRes = _mm_cmpeq_epi8( s, solidCmp );
3276 if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) )
3277 {
3278 return src[0];
3279 }
3280
3281 // Calculate min, max
3282 __m128i s1 = _mm_shuffle_epi32( s, _MM_SHUFFLE( 2, 3, 0, 1 ) );
3283 __m128i max1 = _mm_max_epu8( s, s1 );
3284 __m128i min1 = _mm_min_epu8( s, s1 );
3285 __m128i smax2 = _mm_shuffle_epi32( max1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
3286 __m128i smin2 = _mm_shuffle_epi32( min1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
3287 __m128i max2 = _mm_max_epu8( max1, smax2 );
3288 __m128i min2 = _mm_min_epu8( min1, smin2 );
3289 __m128i smax3 = _mm_alignr_epi8( max2, max2, 2 );
3290 __m128i smin3 = _mm_alignr_epi8( min2, min2, 2 );
3291 __m128i max3 = _mm_max_epu8( max2, smax3 );
3292 __m128i min3 = _mm_min_epu8( min2, smin3 );
3293 __m128i smax4 = _mm_alignr_epi8( max3, max3, 1 );
3294 __m128i smin4 = _mm_alignr_epi8( min3, min3, 1 );
3295 __m128i max = _mm_max_epu8( max3, smax4 );
3296 __m128i min = _mm_min_epu8( min3, smin4 );
3297 __m128i max16 = _mm_unpacklo_epi8( max, _mm_setzero_si128() );
3298 __m128i min16 = _mm_unpacklo_epi8( min, _mm_setzero_si128() );
3299
3300 // src range, mid
3301 __m128i srcRange = _mm_sub_epi16( max16, min16 );
3302 __m128i srcRangeHalf = _mm_srli_epi16( srcRange, 1 );
3303 __m128i srcMid = _mm_add_epi16( min16, srcRangeHalf );
3304
3305 // multiplier
3306 __m128i mul1 = _mm_mulhi_epi16( srcRange, g_alphaRange_SIMD );
3307 __m128i mul = _mm_add_epi16( mul1, _mm_set1_epi16( 1 ) );
3308
3309 // wide source
3310 __m128i s16_1 = _mm_shuffle_epi32( s, _MM_SHUFFLE( 3, 2, 3, 2 ) );
3311 __m128i s16[2] = { _mm_unpacklo_epi8( s, _mm_setzero_si128() ), _mm_unpacklo_epi8( s16_1, _mm_setzero_si128() ) };
3312
3313 __m128i sr[16] = {
3314 Widen<0>( s16[0] ),
3315 Widen<1>( s16[0] ),
3316 Widen<2>( s16[0] ),
3317 Widen<3>( s16[0] ),
3318 Widen<4>( s16[0] ),
3319 Widen<5>( s16[0] ),
3320 Widen<6>( s16[0] ),
3321 Widen<7>( s16[0] ),
3322 Widen<0>( s16[1] ),
3323 Widen<1>( s16[1] ),
3324 Widen<2>( s16[1] ),
3325 Widen<3>( s16[1] ),
3326 Widen<4>( s16[1] ),
3327 Widen<5>( s16[1] ),
3328 Widen<6>( s16[1] ),
3329 Widen<7>( s16[1] )
3330 };
3331
3332#ifdef __AVX2__
3333 __m256i srcRangeWide = _mm256_broadcastsi128_si256( srcRange );
3334 __m256i srcMidWide = _mm256_broadcastsi128_si256( srcMid );
3335
3336 __m256i mulWide1 = _mm256_mulhi_epi16( srcRangeWide, g_alphaRange_AVX );
3337 __m256i mulWide = _mm256_add_epi16( mulWide1, _mm256_set1_epi16( 1 ) );
3338
3339 __m256i modMul[8] = {
3340 _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[0] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[0] ) ) ), _mm256_setzero_si256() ),
3341 _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[1] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[1] ) ) ), _mm256_setzero_si256() ),
3342 _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[2] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[2] ) ) ), _mm256_setzero_si256() ),
3343 _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[3] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[3] ) ) ), _mm256_setzero_si256() ),
3344 _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[4] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[4] ) ) ), _mm256_setzero_si256() ),
3345 _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[5] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[5] ) ) ), _mm256_setzero_si256() ),
3346 _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[6] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[6] ) ) ), _mm256_setzero_si256() ),
3347 _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[7] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[7] ) ) ), _mm256_setzero_si256() ),
3348 };
3349
3350 // find selector
3351 __m256i mulErr = _mm256_setzero_si256();
3352 for( int j=0; j<16; j++ )
3353 {
3354 __m256i s16Wide = _mm256_broadcastsi128_si256( sr[j] );
3355 __m256i err1, err2;
3356
3357 err1 = _mm256_sub_epi16( s16Wide, modMul[0] );
3358 __m256i localErr = _mm256_mullo_epi16( err1, err1 );
3359
3360 err1 = _mm256_sub_epi16( s16Wide, modMul[1] );
3361 err2 = _mm256_mullo_epi16( err1, err1 );
3362 localErr = _mm256_min_epu16( localErr, err2 );
3363
3364 err1 = _mm256_sub_epi16( s16Wide, modMul[2] );
3365 err2 = _mm256_mullo_epi16( err1, err1 );
3366 localErr = _mm256_min_epu16( localErr, err2 );
3367
3368 err1 = _mm256_sub_epi16( s16Wide, modMul[3] );
3369 err2 = _mm256_mullo_epi16( err1, err1 );
3370 localErr = _mm256_min_epu16( localErr, err2 );
3371
3372 err1 = _mm256_sub_epi16( s16Wide, modMul[4] );
3373 err2 = _mm256_mullo_epi16( err1, err1 );
3374 localErr = _mm256_min_epu16( localErr, err2 );
3375
3376 err1 = _mm256_sub_epi16( s16Wide, modMul[5] );
3377 err2 = _mm256_mullo_epi16( err1, err1 );
3378 localErr = _mm256_min_epu16( localErr, err2 );
3379
3380 err1 = _mm256_sub_epi16( s16Wide, modMul[6] );
3381 err2 = _mm256_mullo_epi16( err1, err1 );
3382 localErr = _mm256_min_epu16( localErr, err2 );
3383
3384 err1 = _mm256_sub_epi16( s16Wide, modMul[7] );
3385 err2 = _mm256_mullo_epi16( err1, err1 );
3386 localErr = _mm256_min_epu16( localErr, err2 );
3387
3388 // note that the saturating add may clamp very large error totals, but since we're looking for the smallest error, it shouldn't matter
3389 mulErr = _mm256_adds_epu16( mulErr, localErr );
3390 }
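 // Each 16-bit lane of mulErr now holds the accumulated error of one of the 16 candidate
 // tables (lanes 0-7 in the low half, 8-15 in the high half); minpos over each half and a
 // final compare pick the winner.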
3391 uint64_t minPos1 = _mm_cvtsi128_si64( _mm_minpos_epu16( _mm256_castsi256_si128( mulErr ) ) );
3392 uint64_t minPos2 = _mm_cvtsi128_si64( _mm_minpos_epu16( _mm256_extracti128_si256( mulErr, 1 ) ) );
3393 int sel = ( ( minPos1 & 0xFFFF ) < ( minPos2 & 0xFFFF ) ) ? ( minPos1 >> 16 ) : ( 8 + ( minPos2 >> 16 ) );
3394
3395 __m128i recVal16;
3396 switch( sel )
3397 {
3398 case 0:
3399 recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ) ), _mm_setzero_si128() );
3400 break;
3401 case 1:
3402 recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ) ), _mm_setzero_si128() );
3403 break;
3404 case 2:
3405 recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ) ), _mm_setzero_si128() );
3406 break;
3407 case 3:
3408 recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ) ), _mm_setzero_si128() );
3409 break;
3410 case 4:
3411 recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ) ), _mm_setzero_si128() );
3412 break;
3413 case 5:
3414 recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ) ), _mm_setzero_si128() );
3415 break;
3416 case 6:
3417 recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ) ), _mm_setzero_si128() );
3418 break;
3419 case 7:
3420 recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ) ), _mm_setzero_si128() );
3421 break;
3422 case 8:
3423 recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ) ), _mm_setzero_si128() );
3424 break;
3425 case 9:
3426 recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ) ), _mm_setzero_si128() );
3427 break;
3428 case 10:
3429 recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ) ), _mm_setzero_si128() );
3430 break;
3431 case 11:
3432 recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ) ), _mm_setzero_si128() );
3433 break;
3434 case 12:
3435 recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ) ), _mm_setzero_si128() );
3436 break;
3437 case 13:
3438 recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ) ), _mm_setzero_si128() );
3439 break;
3440 case 14:
3441 recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ) ), _mm_setzero_si128() );
3442 break;
3443 case 15:
3444 recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ) ), _mm_setzero_si128() );
3445 break;
3446 default:
3447 assert( false );
3448 break;
3449 }
3450#else
3451 // reconstructed values for all 16 tables; the Widen<> lane used for entry r is GetMulSel( r )
3452 __m128i rangeMul[16] = {
3453 _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ) ), _mm_setzero_si128() ),
3454 _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ) ), _mm_setzero_si128() ),
3455 _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ) ), _mm_setzero_si128() ),
3456 _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ) ), _mm_setzero_si128() ),
3457 _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ) ), _mm_setzero_si128() ),
3458 _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ) ), _mm_setzero_si128() ),
3459 _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ) ), _mm_setzero_si128() ),
3460 _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ) ), _mm_setzero_si128() ),
3461 _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ) ), _mm_setzero_si128() ),
3462 _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ) ), _mm_setzero_si128() ),
3463 _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ) ), _mm_setzero_si128() ),
3464 _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ) ), _mm_setzero_si128() ),
3465 _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ) ), _mm_setzero_si128() ),
3466 _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ) ), _mm_setzero_si128() ),
3467 _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ) ), _mm_setzero_si128() ),
3468 _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ) ), _mm_setzero_si128() )
3469 };
3470
3471 // find selector
3472 int err = std::numeric_limits<int>::max();
3473 int sel = 0;
3474 for( int r=0; r<16; r++ )
3475 {
3476 __m128i err1, err2, minerr;
3477 __m128i recVal16 = rangeMul[r];
3478 int rangeErr;
3479
3480 err1 = _mm_sub_epi16( sr[0], recVal16 );
3481 err2 = _mm_mullo_epi16( err1, err1 );
3482 minerr = _mm_minpos_epu16( err2 );
3483 rangeErr = _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3484
3485 err1 = _mm_sub_epi16( sr[1], recVal16 );
3486 err2 = _mm_mullo_epi16( err1, err1 );
3487 minerr = _mm_minpos_epu16( err2 );
3488 rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3489
3490 err1 = _mm_sub_epi16( sr[2], recVal16 );
3491 err2 = _mm_mullo_epi16( err1, err1 );
3492 minerr = _mm_minpos_epu16( err2 );
3493 rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3494
3495 err1 = _mm_sub_epi16( sr[3], recVal16 );
3496 err2 = _mm_mullo_epi16( err1, err1 );
3497 minerr = _mm_minpos_epu16( err2 );
3498 rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3499
3500 err1 = _mm_sub_epi16( sr[4], recVal16 );
3501 err2 = _mm_mullo_epi16( err1, err1 );
3502 minerr = _mm_minpos_epu16( err2 );
3503 rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3504
3505 err1 = _mm_sub_epi16( sr[5], recVal16 );
3506 err2 = _mm_mullo_epi16( err1, err1 );
3507 minerr = _mm_minpos_epu16( err2 );
3508 rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3509
3510 err1 = _mm_sub_epi16( sr[6], recVal16 );
3511 err2 = _mm_mullo_epi16( err1, err1 );
3512 minerr = _mm_minpos_epu16( err2 );
3513 rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3514
3515 err1 = _mm_sub_epi16( sr[7], recVal16 );
3516 err2 = _mm_mullo_epi16( err1, err1 );
3517 minerr = _mm_minpos_epu16( err2 );
3518 rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3519
3520 err1 = _mm_sub_epi16( sr[8], recVal16 );
3521 err2 = _mm_mullo_epi16( err1, err1 );
3522 minerr = _mm_minpos_epu16( err2 );
3523 rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3524
3525 err1 = _mm_sub_epi16( sr[9], recVal16 );
3526 err2 = _mm_mullo_epi16( err1, err1 );
3527 minerr = _mm_minpos_epu16( err2 );
3528 rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3529
3530 err1 = _mm_sub_epi16( sr[10], recVal16 );
3531 err2 = _mm_mullo_epi16( err1, err1 );
3532 minerr = _mm_minpos_epu16( err2 );
3533 rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3534
3535 err1 = _mm_sub_epi16( sr[11], recVal16 );
3536 err2 = _mm_mullo_epi16( err1, err1 );
3537 minerr = _mm_minpos_epu16( err2 );
3538 rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3539
3540 err1 = _mm_sub_epi16( sr[12], recVal16 );
3541 err2 = _mm_mullo_epi16( err1, err1 );
3542 minerr = _mm_minpos_epu16( err2 );
3543 rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3544
3545 err1 = _mm_sub_epi16( sr[13], recVal16 );
3546 err2 = _mm_mullo_epi16( err1, err1 );
3547 minerr = _mm_minpos_epu16( err2 );
3548 rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3549
3550 err1 = _mm_sub_epi16( sr[14], recVal16 );
3551 err2 = _mm_mullo_epi16( err1, err1 );
3552 minerr = _mm_minpos_epu16( err2 );
3553 rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3554
3555 err1 = _mm_sub_epi16( sr[15], recVal16 );
3556 err2 = _mm_mullo_epi16( err1, err1 );
3557 minerr = _mm_minpos_epu16( err2 );
3558 rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3559
3560 if( rangeErr < err )
3561 {
3562 err = rangeErr;
3563 sel = r;
3564 if( err == 0 ) break;
3565 }
3566 }
3567
3568 __m128i recVal16 = rangeMul[sel];
3569#endif
3570
3571 // find indices
3572 __m128i err1, err2, minerr;
3573 uint64_t idx = 0, tmp;
3574
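 // _mm_minpos_epu16 returns the minimum error in bits 0-15 and the index of that lane (the
 // chosen 3-bit selector) in bits 16-18; pixel i's selector lands at bits (15-i)*3 of idx.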
3575 err1 = _mm_sub_epi16( sr[0], recVal16 );
3576 err2 = _mm_mullo_epi16( err1, err1 );
3577 minerr = _mm_minpos_epu16( err2 );
3578 tmp = _mm_cvtsi128_si64( minerr );
3579 idx |= ( tmp >> 16 ) << 15*3;
3580
3581 err1 = _mm_sub_epi16( sr[1], recVal16 );
3582 err2 = _mm_mullo_epi16( err1, err1 );
3583 minerr = _mm_minpos_epu16( err2 );
3584 tmp = _mm_cvtsi128_si64( minerr );
3585 idx |= ( tmp >> 16 ) << 14*3;
3586
3587 err1 = _mm_sub_epi16( sr[2], recVal16 );
3588 err2 = _mm_mullo_epi16( err1, err1 );
3589 minerr = _mm_minpos_epu16( err2 );
3590 tmp = _mm_cvtsi128_si64( minerr );
3591 idx |= ( tmp >> 16 ) << 13*3;
3592
3593 err1 = _mm_sub_epi16( sr[3], recVal16 );
3594 err2 = _mm_mullo_epi16( err1, err1 );
3595 minerr = _mm_minpos_epu16( err2 );
3596 tmp = _mm_cvtsi128_si64( minerr );
3597 idx |= ( tmp >> 16 ) << 12*3;
3598
3599 err1 = _mm_sub_epi16( sr[4], recVal16 );
3600 err2 = _mm_mullo_epi16( err1, err1 );
3601 minerr = _mm_minpos_epu16( err2 );
3602 tmp = _mm_cvtsi128_si64( minerr );
3603 idx |= ( tmp >> 16 ) << 11*3;
3604
3605 err1 = _mm_sub_epi16( sr[5], recVal16 );
3606 err2 = _mm_mullo_epi16( err1, err1 );
3607 minerr = _mm_minpos_epu16( err2 );
3608 tmp = _mm_cvtsi128_si64( minerr );
3609 idx |= ( tmp >> 16 ) << 10*3;
3610
3611 err1 = _mm_sub_epi16( sr[6], recVal16 );
3612 err2 = _mm_mullo_epi16( err1, err1 );
3613 minerr = _mm_minpos_epu16( err2 );
3614 tmp = _mm_cvtsi128_si64( minerr );
3615 idx |= ( tmp >> 16 ) << 9*3;
3616
3617 err1 = _mm_sub_epi16( sr[7], recVal16 );
3618 err2 = _mm_mullo_epi16( err1, err1 );
3619 minerr = _mm_minpos_epu16( err2 );
3620 tmp = _mm_cvtsi128_si64( minerr );
3621 idx |= ( tmp >> 16 ) << 8*3;
3622
3623 err1 = _mm_sub_epi16( sr[8], recVal16 );
3624 err2 = _mm_mullo_epi16( err1, err1 );
3625 minerr = _mm_minpos_epu16( err2 );
3626 tmp = _mm_cvtsi128_si64( minerr );
3627 idx |= ( tmp >> 16 ) << 7*3;
3628
3629 err1 = _mm_sub_epi16( sr[9], recVal16 );
3630 err2 = _mm_mullo_epi16( err1, err1 );
3631 minerr = _mm_minpos_epu16( err2 );
3632 tmp = _mm_cvtsi128_si64( minerr );
3633 idx |= ( tmp >> 16 ) << 6*3;
3634
3635 err1 = _mm_sub_epi16( sr[10], recVal16 );
3636 err2 = _mm_mullo_epi16( err1, err1 );
3637 minerr = _mm_minpos_epu16( err2 );
3638 tmp = _mm_cvtsi128_si64( minerr );
3639 idx |= ( tmp >> 16 ) << 5*3;
3640
3641 err1 = _mm_sub_epi16( sr[11], recVal16 );
3642 err2 = _mm_mullo_epi16( err1, err1 );
3643 minerr = _mm_minpos_epu16( err2 );
3644 tmp = _mm_cvtsi128_si64( minerr );
3645 idx |= ( tmp >> 16 ) << 4*3;
3646
3647 err1 = _mm_sub_epi16( sr[12], recVal16 );
3648 err2 = _mm_mullo_epi16( err1, err1 );
3649 minerr = _mm_minpos_epu16( err2 );
3650 tmp = _mm_cvtsi128_si64( minerr );
3651 idx |= ( tmp >> 16 ) << 3*3;
3652
3653 err1 = _mm_sub_epi16( sr[13], recVal16 );
3654 err2 = _mm_mullo_epi16( err1, err1 );
3655 minerr = _mm_minpos_epu16( err2 );
3656 tmp = _mm_cvtsi128_si64( minerr );
3657 idx |= ( tmp >> 16 ) << 2*3;
3658
3659 err1 = _mm_sub_epi16( sr[14], recVal16 );
3660 err2 = _mm_mullo_epi16( err1, err1 );
3661 minerr = _mm_minpos_epu16( err2 );
3662 tmp = _mm_cvtsi128_si64( minerr );
3663 idx |= ( tmp >> 16 ) << 1*3;
3664
3665 err1 = _mm_sub_epi16( sr[15], recVal16 );
3666 err2 = _mm_mullo_epi16( err1, err1 );
3667 minerr = _mm_minpos_epu16( err2 );
3668 tmp = _mm_cvtsi128_si64( minerr );
3669 idx |= ( tmp >> 16 ) << 0*3;
3670
3671 uint16_t rm[8];
3672 _mm_storeu_si128( (__m128i*)rm, mul );
3673 uint16_t sm = _mm_cvtsi128_si64( srcMid );
3674
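 // Pack the EAC alpha block: base value in the top byte, then the 4-bit multiplier, the
 // 4-bit table selector and the sixteen 3-bit pixel indices; the block is stored byte-swapped.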
3675 uint64_t d = ( uint64_t( sm ) << 56 ) |
3676 ( uint64_t( rm[GetMulSel( sel )] ) << 52 ) |
3677 ( uint64_t( sel ) << 48 ) |
3678 idx;
3679
3680 return _bswap64( d );
3681#elif defined __ARM_NEON
3682
3683 int16x8_t srcMidWide, multipliers;
3684 int srcMid;
3685 uint8x16_t srcAlphaBlock = vld1q_u8( src );
3686 {
3687 uint8_t ref = src[0];
3688 uint8x16_t a0 = vdupq_n_u8( ref );
3689 uint8x16_t r = vceqq_u8( srcAlphaBlock, a0 );
3690 int64x2_t m = vreinterpretq_s64_u8( r );
3691 if( m[0] == -1 && m[1] == -1 )
3692 return ref;
3693
3694 // srcRange
3695#ifdef __aarch64__
3696 uint8_t min = vminvq_u8( srcAlphaBlock );
3697 uint8_t max = vmaxvq_u8( srcAlphaBlock );
3698 uint8_t srcRange = max - min;
3699 multipliers = vqaddq_s16( vshrq_n_s16( vqdmulhq_n_s16( g_alphaRange_NEON, srcRange ), 1 ), vdupq_n_s16( 1 ) );
3700 srcMid = min + srcRange / 2;
3701 srcMidWide = vdupq_n_s16( srcMid );
3702#else
3703 uint8x8_t vmin = vpmin_u8( vget_low_u8( srcAlphaBlock ), vget_high_u8( srcAlphaBlock ) );
3704 vmin = vpmin_u8( vmin, vmin );
3705 vmin = vpmin_u8( vmin, vmin );
3706 vmin = vpmin_u8( vmin, vmin );
3707 uint8x8_t vmax = vpmax_u8( vget_low_u8( srcAlphaBlock ), vget_high_u8( srcAlphaBlock ) );
3708 vmax = vpmax_u8( vmax, vmax );
3709 vmax = vpmax_u8( vmax, vmax );
3710 vmax = vpmax_u8( vmax, vmax );
3711
3712 int16x8_t srcRangeWide = vreinterpretq_s16_u16( vsubl_u8( vmax, vmin ) );
3713 multipliers = vqaddq_s16( vshrq_n_s16( vqdmulhq_s16( g_alphaRange_NEON, srcRangeWide ), 1 ), vdupq_n_s16( 1 ) );
3714 srcMidWide = vsraq_n_s16( vreinterpretq_s16_u16(vmovl_u8(vmin)), srcRangeWide, 1);
3715 srcMid = vgetq_lane_s16( srcMidWide, 0 );
3716#endif
3717 }
3718
3719 // calculate reconstructed values
3720#define EAC_APPLY_16X( m ) m( 0 ) m( 1 ) m( 2 ) m( 3 ) m( 4 ) m( 5 ) m( 6 ) m( 7 ) m( 8 ) m( 9 ) m( 10 ) m( 11 ) m( 12 ) m( 13 ) m( 14 ) m( 15 )
3721
3722#define EAC_RECONSTRUCT_VALUE( n ) vqmovun_s16( vmlaq_s16( srcMidWide, g_alpha_NEON[n], WidenMultiplier_EAC_NEON<n>( multipliers ) ) ),
3723 uint8x8_t recVals[16] = { EAC_APPLY_16X( EAC_RECONSTRUCT_VALUE ) };
3724
3725 // find selector
3726 int err = std::numeric_limits<int>::max();
3727 int sel = 0;
3728 for( int r = 0; r < 16; r++ )
3729 {
3730 uint8x8_t recVal = recVals[r];
3731
3732 int rangeErr = 0;
3733#define EAC_ACCUMULATE_ERROR( n ) rangeErr += MinError_EAC_NEON( ErrorProbe_EAC_NEON<n>( recVal, srcAlphaBlock ) );
3734 EAC_APPLY_16X( EAC_ACCUMULATE_ERROR )
3735
3736 if( rangeErr < err )
3737 {
3738 err = rangeErr;
3739 sel = r;
3740 if ( err == 0 ) break;
3741 }
3742 }
3743
3744 // combine results
3745 uint64_t d = ( uint64_t( srcMid ) << 56 ) |
3746 ( uint64_t( multipliers[GetMulSel( sel )] ) << 52 ) |
3747 ( uint64_t( sel ) << 48);
3748
3749 // generate indices
3750 uint8x8_t recVal = recVals[sel];
3751#define EAC_INSERT_INDEX(n) d |= MinErrorIndex_EAC_NEON<n>( recVal, srcAlphaBlock );
3752 EAC_APPLY_16X( EAC_INSERT_INDEX )
3753
3754 return _bswap64( d );
3755
3756#undef EAC_APPLY_16X
3757#undef EAC_INSERT_INDEX
3758#undef EAC_ACCUMULATE_ERROR
3759#undef EAC_RECONSTRUCT_VALUE
3760
3761#else
3762 {
3763 bool solid = true;
3764 const uint8_t* ptr = src + 1;
3765 const uint8_t ref = *src;
3766 for( int i=1; i<16; i++ )
3767 {
3768 if( ref != *ptr++ )
3769 {
3770 solid = false;
3771 break;
3772 }
3773 }
3774 if( solid )
3775 {
3776 return ref;
3777 }
3778 }
3779
3780 uint8_t min = src[0];
3781 uint8_t max = src[0];
3782 for( int i=1; i<16; i++ )
3783 {
3784 if( min > src[i] ) min = src[i];
3785 else if( max < src[i] ) max = src[i];
3786 }
3787 int srcRange = max - min;
3788 int srcMid = min + srcRange / 2;
3789
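 // For every modifier table r, reconstruct clampu8( srcMid + g_alpha[r][j] * mul ) for each
 // modifier j, record the best index per pixel in buf[r], and keep the table with the lowest
 // total squared error.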
3790 uint8_t buf[16][16];
3791 int err = std::numeric_limits<int>::max();
3792 int sel = 0;
3793 int selmul = 0;
3794 for( int r=0; r<16; r++ )
3795 {
3796 int mul = ( ( srcRange * g_alphaRange[r] ) >> 16 ) + 1;
3797
3798 int rangeErr = 0;
3799 for( int i=0; i<16; i++ )
3800 {
3801 const auto srcVal = src[i];
3802
3803 int idx = 0;
3804 const auto modVal = g_alpha[r][0] * mul;
3805 const auto recVal = clampu8( srcMid + modVal );
3806 int localErr = sq( srcVal - recVal );
3807
3808 if( localErr != 0 )
3809 {
3810 for( int j=1; j<8; j++ )
3811 {
3812 const auto modVal = g_alpha[r][j] * mul;
3813 const auto recVal = clampu8( srcMid + modVal );
3814 const auto errProbe = sq( srcVal - recVal );
3815 if( errProbe < localErr )
3816 {
3817 localErr = errProbe;
3818 idx = j;
3819 }
3820 }
3821 }
3822
3823 buf[r][i] = idx;
3824 rangeErr += localErr;
3825 }
3826
3827 if( rangeErr < err )
3828 {
3829 err = rangeErr;
3830 sel = r;
3831 selmul = mul;
3832 if( err == 0 ) break;
3833 }
3834 }
3835
3836 uint64_t d = ( uint64_t( srcMid ) << 56 ) |
3837 ( uint64_t( selmul ) << 52 ) |
3838 ( uint64_t( sel ) << 48 );
3839
3840 int offset = 45;
3841 auto ptr = buf[sel];
3842 for( int i=0; i<16; i++ )
3843 {
3844 d |= uint64_t( *ptr++ ) << offset;
3845 offset -= 3;
3846 }
3847
3848 return _bswap64( d );
3849#endif
3850}
3851
3852
3853void CompressEtc1Alpha( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
3854{
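 // Gather one 4x4 block per iteration: the four source rows are transposed into buf, the
 // alpha byte of each pixel is replicated into the three color channels, and the result is
 // fed to the ETC1 color encoder; w tracks the block column so src can skip ahead to the
 // next row of blocks.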
3855 int w = 0;
3856 uint32_t buf[4*4];
3857 do
3858 {
3859#ifdef __SSE4_1__
3860 __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
3861 __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
3862 __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
3863 __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
3864
3865 _MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
3866
3867 __m128i c0 = _mm_castps_si128( px0 );
3868 __m128i c1 = _mm_castps_si128( px1 );
3869 __m128i c2 = _mm_castps_si128( px2 );
3870 __m128i c3 = _mm_castps_si128( px3 );
3871
3872 __m128i mask = _mm_setr_epi32( 0x03030303, 0x07070707, 0x0b0b0b0b, 0x0f0f0f0f );
3873 __m128i p0 = _mm_shuffle_epi8( c0, mask );
3874 __m128i p1 = _mm_shuffle_epi8( c1, mask );
3875 __m128i p2 = _mm_shuffle_epi8( c2, mask );
3876 __m128i p3 = _mm_shuffle_epi8( c3, mask );
3877
3878 _mm_store_si128( (__m128i*)(buf + 0), p0 );
3879 _mm_store_si128( (__m128i*)(buf + 4), p1 );
3880 _mm_store_si128( (__m128i*)(buf + 8), p2 );
3881 _mm_store_si128( (__m128i*)(buf + 12), p3 );
3882
3883 src += 4;
3884#else
3885 auto ptr = buf;
3886 for( int x=0; x<4; x++ )
3887 {
3888 unsigned int a = *src >> 24;
3889 *ptr++ = a | ( a << 8 ) | ( a << 16 );
3890 src += width;
3891 a = *src >> 24;
3892 *ptr++ = a | ( a << 8 ) | ( a << 16 );
3893 src += width;
3894 a = *src >> 24;
3895 *ptr++ = a | ( a << 8 ) | ( a << 16 );
3896 src += width;
3897 a = *src >> 24;
3898 *ptr++ = a | ( a << 8 ) | ( a << 16 );
3899 src -= width * 3 - 1;
3900 }
3901#endif
3902 if( ++w == width/4 )
3903 {
3904 src += width * 3;
3905 w = 0;
3906 }
3907 *dst++ = ProcessRGB( (uint8_t*)buf );
3908 }
3909 while( --blocks );
3910}
3911
3912void CompressEtc2Alpha( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics )
3913{
3914 int w = 0;
3915 uint32_t buf[4*4];
3916 do
3917 {
3918#ifdef __SSE4_1__
3919 __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
3920 __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
3921 __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
3922 __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
3923
3924 _MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
3925
3926 __m128i c0 = _mm_castps_si128( px0 );
3927 __m128i c1 = _mm_castps_si128( px1 );
3928 __m128i c2 = _mm_castps_si128( px2 );
3929 __m128i c3 = _mm_castps_si128( px3 );
3930
3931 __m128i mask = _mm_setr_epi32( 0x03030303, 0x07070707, 0x0b0b0b0b, 0x0f0f0f0f );
3932 __m128i p0 = _mm_shuffle_epi8( c0, mask );
3933 __m128i p1 = _mm_shuffle_epi8( c1, mask );
3934 __m128i p2 = _mm_shuffle_epi8( c2, mask );
3935 __m128i p3 = _mm_shuffle_epi8( c3, mask );
3936
3937 _mm_store_si128( (__m128i*)(buf + 0), p0 );
3938 _mm_store_si128( (__m128i*)(buf + 4), p1 );
3939 _mm_store_si128( (__m128i*)(buf + 8), p2 );
3940 _mm_store_si128( (__m128i*)(buf + 12), p3 );
3941
3942 src += 4;
3943#else
3944 auto ptr = buf;
3945 for( int x=0; x<4; x++ )
3946 {
3947 unsigned int a = *src >> 24;
3948 *ptr++ = a | ( a << 8 ) | ( a << 16 );
3949 src += width;
3950 a = *src >> 24;
3951 *ptr++ = a | ( a << 8 ) | ( a << 16 );
3952 src += width;
3953 a = *src >> 24;
3954 *ptr++ = a | ( a << 8 ) | ( a << 16 );
3955 src += width;
3956 a = *src >> 24;
3957 *ptr++ = a | ( a << 8 ) | ( a << 16 );
3958 src -= width * 3 - 1;
3959 }
3960#endif
3961 if( ++w == width/4 )
3962 {
3963 src += width * 3;
3964 w = 0;
3965 }
3966 *dst++ = ProcessRGB_ETC2( (uint8_t*)buf, useHeuristics );
3967 }
3968 while( --blocks );
3969}
3970
3974void CompressEtc1Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
3975{
3976 int w = 0;
3977 uint32_t buf[4*4];
3978 do
3979 {
3980#ifdef __SSE4_1__
3981 __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
3982 __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
3983 __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
3984 __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
3985
3986 _MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
3987
3988 _mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) );
3989 _mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) );
3990 _mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) );
3991 _mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) );
3992
3993 src += 4;
3994#else
3995 auto ptr = buf;
3996 for( int x=0; x<4; x++ )
3997 {
3998 *ptr++ = *src;
3999 src += width;
4000 *ptr++ = *src;
4001 src += width;
4002 *ptr++ = *src;
4003 src += width;
4004 *ptr++ = *src;
4005 src -= width * 3 - 1;
4006 }
4007#endif
4008 if( ++w == width/4 )
4009 {
4010 src += width * 3;
4011 w = 0;
4012 }
4013 *dst++ = ProcessRGB( (uint8_t*)buf );
4014 }
4015 while( --blocks );
4016}
4017
4018void CompressEtc1RgbDither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
4019{
4020 int w = 0;
4021 uint32_t buf[4*4];
4022 do
4023 {
4024#ifdef __SSE4_1__
4025 __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
4026 __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
4027 __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
4028 __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
4029
4030 _MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
4031
4032# ifdef __AVX2__
4033 DitherAvx2( (uint8_t*)buf, _mm_castps_si128( px0 ), _mm_castps_si128( px1 ), _mm_castps_si128( px2 ), _mm_castps_si128( px3 ) );
4034# else
4035 _mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) );
4036 _mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) );
4037 _mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) );
4038 _mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) );
4039
4040 Dither( (uint8_t*)buf );
4041# endif
4042
4043 src += 4;
4044#else
4045 auto ptr = buf;
4046 for( int x=0; x<4; x++ )
4047 {
4048 *ptr++ = *src;
4049 src += width;
4050 *ptr++ = *src;
4051 src += width;
4052 *ptr++ = *src;
4053 src += width;
4054 *ptr++ = *src;
4055 src -= width * 3 - 1;
4056 }
4057#endif
4058 if( ++w == width/4 )
4059 {
4060 src += width * 3;
4061 w = 0;
4062 }
4063 *dst++ = ProcessRGB( (uint8_t*)buf );
4064 }
4065 while( --blocks );
4066}
4067
4068void CompressEtc2Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics )
4069{
4070 int w = 0;
4071 uint32_t buf[4*4];
4072 do
4073 {
4074#ifdef __SSE4_1__
4075 __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
4076 __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
4077 __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
4078 __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
4079
4080 _MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
4081
4082 _mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) );
4083 _mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) );
4084 _mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) );
4085 _mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) );
4086
4087 src += 4;
4088#else
4089 auto ptr = buf;
4090 for( int x=0; x<4; x++ )
4091 {
4092 *ptr++ = *src;
4093 src += width;
4094 *ptr++ = *src;
4095 src += width;
4096 *ptr++ = *src;
4097 src += width;
4098 *ptr++ = *src;
4099 src -= width * 3 - 1;
4100 }
4101#endif
4102 if( ++w == width/4 )
4103 {
4104 src += width * 3;
4105 w = 0;
4106 }
4107 *dst++ = ProcessRGB_ETC2( (uint8_t*)buf, useHeuristics );
4108 }
4109 while( --blocks );
4110}
4111
4112void CompressEtc2Rgba( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics )
4113{
4114 int w = 0;
4115 uint32_t rgba[4*4];
4116 uint8_t alpha[4*4];
4117 do
4118 {
4119#ifdef __SSE4_1__
4120 __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
4121 __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
4122 __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
4123 __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
4124
4125 _MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
4126
4127 __m128i c0 = _mm_castps_si128( px0 );
4128 __m128i c1 = _mm_castps_si128( px1 );
4129 __m128i c2 = _mm_castps_si128( px2 );
4130 __m128i c3 = _mm_castps_si128( px3 );
4131
4132 _mm_store_si128( (__m128i*)(rgba + 0), c0 );
4133 _mm_store_si128( (__m128i*)(rgba + 4), c1 );
4134 _mm_store_si128( (__m128i*)(rgba + 8), c2 );
4135 _mm_store_si128( (__m128i*)(rgba + 12), c3 );
4136
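 // The shuffle mask gathers bytes 3, 7, 11 and 15 (the four alpha values of one transposed
 // column) into a single 32-bit lane; the three rotated copies of the mask place the other
 // columns' alphas in lanes 1-3, so OR-ing the four results yields all 16 alpha bytes.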
4137 __m128i mask = _mm_setr_epi32( 0x0f0b0703, -1, -1, -1 );
4138
4139 __m128i a0 = _mm_shuffle_epi8( c0, mask );
4140 __m128i a1 = _mm_shuffle_epi8( c1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
4141 __m128i a2 = _mm_shuffle_epi8( c2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
4142 __m128i a3 = _mm_shuffle_epi8( c3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
4143
4144 __m128i s0 = _mm_or_si128( a0, a1 );
4145 __m128i s1 = _mm_or_si128( a2, a3 );
4146 __m128i s2 = _mm_or_si128( s0, s1 );
4147
4148 _mm_store_si128( (__m128i*)alpha, s2 );
4149
4150 src += 4;
4151#else
4152 auto ptr = rgba;
4153 auto ptr8 = alpha;
4154 for( int x=0; x<4; x++ )
4155 {
4156 auto v = *src;
4157 *ptr++ = v;
4158 *ptr8++ = v >> 24;
4159 src += width;
4160 v = *src;
4161 *ptr++ = v;
4162 *ptr8++ = v >> 24;
4163 src += width;
4164 v = *src;
4165 *ptr++ = v;
4166 *ptr8++ = v >> 24;
4167 src += width;
4168 v = *src;
4169 *ptr++ = v;
4170 *ptr8++ = v >> 24;
4171 src -= width * 3 - 1;
4172 }
4173#endif
4174 if( ++w == width/4 )
4175 {
4176 src += width * 3;
4177 w = 0;
4178 }
4179 *dst++ = ProcessAlpha_ETC2( alpha );
4180 *dst++ = ProcessRGB_ETC2( (uint8_t*)rgba, useHeuristics );
4181 }
4182 while( --blocks );
4183}
4184