| 1 | #include <array> |
| 2 | #include <string.h> |
| 3 | #include <limits> |
| 4 | #ifdef __ARM_NEON |
| 5 | # include <arm_neon.h> |
| 6 | #endif |
| 7 | |
| 8 | #include "Dither.hpp" |
| 9 | #include "ForceInline.hpp" |
| 10 | #include "Math.hpp" |
| 11 | #include "ProcessCommon.hpp" |
| 12 | #include "ProcessRGB.hpp" |
| 13 | #include "Tables.hpp" |
| 14 | #include "Vector.hpp" |
| 15 | #if defined __SSE4_1__ || defined __AVX2__ || defined _MSC_VER |
| 16 | # ifdef _MSC_VER |
| 17 | # include <intrin.h> |
| 18 | # include <Windows.h> |
| 19 | # define _bswap(x) _byteswap_ulong(x) |
| 20 | # define _bswap64(x) _byteswap_uint64(x) |
| 21 | # else |
| 22 | # include <x86intrin.h> |
| 23 | # endif |
| 24 | #endif |
| 25 | |
| 26 | #ifndef _bswap |
| 27 | # define _bswap(x) __builtin_bswap32(x) |
| 28 | # define _bswap64(x) __builtin_bswap64(x) |
| 29 | #endif |
| 30 | |
| 31 | static const uint32_t MaxError = 1065369600; // ((38+76+14) * 255)^2 |
| 32 | // common T-/H-mode table |
| 33 | static uint8_t tableTH[8] = { 3, 6, 11, 16, 23, 32, 41, 64 }; |
| 34 | |
| 35 | // thresholds for the early compression-mode decision scheme |
| 36 | // default: 0.03, 0.09, and 0.38 |
| 37 | float ecmd_threshold[3] = { 0.03f, 0.09f, 0.38f }; |
| 38 | |
| 39 | static const uint8_t ModeUndecided = 0; |
| 40 | static const uint8_t ModePlanar = 0x1; |
| 41 | static const uint8_t ModeTH = 0x2; |
| 42 | |
| 43 | const unsigned int R = 2; |
| 44 | const unsigned int G = 1; |
| 45 | const unsigned int B = 0; |
| 46 | |
| 47 | struct Luma |
| 48 | { |
| 49 | #ifdef __AVX2__ |
| 50 | float max, min; |
| 51 | uint8_t minIdx = 255, maxIdx = 255; |
| 52 | __m128i luma8; |
| 53 | #elif defined __ARM_NEON && defined __aarch64__ |
| 54 | float max, min; |
| 55 | uint8_t minIdx = 255, maxIdx = 255; |
| 56 | uint8x16_t luma8; |
| 57 | #else |
| 58 | uint8_t max = 0, min = 255, maxIdx = 0, minIdx = 0; |
| 59 | uint8_t val[16]; |
| 60 | #endif |
| 61 | }; |
| 62 | |
| 63 | #ifdef __AVX2__ |
| 64 | struct Plane |
| 65 | { |
| 66 | uint64_t plane; |
| 67 | uint64_t error; |
| 68 | __m256i sum4; |
| 69 | }; |
| 70 | #endif |
| 71 | |
| 72 | #if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__) |
| 73 | struct Channels |
| 74 | { |
| 75 | #ifdef __AVX2__ |
| 76 | __m128i r8, g8, b8; |
| 77 | #elif defined __ARM_NEON && defined __aarch64__ |
| 78 | uint8x16x2_t r, g, b; |
| 79 | #endif |
| 80 | }; |
| 81 | #endif |
| 82 | |
| 83 | namespace |
| 84 | { |
| 85 | static etcpak_force_inline uint8_t clamp( uint8_t min, int16_t val, uint8_t max ) |
| 86 | { |
| 87 | return val < min ? min : ( val > max ? max : val ); |
| 88 | } |
| 89 | |
| 90 | static etcpak_force_inline uint8_t clampMin( uint8_t min, int16_t val ) |
| 91 | { |
| 92 | return val < min ? min : val; |
| 93 | } |
| 94 | |
| 95 | static etcpak_force_inline uint8_t clampMax( int16_t val, uint8_t max ) |
| 96 | { |
| 97 | return val > max ? max : val; |
| 98 | } |
| 99 | |
| 100 | // slightly faster than std::sort |
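// arr2 is permuted in lockstep with arr1; each newly inserted element additionally records
// its original index in arr2, so the sorted values keep track of where they came from.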
| 101 | static void insertionSort( uint8_t* arr1, uint8_t* arr2 ) |
| 102 | { |
| 103 | for( uint8_t i = 1; i < 16; ++i ) |
| 104 | { |
| 105 | uint8_t value = arr1[i]; |
| 106 | uint8_t hole = i; |
| 107 | |
| 108 | for( ; hole > 0 && value < arr1[hole - 1]; --hole ) |
| 109 | { |
| 110 | arr1[hole] = arr1[hole - 1]; |
| 111 | arr2[hole] = arr2[hole - 1]; |
| 112 | } |
| 113 | arr1[hole] = value; |
| 114 | arr2[hole] = i; |
| 115 | } |
| 116 | } |
| 117 | |
| 118 | //converts indices from |a0|a1|e0|e1|i0|i1|m0|m1|b0|b1|f0|f1|j0|j1|n0|n1|c0|c1|g0|g1|k0|k1|o0|o1|d0|d1|h0|h1|l0|l1|p0|p1| previously used by T- and H-modes |
| 119 | // into |p0|o0|n0|m0|l0|k0|j0|i0|h0|g0|f0|e0|d0|c0|b0|a0|p1|o1|n1|m1|l1|k1|j1|i1|h1|g1|f1|e1|d1|c1|b1|a1| which should be used for all modes. |
| 120 | // NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved. |
| 121 | static etcpak_force_inline int indexConversion( int pixelIndices ) |
| 122 | { |
| 123 | int correctIndices = 0; |
| 124 | int LSB[4][4]; |
| 125 | int MSB[4][4]; |
| 126 | int shift = 0; |
| 127 | for( int y = 3; y >= 0; y-- ) |
| 128 | { |
| 129 | for( int x = 3; x >= 0; x-- ) |
| 130 | { |
| 131 | LSB[x][y] = ( pixelIndices >> shift ) & 1; |
| 132 | shift++; |
| 133 | MSB[x][y] = ( pixelIndices >> shift ) & 1; |
| 134 | shift++; |
| 135 | } |
| 136 | } |
| 137 | shift = 0; |
| 138 | for( int x = 0; x < 4; x++ ) |
| 139 | { |
| 140 | for( int y = 0; y < 4; y++ ) |
| 141 | { |
| 142 | correctIndices |= ( LSB[x][y] << shift ); |
| 143 | correctIndices |= ( MSB[x][y] << ( 16 + shift ) ); |
| 144 | shift++; |
| 145 | } |
| 146 | } |
| 147 | return correctIndices; |
| 148 | } |
| 149 | |
| 150 | // Swapping two RGB-colors |
| 151 | // NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved. |
| 152 | static etcpak_force_inline void swapColors( uint8_t( colors )[2][3] ) |
| 153 | { |
| 154 | uint8_t temp = colors[0][R]; |
| 155 | colors[0][R] = colors[1][R]; |
| 156 | colors[1][R] = temp; |
| 157 | |
| 158 | temp = colors[0][G]; |
| 159 | colors[0][G] = colors[1][G]; |
| 160 | colors[1][G] = temp; |
| 161 | |
| 162 | temp = colors[0][B]; |
| 163 | colors[0][B] = colors[1][B]; |
| 164 | colors[1][B] = temp; |
| 165 | } |
| 166 | |
| 167 | |
| 168 | // calculates quantized colors for T or H modes |
| 169 | void compressColor( uint8_t( currColor )[2][3], uint8_t( quantColor )[2][3], bool t_mode ) |
| 170 | { |
| 171 | if( t_mode ) |
| 172 | { |
| 173 | quantColor[0][R] = clampMax( 15 * ( currColor[0][R] + 8 ) / 255, 15 ); |
| 174 | quantColor[0][G] = clampMax( 15 * ( currColor[0][G] + 8 ) / 255, 15 ); |
| 175 | quantColor[0][B] = clampMax( 15 * ( currColor[0][B] + 8 ) / 255, 15 ); |
| 176 | } |
| 177 | else // clamped to [1,14] to get a wider range |
| 178 | { |
| 179 | quantColor[0][R] = clamp( 1, 15 * ( currColor[0][R] + 8 ) / 255, 14 ); |
| 180 | quantColor[0][G] = clamp( 1, 15 * ( currColor[0][G] + 8 ) / 255, 14 ); |
| 181 | quantColor[0][B] = clamp( 1, 15 * ( currColor[0][B] + 8 ) / 255, 14 ); |
| 182 | } |
| 183 | |
| 184 | // clamped to [1,14] to get a wider range |
| 185 | quantColor[1][R] = clamp( 1, 15 * ( currColor[1][R] + 8 ) / 255, 14 ); |
| 186 | quantColor[1][G] = clamp( 1, 15 * ( currColor[1][G] + 8 ) / 255, 14 ); |
| 187 | quantColor[1][B] = clamp( 1, 15 * ( currColor[1][B] + 8 ) / 255, 14 ); |
| 188 | } |
| 189 | |
// The following three decoding functions come from ETCPACK v2.74 and are slightly changed.
| 191 | static etcpak_force_inline void decompressColor( uint8_t( colorsRGB444 )[2][3], uint8_t( colors )[2][3] ) |
| 192 | { |
| 193 | // The color should be retrieved as: |
| 194 | // |
// c = round(255/(2^r_bits - 1)) * comp_color
| 196 | // |
| 197 | // This is similar to bit replication |
| 198 | // |
// Note -- this code only works for bit replication from 4 bits and up --- 3 bits needs
| 200 | // two copy operations. |
| 201 | colors[0][R] = ( colorsRGB444[0][R] << 4 ) | colorsRGB444[0][R]; |
| 202 | colors[0][G] = ( colorsRGB444[0][G] << 4 ) | colorsRGB444[0][G]; |
| 203 | colors[0][B] = ( colorsRGB444[0][B] << 4 ) | colorsRGB444[0][B]; |
| 204 | colors[1][R] = ( colorsRGB444[1][R] << 4 ) | colorsRGB444[1][R]; |
| 205 | colors[1][G] = ( colorsRGB444[1][G] << 4 ) | colorsRGB444[1][G]; |
| 206 | colors[1][B] = ( colorsRGB444[1][B] << 4 ) | colorsRGB444[1][B]; |
| 207 | } |
| 208 | |
| 209 | // calculates the paint colors from the block colors |
| 210 | // using a distance d and one of the H- or T-patterns. |
| 211 | static void calculatePaintColors59T( uint8_t d, uint8_t( colors )[2][3], uint8_t( pColors )[4][3] ) |
| 212 | { |
| 213 | ////////////////////////////////////////////// |
| 214 | // |
| 215 | // C3 C1 C4----C1---C2 |
| 216 | // | | | |
| 217 | // | | | |
| 218 | // |-------| | |
| 219 | // | | | |
| 220 | // | | | |
| 221 | // C4 C2 C3 |
| 222 | // |
| 223 | ////////////////////////////////////////////// |
| 224 | |
| 225 | // C4 |
| 226 | pColors[3][R] = clampMin( 0, colors[1][R] - tableTH[d] ); |
| 227 | pColors[3][G] = clampMin( 0, colors[1][G] - tableTH[d] ); |
| 228 | pColors[3][B] = clampMin( 0, colors[1][B] - tableTH[d] ); |
| 229 | |
| 230 | // C3 |
| 231 | pColors[0][R] = colors[0][R]; |
| 232 | pColors[0][G] = colors[0][G]; |
| 233 | pColors[0][B] = colors[0][B]; |
| 234 | // C2 |
| 235 | pColors[1][R] = clampMax( colors[1][R] + tableTH[d], 255 ); |
| 236 | pColors[1][G] = clampMax( colors[1][G] + tableTH[d], 255 ); |
| 237 | pColors[1][B] = clampMax( colors[1][B] + tableTH[d], 255 ); |
| 238 | // C1 |
| 239 | pColors[2][R] = colors[1][R]; |
| 240 | pColors[2][G] = colors[1][G]; |
| 241 | pColors[2][B] = colors[1][B]; |
| 242 | } |
| 243 | |
| 244 | static void calculatePaintColors58H( uint8_t d, uint8_t( colors )[2][3], uint8_t( pColors )[4][3] ) |
| 245 | { |
| 246 | pColors[3][R] = clampMin( 0, colors[1][R] - tableTH[d] ); |
| 247 | pColors[3][G] = clampMin( 0, colors[1][G] - tableTH[d] ); |
| 248 | pColors[3][B] = clampMin( 0, colors[1][B] - tableTH[d] ); |
| 249 | |
| 250 | // C1 |
| 251 | pColors[0][R] = clampMax( colors[0][R] + tableTH[d], 255 ); |
| 252 | pColors[0][G] = clampMax( colors[0][G] + tableTH[d], 255 ); |
| 253 | pColors[0][B] = clampMax( colors[0][B] + tableTH[d], 255 ); |
| 254 | // C2 |
| 255 | pColors[1][R] = clampMin( 0, colors[0][R] - tableTH[d] ); |
| 256 | pColors[1][G] = clampMin( 0, colors[0][G] - tableTH[d] ); |
| 257 | pColors[1][B] = clampMin( 0, colors[0][B] - tableTH[d] ); |
| 258 | // C3 |
| 259 | pColors[2][R] = clampMax( colors[1][R] + tableTH[d], 255 ); |
| 260 | pColors[2][G] = clampMax( colors[1][G] + tableTH[d], 255 ); |
| 261 | pColors[2][B] = clampMax( colors[1][B] + tableTH[d], 255 ); |
| 262 | } |
| 263 | |
| 264 | #if defined _MSC_VER && !defined __clang__ |
| 265 | static etcpak_force_inline unsigned long _bit_scan_forward( unsigned long mask ) |
| 266 | { |
| 267 | unsigned long ret; |
| 268 | _BitScanForward( &ret, mask ); |
| 269 | return ret; |
| 270 | } |
| 271 | #endif |
| 272 | |
| 273 | typedef std::array<uint16_t, 4> v4i; |
| 274 | |
| 275 | #ifdef __AVX2__ |
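// Sums the RGB channels of each 2x2 quadrant of the block and combines the quadrant sums into
// the four half-block sums (see the quadrant annotations below); each sum occupies one 64-bit
// quarter of the result as 16-bit lanes, with the masked-out alpha contributing zero.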
| 276 | static etcpak_force_inline __m256i Sum4_AVX2( const uint8_t* data) noexcept |
| 277 | { |
| 278 | __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0); |
| 279 | __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1); |
| 280 | __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2); |
| 281 | __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3); |
| 282 | |
| 283 | __m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF)); |
| 284 | __m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF)); |
| 285 | __m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF)); |
| 286 | __m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF)); |
| 287 | |
| 288 | __m256i t0 = _mm256_cvtepu8_epi16(dm0); |
| 289 | __m256i t1 = _mm256_cvtepu8_epi16(dm1); |
| 290 | __m256i t2 = _mm256_cvtepu8_epi16(dm2); |
| 291 | __m256i t3 = _mm256_cvtepu8_epi16(dm3); |
| 292 | |
| 293 | __m256i sum0 = _mm256_add_epi16(t0, t1); |
| 294 | __m256i sum1 = _mm256_add_epi16(t2, t3); |
| 295 | |
| 296 | __m256i s0 = _mm256_permute2x128_si256(sum0, sum1, (0) | (3 << 4)); // 0, 0, 3, 3 |
| 297 | __m256i s1 = _mm256_permute2x128_si256(sum0, sum1, (1) | (2 << 4)); // 1, 1, 2, 2 |
| 298 | |
| 299 | __m256i s2 = _mm256_permute4x64_epi64(s0, _MM_SHUFFLE(1, 3, 0, 2)); |
| 300 | __m256i s3 = _mm256_permute4x64_epi64(s0, _MM_SHUFFLE(0, 2, 1, 3)); |
| 301 | __m256i s4 = _mm256_permute4x64_epi64(s1, _MM_SHUFFLE(3, 1, 0, 2)); |
| 302 | __m256i s5 = _mm256_permute4x64_epi64(s1, _MM_SHUFFLE(2, 0, 1, 3)); |
| 303 | |
| 304 | __m256i sum5 = _mm256_add_epi16(s2, s3); // 3, 0, 3, 0 |
| 305 | __m256i sum6 = _mm256_add_epi16(s4, s5); // 2, 1, 1, 2 |
    return _mm256_add_epi16(sum5, sum6); // 3+2, 0+1, 3+1, 0+2
| 307 | } |
| 308 | |
| 309 | static etcpak_force_inline __m256i Average_AVX2( const __m256i data) noexcept |
| 310 | { |
| 311 | __m256i a = _mm256_add_epi16(data, _mm256_set1_epi16(4)); |
| 312 | |
| 313 | return _mm256_srli_epi16(a, 3); |
| 314 | } |
| 315 | |
| 316 | static etcpak_force_inline __m128i CalcErrorBlock_AVX2( const __m256i data, const v4i a[8]) noexcept |
| 317 | { |
| 318 | // |
| 319 | __m256i a0 = _mm256_load_si256((__m256i*)a[0].data()); |
| 320 | __m256i a1 = _mm256_load_si256((__m256i*)a[4].data()); |
| 321 | |
| 322 | // err = 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) ); |
| 323 | __m256i a4 = _mm256_madd_epi16(a0, a0); |
| 324 | __m256i a5 = _mm256_madd_epi16(a1, a1); |
| 325 | |
| 326 | __m256i a6 = _mm256_hadd_epi32(a4, a5); |
| 327 | __m256i a7 = _mm256_slli_epi32(a6, 3); |
| 328 | |
| 329 | __m256i a8 = _mm256_add_epi32(a7, _mm256_set1_epi32(0x3FFFFFFF)); // Big value to prevent negative values, but small enough to prevent overflow |
| 330 | |
| 331 | // average is not swapped |
| 332 | // err -= block[0] * 2 * average[0]; |
| 333 | // err -= block[1] * 2 * average[1]; |
| 334 | // err -= block[2] * 2 * average[2]; |
| 335 | __m256i a2 = _mm256_slli_epi16(a0, 1); |
| 336 | __m256i a3 = _mm256_slli_epi16(a1, 1); |
| 337 | __m256i b0 = _mm256_madd_epi16(a2, data); |
| 338 | __m256i b1 = _mm256_madd_epi16(a3, data); |
| 339 | |
| 340 | __m256i b2 = _mm256_hadd_epi32(b0, b1); |
| 341 | __m256i b3 = _mm256_sub_epi32(a8, b2); |
| 342 | __m256i b4 = _mm256_hadd_epi32(b3, b3); |
| 343 | |
| 344 | __m256i b5 = _mm256_permutevar8x32_epi32(b4, _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0)); |
| 345 | |
| 346 | return _mm256_castsi256_si128(b5); |
| 347 | } |
| 348 | |
| 349 | static etcpak_force_inline void ProcessAverages_AVX2(const __m256i d, v4i a[8] ) noexcept |
| 350 | { |
| 351 | __m256i t = _mm256_add_epi16(_mm256_mullo_epi16(d, _mm256_set1_epi16(31)), _mm256_set1_epi16(128)); |
| 352 | |
| 353 | __m256i c = _mm256_srli_epi16(_mm256_add_epi16(t, _mm256_srli_epi16(t, 8)), 8); |
| 354 | |
| 355 | __m256i c1 = _mm256_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2)); |
| 356 | __m256i diff = _mm256_sub_epi16(c, c1); |
| 357 | diff = _mm256_max_epi16(diff, _mm256_set1_epi16(-4)); |
| 358 | diff = _mm256_min_epi16(diff, _mm256_set1_epi16(3)); |
| 359 | |
| 360 | __m256i co = _mm256_add_epi16(c1, diff); |
| 361 | |
| 362 | c = _mm256_blend_epi16(co, c, 0xF0); |
| 363 | |
| 364 | __m256i a0 = _mm256_or_si256(_mm256_slli_epi16(c, 3), _mm256_srli_epi16(c, 2)); |
| 365 | |
| 366 | _mm256_store_si256((__m256i*)a[4].data(), a0); |
| 367 | |
| 368 | __m256i t0 = _mm256_add_epi16(_mm256_mullo_epi16(d, _mm256_set1_epi16(15)), _mm256_set1_epi16(128)); |
| 369 | __m256i t1 = _mm256_srli_epi16(_mm256_add_epi16(t0, _mm256_srli_epi16(t0, 8)), 8); |
| 370 | |
| 371 | __m256i t2 = _mm256_or_si256(t1, _mm256_slli_epi16(t1, 4)); |
| 372 | |
| 373 | _mm256_store_si256((__m256i*)a[0].data(), t2); |
| 374 | } |
| 375 | |
| 376 | static etcpak_force_inline uint64_t EncodeAverages_AVX2( const v4i a[8], size_t idx ) noexcept |
| 377 | { |
| 378 | uint64_t d = ( idx << 24 ); |
| 379 | size_t base = idx << 1; |
| 380 | |
| 381 | __m128i a0 = _mm_load_si128((const __m128i*)a[base].data()); |
| 382 | |
| 383 | __m128i r0, r1; |
| 384 | |
| 385 | if( ( idx & 0x2 ) == 0 ) |
| 386 | { |
| 387 | r0 = _mm_srli_epi16(a0, 4); |
| 388 | |
| 389 | __m128i a1 = _mm_unpackhi_epi64(r0, r0); |
| 390 | r1 = _mm_slli_epi16(a1, 4); |
| 391 | } |
| 392 | else |
| 393 | { |
| 394 | __m128i a1 = _mm_and_si128(a0, _mm_set1_epi16(-8)); |
| 395 | |
| 396 | r0 = _mm_unpackhi_epi64(a1, a1); |
| 397 | __m128i a2 = _mm_sub_epi16(a1, r0); |
| 398 | __m128i a3 = _mm_srai_epi16(a2, 3); |
| 399 | r1 = _mm_and_si128(a3, _mm_set1_epi16(0x07)); |
| 400 | } |
| 401 | |
| 402 | __m128i r2 = _mm_or_si128(r0, r1); |
| 403 | // do missing swap for average values |
| 404 | __m128i r3 = _mm_shufflelo_epi16(r2, _MM_SHUFFLE(3, 0, 1, 2)); |
| 405 | __m128i r4 = _mm_packus_epi16(r3, _mm_setzero_si128()); |
| 406 | d |= _mm_cvtsi128_si32(r4); |
| 407 | |
| 408 | return d; |
| 409 | } |
| 410 | |
| 411 | static etcpak_force_inline uint64_t CheckSolid_AVX2( const uint8_t* src ) noexcept |
| 412 | { |
| 413 | __m256i d0 = _mm256_loadu_si256(((__m256i*)src) + 0); |
| 414 | __m256i d1 = _mm256_loadu_si256(((__m256i*)src) + 1); |
| 415 | |
| 416 | __m256i c = _mm256_broadcastd_epi32(_mm256_castsi256_si128(d0)); |
| 417 | |
| 418 | __m256i c0 = _mm256_cmpeq_epi8(d0, c); |
| 419 | __m256i c1 = _mm256_cmpeq_epi8(d1, c); |
| 420 | |
| 421 | __m256i m = _mm256_and_si256(c0, c1); |
| 422 | |
| 423 | if (!_mm256_testc_si256(m, _mm256_set1_epi32(-1))) |
| 424 | { |
| 425 | return 0; |
| 426 | } |
| 427 | |
| 428 | return 0x02000000 | |
| 429 | ( (unsigned int)( src[0] & 0xF8 ) << 16 ) | |
| 430 | ( (unsigned int)( src[1] & 0xF8 ) << 8 ) | |
| 431 | ( (unsigned int)( src[2] & 0xF8 ) ); |
| 432 | } |
| 433 | |
| 434 | static etcpak_force_inline __m128i PrepareAverages_AVX2( v4i a[8], const uint8_t* src) noexcept |
| 435 | { |
| 436 | __m256i sum4 = Sum4_AVX2( src ); |
| 437 | |
| 438 | ProcessAverages_AVX2(Average_AVX2( sum4 ), a ); |
| 439 | |
| 440 | return CalcErrorBlock_AVX2( sum4, a); |
| 441 | } |
| 442 | |
| 443 | static etcpak_force_inline __m128i PrepareAverages_AVX2( v4i a[8], const __m256i sum4) noexcept |
| 444 | { |
| 445 | ProcessAverages_AVX2(Average_AVX2( sum4 ), a ); |
| 446 | |
| 447 | return CalcErrorBlock_AVX2( sum4, a); |
| 448 | } |
| 449 | |
| 450 | static etcpak_force_inline void FindBestFit_4x2_AVX2( uint32_t terr[2][8], uint32_t tsel[8], v4i a[8], const uint32_t offset, const uint8_t* data) noexcept |
| 451 | { |
| 452 | __m256i sel0 = _mm256_setzero_si256(); |
| 453 | __m256i sel1 = _mm256_setzero_si256(); |
| 454 | |
| 455 | for (unsigned int j = 0; j < 2; ++j) |
| 456 | { |
| 457 | unsigned int bid = offset + 1 - j; |
| 458 | |
| 459 | __m256i squareErrorSum = _mm256_setzero_si256(); |
| 460 | |
| 461 | __m128i a0 = _mm_loadl_epi64((const __m128i*)a[bid].data()); |
| 462 | __m256i a1 = _mm256_broadcastq_epi64(a0); |
| 463 | |
| 464 | // Processing one full row each iteration |
| 465 | for (size_t i = 0; i < 8; i += 4) |
| 466 | { |
| 467 | __m128i rgb = _mm_loadu_si128((const __m128i*)(data + i * 4)); |
| 468 | |
| 469 | __m256i rgb16 = _mm256_cvtepu8_epi16(rgb); |
| 470 | __m256i d = _mm256_sub_epi16(a1, rgb16); |
| 471 | |
| 472 | // The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16 |
            // This produces slightly different results, but is significantly faster
| 474 | __m256i pixel0 = _mm256_madd_epi16(d, _mm256_set_epi16(0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14)); |
| 475 | __m256i pixel1 = _mm256_packs_epi32(pixel0, pixel0); |
| 476 | __m256i pixel2 = _mm256_hadd_epi16(pixel1, pixel1); |
| 477 | __m128i pixel3 = _mm256_castsi256_si128(pixel2); |
| 478 | |
| 479 | __m128i pix0 = _mm_broadcastw_epi16(pixel3); |
| 480 | __m128i pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16)); |
| 481 | __m256i pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); |
| 482 | |
| 483 | // Processing first two pixels of the row |
| 484 | { |
| 485 | __m256i pix = _mm256_abs_epi16(pixel); |
| 486 | |
| 487 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
| 488 | // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. |
| 489 | __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0]))); |
| 490 | __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1]))); |
| 491 | |
| 492 | __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1)); |
| 493 | __m256i minError = _mm256_min_epi16(error0, error1); |
| 494 | |
                // Exploiting the symmetry of the selector table and using the sign bit
                // This produces slightly different results, but is significantly faster
| 497 | __m256i minIndex1 = _mm256_srli_epi16(pixel, 15); |
| 498 | |
| 499 | // Interleaving values so madd instruction can be used |
| 500 | __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0)); |
| 501 | __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2)); |
| 502 | |
| 503 | __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi); |
| 504 | // Squaring the minimum error to produce correct values when adding |
| 505 | __m256i squareError = _mm256_madd_epi16(minError2, minError2); |
| 506 | |
| 507 | squareErrorSum = _mm256_add_epi32(squareErrorSum, squareError); |
| 508 | |
| 509 | // Packing selector bits |
| 510 | __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i + j * 8)); |
| 511 | __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i + j * 8)); |
| 512 | |
| 513 | sel0 = _mm256_or_si256(sel0, minIndexLo2); |
| 514 | sel1 = _mm256_or_si256(sel1, minIndexHi2); |
| 515 | } |
| 516 | |
| 517 | pixel3 = _mm256_extracti128_si256(pixel2, 1); |
| 518 | pix0 = _mm_broadcastw_epi16(pixel3); |
| 519 | pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16)); |
| 520 | pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); |
| 521 | |
| 522 | // Processing second two pixels of the row |
| 523 | { |
| 524 | __m256i pix = _mm256_abs_epi16(pixel); |
| 525 | |
| 526 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
| 527 | // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. |
| 528 | __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0]))); |
| 529 | __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1]))); |
| 530 | |
| 531 | __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1)); |
| 532 | __m256i minError = _mm256_min_epi16(error0, error1); |
| 533 | |
                // Exploiting the symmetry of the selector table and using the sign bit
| 535 | __m256i minIndex1 = _mm256_srli_epi16(pixel, 15); |
| 536 | |
| 537 | // Interleaving values so madd instruction can be used |
| 538 | __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0)); |
| 539 | __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2)); |
| 540 | |
| 541 | __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi); |
| 542 | // Squaring the minimum error to produce correct values when adding |
| 543 | __m256i squareError = _mm256_madd_epi16(minError2, minError2); |
| 544 | |
| 545 | squareErrorSum = _mm256_add_epi32(squareErrorSum, squareError); |
| 546 | |
| 547 | // Packing selector bits |
| 548 | __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i + j * 8)); |
| 549 | __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i + j * 8)); |
| 550 | __m256i minIndexLo3 = _mm256_slli_epi16(minIndexLo2, 2); |
| 551 | __m256i minIndexHi3 = _mm256_slli_epi16(minIndexHi2, 2); |
| 552 | |
| 553 | sel0 = _mm256_or_si256(sel0, minIndexLo3); |
| 554 | sel1 = _mm256_or_si256(sel1, minIndexHi3); |
| 555 | } |
| 556 | } |
| 557 | |
| 558 | data += 8 * 4; |
| 559 | |
| 560 | _mm256_store_si256((__m256i*)terr[1 - j], squareErrorSum); |
| 561 | } |
| 562 | |
| 563 | // Interleave selector bits |
| 564 | __m256i minIndexLo0 = _mm256_unpacklo_epi16(sel0, sel1); |
| 565 | __m256i minIndexHi0 = _mm256_unpackhi_epi16(sel0, sel1); |
| 566 | |
| 567 | __m256i minIndexLo1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (0) | (2 << 4)); |
| 568 | __m256i minIndexHi1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (1) | (3 << 4)); |
| 569 | |
| 570 | __m256i minIndexHi2 = _mm256_slli_epi32(minIndexHi1, 1); |
| 571 | |
| 572 | __m256i sel = _mm256_or_si256(minIndexLo1, minIndexHi2); |
| 573 | |
| 574 | _mm256_store_si256((__m256i*)tsel, sel); |
| 575 | } |
| 576 | |
| 577 | static etcpak_force_inline void FindBestFit_2x4_AVX2( uint32_t terr[2][8], uint32_t tsel[8], v4i a[8], const uint32_t offset, const uint8_t* data) noexcept |
| 578 | { |
| 579 | __m256i sel0 = _mm256_setzero_si256(); |
| 580 | __m256i sel1 = _mm256_setzero_si256(); |
| 581 | |
| 582 | __m256i squareErrorSum0 = _mm256_setzero_si256(); |
| 583 | __m256i squareErrorSum1 = _mm256_setzero_si256(); |
| 584 | |
| 585 | __m128i a0 = _mm_loadl_epi64((const __m128i*)a[offset + 1].data()); |
| 586 | __m128i a1 = _mm_loadl_epi64((const __m128i*)a[offset + 0].data()); |
| 587 | |
| 588 | __m128i a2 = _mm_broadcastq_epi64(a0); |
| 589 | __m128i a3 = _mm_broadcastq_epi64(a1); |
| 590 | __m256i a4 = _mm256_insertf128_si256(_mm256_castsi128_si256(a2), a3, 1); |
| 591 | |
| 592 | // Processing one full row each iteration |
| 593 | for (size_t i = 0; i < 16; i += 4) |
| 594 | { |
| 595 | __m128i rgb = _mm_loadu_si128((const __m128i*)(data + i * 4)); |
| 596 | |
| 597 | __m256i rgb16 = _mm256_cvtepu8_epi16(rgb); |
| 598 | __m256i d = _mm256_sub_epi16(a4, rgb16); |
| 599 | |
| 600 | // The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16 |
        // This produces slightly different results, but is significantly faster
| 602 | __m256i pixel0 = _mm256_madd_epi16(d, _mm256_set_epi16(0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14)); |
| 603 | __m256i pixel1 = _mm256_packs_epi32(pixel0, pixel0); |
| 604 | __m256i pixel2 = _mm256_hadd_epi16(pixel1, pixel1); |
| 605 | __m128i pixel3 = _mm256_castsi256_si128(pixel2); |
| 606 | |
| 607 | __m128i pix0 = _mm_broadcastw_epi16(pixel3); |
| 608 | __m128i pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16)); |
| 609 | __m256i pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); |
| 610 | |
| 611 | // Processing first two pixels of the row |
| 612 | { |
| 613 | __m256i pix = _mm256_abs_epi16(pixel); |
| 614 | |
| 615 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
| 616 | // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. |
| 617 | __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0]))); |
| 618 | __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1]))); |
| 619 | |
| 620 | __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1)); |
| 621 | __m256i minError = _mm256_min_epi16(error0, error1); |
| 622 | |
            // Exploiting the symmetry of the selector table and using the sign bit
| 624 | __m256i minIndex1 = _mm256_srli_epi16(pixel, 15); |
| 625 | |
| 626 | // Interleaving values so madd instruction can be used |
| 627 | __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0)); |
| 628 | __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2)); |
| 629 | |
| 630 | __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi); |
| 631 | // Squaring the minimum error to produce correct values when adding |
| 632 | __m256i squareError = _mm256_madd_epi16(minError2, minError2); |
| 633 | |
| 634 | squareErrorSum0 = _mm256_add_epi32(squareErrorSum0, squareError); |
| 635 | |
| 636 | // Packing selector bits |
| 637 | __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i)); |
| 638 | __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i)); |
| 639 | |
| 640 | sel0 = _mm256_or_si256(sel0, minIndexLo2); |
| 641 | sel1 = _mm256_or_si256(sel1, minIndexHi2); |
| 642 | } |
| 643 | |
| 644 | pixel3 = _mm256_extracti128_si256(pixel2, 1); |
| 645 | pix0 = _mm_broadcastw_epi16(pixel3); |
| 646 | pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16)); |
| 647 | pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); |
| 648 | |
| 649 | // Processing second two pixels of the row |
| 650 | { |
| 651 | __m256i pix = _mm256_abs_epi16(pixel); |
| 652 | |
| 653 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
| 654 | // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. |
| 655 | __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0]))); |
| 656 | __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1]))); |
| 657 | |
| 658 | __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1)); |
| 659 | __m256i minError = _mm256_min_epi16(error0, error1); |
| 660 | |
            // Exploiting the symmetry of the selector table and using the sign bit
| 662 | __m256i minIndex1 = _mm256_srli_epi16(pixel, 15); |
| 663 | |
| 664 | // Interleaving values so madd instruction can be used |
| 665 | __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0)); |
| 666 | __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2)); |
| 667 | |
| 668 | __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi); |
| 669 | // Squaring the minimum error to produce correct values when adding |
| 670 | __m256i squareError = _mm256_madd_epi16(minError2, minError2); |
| 671 | |
| 672 | squareErrorSum1 = _mm256_add_epi32(squareErrorSum1, squareError); |
| 673 | |
| 674 | // Packing selector bits |
| 675 | __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i)); |
| 676 | __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i)); |
| 677 | __m256i minIndexLo3 = _mm256_slli_epi16(minIndexLo2, 2); |
| 678 | __m256i minIndexHi3 = _mm256_slli_epi16(minIndexHi2, 2); |
| 679 | |
| 680 | sel0 = _mm256_or_si256(sel0, minIndexLo3); |
| 681 | sel1 = _mm256_or_si256(sel1, minIndexHi3); |
| 682 | } |
| 683 | } |
| 684 | |
| 685 | _mm256_store_si256((__m256i*)terr[1], squareErrorSum0); |
| 686 | _mm256_store_si256((__m256i*)terr[0], squareErrorSum1); |
| 687 | |
| 688 | // Interleave selector bits |
| 689 | __m256i minIndexLo0 = _mm256_unpacklo_epi16(sel0, sel1); |
| 690 | __m256i minIndexHi0 = _mm256_unpackhi_epi16(sel0, sel1); |
| 691 | |
| 692 | __m256i minIndexLo1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (0) | (2 << 4)); |
| 693 | __m256i minIndexHi1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (1) | (3 << 4)); |
| 694 | |
| 695 | __m256i minIndexHi2 = _mm256_slli_epi32(minIndexHi1, 1); |
| 696 | |
| 697 | __m256i sel = _mm256_or_si256(minIndexLo1, minIndexHi2); |
| 698 | |
| 699 | _mm256_store_si256((__m256i*)tsel, sel); |
| 700 | } |
| 701 | |
| 702 | static etcpak_force_inline uint64_t EncodeSelectors_AVX2( uint64_t d, const uint32_t terr[2][8], const uint32_t tsel[8], const bool rotate) noexcept |
| 703 | { |
| 704 | size_t tidx[2]; |
| 705 | |
| 706 | // Get index of minimum error (terr[0] and terr[1]) |
| 707 | __m256i err0 = _mm256_load_si256((const __m256i*)terr[0]); |
| 708 | __m256i err1 = _mm256_load_si256((const __m256i*)terr[1]); |
| 709 | |
| 710 | __m256i errLo = _mm256_permute2x128_si256(err0, err1, (0) | (2 << 4)); |
| 711 | __m256i errHi = _mm256_permute2x128_si256(err0, err1, (1) | (3 << 4)); |
| 712 | |
| 713 | __m256i errMin0 = _mm256_min_epu32(errLo, errHi); |
| 714 | |
| 715 | __m256i errMin1 = _mm256_shuffle_epi32(errMin0, _MM_SHUFFLE(2, 3, 0, 1)); |
| 716 | __m256i errMin2 = _mm256_min_epu32(errMin0, errMin1); |
| 717 | |
| 718 | __m256i errMin3 = _mm256_shuffle_epi32(errMin2, _MM_SHUFFLE(1, 0, 3, 2)); |
| 719 | __m256i errMin4 = _mm256_min_epu32(errMin3, errMin2); |
| 720 | |
| 721 | __m256i errMin5 = _mm256_permute2x128_si256(errMin4, errMin4, (0) | (0 << 4)); |
| 722 | __m256i errMin6 = _mm256_permute2x128_si256(errMin4, errMin4, (1) | (1 << 4)); |
| 723 | |
| 724 | __m256i errMask0 = _mm256_cmpeq_epi32(errMin5, err0); |
| 725 | __m256i errMask1 = _mm256_cmpeq_epi32(errMin6, err1); |
| 726 | |
| 727 | uint32_t mask0 = _mm256_movemask_epi8(errMask0); |
| 728 | uint32_t mask1 = _mm256_movemask_epi8(errMask1); |
| 729 | |
| 730 | tidx[0] = _bit_scan_forward(mask0) >> 2; |
| 731 | tidx[1] = _bit_scan_forward(mask1) >> 2; |
| 732 | |
| 733 | d |= tidx[0] << 26; |
| 734 | d |= tidx[1] << 29; |
| 735 | |
| 736 | unsigned int t0 = tsel[tidx[0]]; |
| 737 | unsigned int t1 = tsel[tidx[1]]; |
| 738 | |
| 739 | if (!rotate) |
| 740 | { |
| 741 | t0 &= 0xFF00FF00; |
| 742 | t1 &= 0x00FF00FF; |
| 743 | } |
| 744 | else |
| 745 | { |
| 746 | t0 &= 0xCCCCCCCC; |
| 747 | t1 &= 0x33333333; |
| 748 | } |
| 749 | |
| 750 | // Flip selectors from sign bit |
| 751 | unsigned int t2 = (t0 | t1) ^ 0xFFFF0000; |
| 752 | |
| 753 | return d | static_cast<uint64_t>(_bswap(t2)) << 32; |
| 754 | } |
| 755 | |
| 756 | static etcpak_force_inline __m128i r6g7b6_AVX2(__m128 cof, __m128 chf, __m128 cvf) noexcept |
| 757 | { |
| 758 | __m128i co = _mm_cvttps_epi32(cof); |
| 759 | __m128i ch = _mm_cvttps_epi32(chf); |
| 760 | __m128i cv = _mm_cvttps_epi32(cvf); |
| 761 | |
| 762 | __m128i coh = _mm_packus_epi32(co, ch); |
| 763 | __m128i cv0 = _mm_packus_epi32(cv, _mm_setzero_si128()); |
| 764 | |
| 765 | __m256i cohv0 = _mm256_inserti128_si256(_mm256_castsi128_si256(coh), cv0, 1); |
| 766 | __m256i cohv1 = _mm256_min_epu16(cohv0, _mm256_set1_epi16(1023)); |
| 767 | |
| 768 | __m256i cohv2 = _mm256_sub_epi16(cohv1, _mm256_set1_epi16(15)); |
| 769 | __m256i cohv3 = _mm256_srai_epi16(cohv2, 1); |
| 770 | |
| 771 | __m256i cohvrb0 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(11)); |
| 772 | __m256i cohvrb1 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(4)); |
| 773 | __m256i cohvg0 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(9)); |
| 774 | __m256i cohvg1 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(6)); |
| 775 | |
| 776 | __m256i cohvrb2 = _mm256_srai_epi16(cohvrb0, 7); |
| 777 | __m256i cohvrb3 = _mm256_srai_epi16(cohvrb1, 7); |
| 778 | __m256i cohvg2 = _mm256_srai_epi16(cohvg0, 8); |
| 779 | __m256i cohvg3 = _mm256_srai_epi16(cohvg1, 8); |
| 780 | |
| 781 | __m256i cohvrb4 = _mm256_sub_epi16(cohvrb0, cohvrb2); |
| 782 | __m256i cohvrb5 = _mm256_sub_epi16(cohvrb4, cohvrb3); |
| 783 | __m256i cohvg4 = _mm256_sub_epi16(cohvg0, cohvg2); |
| 784 | __m256i cohvg5 = _mm256_sub_epi16(cohvg4, cohvg3); |
| 785 | |
| 786 | __m256i cohvrb6 = _mm256_srai_epi16(cohvrb5, 3); |
| 787 | __m256i cohvg6 = _mm256_srai_epi16(cohvg5, 2); |
| 788 | |
| 789 | __m256i cohv4 = _mm256_blend_epi16(cohvg6, cohvrb6, 0x55); |
| 790 | |
| 791 | __m128i cohv5 = _mm_packus_epi16(_mm256_castsi256_si128(cohv4), _mm256_extracti128_si256(cohv4, 1)); |
| 792 | return _mm_shuffle_epi8(cohv5, _mm_setr_epi8(6, 5, 4, -1, 2, 1, 0, -1, 10, 9, 8, -1, -1, -1, -1, -1)); |
| 793 | } |
| 794 | |
| 795 | static etcpak_force_inline Plane Planar_AVX2( const Channels& ch, uint8_t& mode, bool useHeuristics ) |
| 796 | { |
| 797 | __m128i t0 = _mm_sad_epu8( ch.r8, _mm_setzero_si128() ); |
| 798 | __m128i t1 = _mm_sad_epu8( ch.g8, _mm_setzero_si128() ); |
| 799 | __m128i t2 = _mm_sad_epu8( ch.b8, _mm_setzero_si128() ); |
| 800 | |
| 801 | __m128i r8s = _mm_shuffle_epi8( ch.r8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) ); |
| 802 | __m128i g8s = _mm_shuffle_epi8( ch.g8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) ); |
| 803 | __m128i b8s = _mm_shuffle_epi8( ch.b8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) ); |
| 804 | |
| 805 | __m128i s0 = _mm_sad_epu8( r8s, _mm_setzero_si128() ); |
| 806 | __m128i s1 = _mm_sad_epu8( g8s, _mm_setzero_si128() ); |
| 807 | __m128i s2 = _mm_sad_epu8( b8s, _mm_setzero_si128() ); |
| 808 | |
| 809 | __m256i sr0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t0 ), s0, 1 ); |
| 810 | __m256i sg0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t1 ), s1, 1 ); |
| 811 | __m256i sb0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t2 ), s2, 1 ); |
| 812 | |
| 813 | __m256i sr1 = _mm256_slli_epi64( sr0, 32 ); |
| 814 | __m256i sg1 = _mm256_slli_epi64( sg0, 16 ); |
| 815 | |
| 816 | __m256i srb = _mm256_or_si256( sr1, sb0 ); |
| 817 | __m256i srgb = _mm256_or_si256( srb, sg1 ); |
| 818 | |
| 819 | if( mode != ModePlanar && useHeuristics ) |
| 820 | { |
| 821 | Plane plane; |
| 822 | plane.sum4 = _mm256_permute4x64_epi64( srgb, _MM_SHUFFLE( 2, 3, 0, 1 ) ); |
| 823 | return plane; |
| 824 | } |
| 825 | |
| 826 | __m128i t3 = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( t0 ), _mm_castsi128_ps( t1 ), _MM_SHUFFLE( 2, 0, 2, 0 ) ) ); |
| 827 | __m128i t4 = _mm_shuffle_epi32( t2, _MM_SHUFFLE( 3, 1, 2, 0 ) ); |
| 828 | __m128i t5 = _mm_hadd_epi32( t3, t4 ); |
| 829 | __m128i t6 = _mm_shuffle_epi32( t5, _MM_SHUFFLE( 1, 1, 1, 1 ) ); |
| 830 | __m128i t7 = _mm_shuffle_epi32( t5, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
| 831 | |
| 832 | __m256i sr = _mm256_broadcastw_epi16( t5 ); |
| 833 | __m256i sg = _mm256_broadcastw_epi16( t6 ); |
| 834 | __m256i sb = _mm256_broadcastw_epi16( t7 ); |
| 835 | |
| 836 | __m256i r08 = _mm256_cvtepu8_epi16( ch.r8 ); |
| 837 | __m256i g08 = _mm256_cvtepu8_epi16( ch.g8 ); |
| 838 | __m256i b08 = _mm256_cvtepu8_epi16( ch.b8 ); |
| 839 | |
| 840 | __m256i r16 = _mm256_slli_epi16( r08, 4 ); |
| 841 | __m256i g16 = _mm256_slli_epi16( g08, 4 ); |
| 842 | __m256i b16 = _mm256_slli_epi16( b08, 4 ); |
| 843 | |
| 844 | __m256i difR0 = _mm256_sub_epi16( r16, sr ); |
| 845 | __m256i difG0 = _mm256_sub_epi16( g16, sg ); |
| 846 | __m256i difB0 = _mm256_sub_epi16( b16, sb ); |
| 847 | |
| 848 | __m256i difRyz = _mm256_madd_epi16( difR0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) ); |
| 849 | __m256i difGyz = _mm256_madd_epi16( difG0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) ); |
| 850 | __m256i difByz = _mm256_madd_epi16( difB0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) ); |
| 851 | |
| 852 | __m256i difRxz = _mm256_madd_epi16( difR0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) ); |
| 853 | __m256i difGxz = _mm256_madd_epi16( difG0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) ); |
| 854 | __m256i difBxz = _mm256_madd_epi16( difB0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) ); |
| 855 | |
| 856 | __m256i difRGyz = _mm256_hadd_epi32( difRyz, difGyz ); |
| 857 | __m256i difByzxz = _mm256_hadd_epi32( difByz, difBxz ); |
| 858 | |
| 859 | __m256i difRGxz = _mm256_hadd_epi32( difRxz, difGxz ); |
| 860 | |
| 861 | __m128i sumRGyz = _mm_add_epi32( _mm256_castsi256_si128( difRGyz ), _mm256_extracti128_si256( difRGyz, 1 ) ); |
| 862 | __m128i sumByzxz = _mm_add_epi32( _mm256_castsi256_si128( difByzxz ), _mm256_extracti128_si256( difByzxz, 1 ) ); |
| 863 | __m128i sumRGxz = _mm_add_epi32( _mm256_castsi256_si128( difRGxz ), _mm256_extracti128_si256( difRGxz, 1 ) ); |
| 864 | |
| 865 | __m128i sumRGByz = _mm_hadd_epi32( sumRGyz, sumByzxz ); |
| 866 | __m128i sumRGByzxz = _mm_hadd_epi32( sumRGxz, sumByzxz ); |
| 867 | |
| 868 | __m128i sumRGBxz = _mm_shuffle_epi32( sumRGByzxz, _MM_SHUFFLE( 2, 3, 1, 0 ) ); |
| 869 | |
| 870 | __m128 sumRGByzf = _mm_cvtepi32_ps( sumRGByz ); |
| 871 | __m128 sumRGBxzf = _mm_cvtepi32_ps( sumRGBxz ); |
| 872 | |
| 873 | const float value = ( 255 * 255 * 8.0f + 85 * 85 * 8.0f ) * 16.0f; |
| 874 | |
| 875 | __m128 scale = _mm_set1_ps( -4.0f / value ); |
| 876 | |
| 877 | __m128 af = _mm_mul_ps( sumRGBxzf, scale ); |
| 878 | __m128 bf = _mm_mul_ps( sumRGByzf, scale ); |
| 879 | |
| 880 | __m128 df = _mm_mul_ps( _mm_cvtepi32_ps( t5 ), _mm_set1_ps( 4.0f / 16.0f ) ); |
| 881 | |
| 882 | // calculating the three colors RGBO, RGBH, and RGBV. RGB = df - af * x - bf * y; |
| 883 | __m128 cof0 = _mm_fnmadd_ps( af, _mm_set1_ps( -255.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( -255.0f ), df ) ); |
| 884 | __m128 chf0 = _mm_fnmadd_ps( af, _mm_set1_ps( 425.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( -255.0f ), df ) ); |
| 885 | __m128 cvf0 = _mm_fnmadd_ps( af, _mm_set1_ps( -255.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( 425.0f ), df ) ); |
| 886 | |
| 887 | // convert to r6g7b6 |
| 888 | __m128i cohv = r6g7b6_AVX2( cof0, chf0, cvf0 ); |
| 889 | |
| 890 | uint64_t rgbho = _mm_extract_epi64( cohv, 0 ); |
| 891 | uint32_t rgbv0 = _mm_extract_epi32( cohv, 2 ); |
| 892 | |
| 893 | // Error calculation |
| 894 | uint64_t error = 0; |
| 895 | if( !useHeuristics ) |
| 896 | { |
| 897 | auto ro0 = ( rgbho >> 48 ) & 0x3F; |
| 898 | auto go0 = ( rgbho >> 40 ) & 0x7F; |
| 899 | auto bo0 = ( rgbho >> 32 ) & 0x3F; |
| 900 | auto ro1 = ( ro0 >> 4 ) | ( ro0 << 2 ); |
| 901 | auto go1 = ( go0 >> 6 ) | ( go0 << 1 ); |
| 902 | auto bo1 = ( bo0 >> 4 ) | ( bo0 << 2 ); |
| 903 | auto ro2 = ( ro1 << 2 ) + 2; |
| 904 | auto go2 = ( go1 << 2 ) + 2; |
| 905 | auto bo2 = ( bo1 << 2 ) + 2; |
| 906 | |
| 907 | __m256i ro3 = _mm256_set1_epi16( ro2 ); |
| 908 | __m256i go3 = _mm256_set1_epi16( go2 ); |
| 909 | __m256i bo3 = _mm256_set1_epi16( bo2 ); |
| 910 | |
| 911 | auto rh0 = ( rgbho >> 16 ) & 0x3F; |
| 912 | auto gh0 = ( rgbho >> 8 ) & 0x7F; |
| 913 | auto bh0 = ( rgbho >> 0 ) & 0x3F; |
| 914 | auto rh1 = ( rh0 >> 4 ) | ( rh0 << 2 ); |
| 915 | auto gh1 = ( gh0 >> 6 ) | ( gh0 << 1 ); |
| 916 | auto bh1 = ( bh0 >> 4 ) | ( bh0 << 2 ); |
| 917 | |
| 918 | auto rh2 = rh1 - ro1; |
| 919 | auto gh2 = gh1 - go1; |
| 920 | auto bh2 = bh1 - bo1; |
| 921 | |
| 922 | __m256i rh3 = _mm256_set1_epi16( rh2 ); |
| 923 | __m256i gh3 = _mm256_set1_epi16( gh2 ); |
| 924 | __m256i bh3 = _mm256_set1_epi16( bh2 ); |
| 925 | |
| 926 | auto rv0 = ( rgbv0 >> 16 ) & 0x3F; |
| 927 | auto gv0 = ( rgbv0 >> 8 ) & 0x7F; |
| 928 | auto bv0 = ( rgbv0 >> 0 ) & 0x3F; |
| 929 | auto rv1 = ( rv0 >> 4 ) | ( rv0 << 2 ); |
| 930 | auto gv1 = ( gv0 >> 6 ) | ( gv0 << 1 ); |
| 931 | auto bv1 = ( bv0 >> 4 ) | ( bv0 << 2 ); |
| 932 | |
| 933 | auto rv2 = rv1 - ro1; |
| 934 | auto gv2 = gv1 - go1; |
| 935 | auto bv2 = bv1 - bo1; |
| 936 | |
| 937 | __m256i rv3 = _mm256_set1_epi16( rv2 ); |
| 938 | __m256i gv3 = _mm256_set1_epi16( gv2 ); |
| 939 | __m256i bv3 = _mm256_set1_epi16( bv2 ); |
| 940 | |
| 941 | __m256i x = _mm256_set_epi16( 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 ); |
| 942 | |
| 943 | __m256i rh4 = _mm256_mullo_epi16( rh3, x ); |
| 944 | __m256i gh4 = _mm256_mullo_epi16( gh3, x ); |
| 945 | __m256i bh4 = _mm256_mullo_epi16( bh3, x ); |
| 946 | |
| 947 | __m256i y = _mm256_set_epi16( 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0 ); |
| 948 | |
| 949 | __m256i rv4 = _mm256_mullo_epi16( rv3, y ); |
| 950 | __m256i gv4 = _mm256_mullo_epi16( gv3, y ); |
| 951 | __m256i bv4 = _mm256_mullo_epi16( bv3, y ); |
| 952 | |
| 953 | __m256i rxy = _mm256_add_epi16( rh4, rv4 ); |
| 954 | __m256i gxy = _mm256_add_epi16( gh4, gv4 ); |
| 955 | __m256i bxy = _mm256_add_epi16( bh4, bv4 ); |
| 956 | |
| 957 | __m256i rp0 = _mm256_add_epi16( rxy, ro3 ); |
| 958 | __m256i gp0 = _mm256_add_epi16( gxy, go3 ); |
| 959 | __m256i bp0 = _mm256_add_epi16( bxy, bo3 ); |
| 960 | |
| 961 | __m256i rp1 = _mm256_srai_epi16( rp0, 2 ); |
| 962 | __m256i gp1 = _mm256_srai_epi16( gp0, 2 ); |
| 963 | __m256i bp1 = _mm256_srai_epi16( bp0, 2 ); |
| 964 | |
| 965 | __m256i rp2 = _mm256_max_epi16( _mm256_min_epi16( rp1, _mm256_set1_epi16( 255 ) ), _mm256_setzero_si256() ); |
| 966 | __m256i gp2 = _mm256_max_epi16( _mm256_min_epi16( gp1, _mm256_set1_epi16( 255 ) ), _mm256_setzero_si256() ); |
| 967 | __m256i bp2 = _mm256_max_epi16( _mm256_min_epi16( bp1, _mm256_set1_epi16( 255 ) ), _mm256_setzero_si256() ); |
| 968 | |
| 969 | __m256i rdif = _mm256_sub_epi16( r08, rp2 ); |
| 970 | __m256i gdif = _mm256_sub_epi16( g08, gp2 ); |
| 971 | __m256i bdif = _mm256_sub_epi16( b08, bp2 ); |
| 972 | |
| 973 | __m256i rerr = _mm256_mullo_epi16( rdif, _mm256_set1_epi16( 38 ) ); |
| 974 | __m256i gerr = _mm256_mullo_epi16( gdif, _mm256_set1_epi16( 76 ) ); |
| 975 | __m256i berr = _mm256_mullo_epi16( bdif, _mm256_set1_epi16( 14 ) ); |
| 976 | |
| 977 | __m256i sum0 = _mm256_add_epi16( rerr, gerr ); |
| 978 | __m256i sum1 = _mm256_add_epi16( sum0, berr ); |
| 979 | |
| 980 | __m256i sum2 = _mm256_madd_epi16( sum1, sum1 ); |
| 981 | |
| 982 | __m128i sum3 = _mm_add_epi32( _mm256_castsi256_si128( sum2 ), _mm256_extracti128_si256( sum2, 1 ) ); |
| 983 | |
| 984 | uint32_t err0 = _mm_extract_epi32( sum3, 0 ); |
| 985 | uint32_t err1 = _mm_extract_epi32( sum3, 1 ); |
| 986 | uint32_t err2 = _mm_extract_epi32( sum3, 2 ); |
| 987 | uint32_t err3 = _mm_extract_epi32( sum3, 3 ); |
| 988 | |
| 989 | error = err0 + err1 + err2 + err3; |
| 990 | } |
| 991 | /**/ |
| 992 | |
| 993 | uint32_t rgbv = ( rgbv0 & 0x3F ) | ( ( rgbv0 >> 2 ) & 0x1FC0 ) | ( ( rgbv0 >> 3 ) & 0x7E000 ); |
| 994 | uint64_t rgbho0_ = ( rgbho & 0x3F0000003F ) | ( ( rgbho >> 2 ) & 0x1FC000001FC0 ) | ( ( rgbho >> 3 ) & 0x7E0000007E000 ); |
| 995 | uint64_t rgbho0 = ( rgbho0_ & 0x7FFFF ) | ( ( rgbho0_ >> 13 ) & 0x3FFFF80000 ); |
| 996 | |
| 997 | uint32_t hi = rgbv | ((rgbho0 & 0x1FFF) << 19); |
| 998 | rgbho0 >>= 13; |
| 999 | uint32_t lo = ( rgbho0 & 0x1 ) | ( ( rgbho0 & 0x1FE ) << 1 ) | ( ( rgbho0 & 0x600 ) << 2 ) | ( ( rgbho0 & 0x3F800 ) << 5 ) | ( ( rgbho0 & 0x1FC0000 ) << 6 ); |
| 1000 | |
| 1001 | uint32_t idx = ( ( rgbho >> 33 ) & 0xF ) | ( ( rgbho >> 41 ) & 0x10 ) | ( ( rgbho >> 48 ) & 0x20 ); |
| 1002 | lo |= g_flags[idx]; |
| 1003 | uint64_t result = static_cast<uint32_t>(_bswap(lo)); |
| 1004 | result |= static_cast<uint64_t>(static_cast<uint32_t>(_bswap(hi))) << 32; |
| 1005 | |
| 1006 | Plane plane; |
| 1007 | |
| 1008 | plane.plane = result; |
| 1009 | if( useHeuristics ) |
| 1010 | { |
| 1011 | plane.error = 0; |
| 1012 | mode = ModePlanar; |
| 1013 | } |
| 1014 | else |
| 1015 | { |
| 1016 | plane.error = error; |
| 1017 | } |
| 1018 | plane.sum4 = _mm256_permute4x64_epi64(srgb, _MM_SHUFFLE(2, 3, 0, 1)); |
| 1019 | |
| 1020 | return plane; |
| 1021 | } |
| 1022 | |
| 1023 | static etcpak_force_inline uint64_t EncodeSelectors_AVX2( uint64_t d, const uint32_t terr[2][8], const uint32_t tsel[8], const bool rotate, const uint64_t value, const uint32_t error) noexcept |
| 1024 | { |
| 1025 | size_t tidx[2]; |
| 1026 | |
| 1027 | // Get index of minimum error (terr[0] and terr[1]) |
| 1028 | __m256i err0 = _mm256_load_si256((const __m256i*)terr[0]); |
| 1029 | __m256i err1 = _mm256_load_si256((const __m256i*)terr[1]); |
| 1030 | |
| 1031 | __m256i errLo = _mm256_permute2x128_si256(err0, err1, (0) | (2 << 4)); |
| 1032 | __m256i errHi = _mm256_permute2x128_si256(err0, err1, (1) | (3 << 4)); |
| 1033 | |
| 1034 | __m256i errMin0 = _mm256_min_epu32(errLo, errHi); |
| 1035 | |
| 1036 | __m256i errMin1 = _mm256_shuffle_epi32(errMin0, _MM_SHUFFLE(2, 3, 0, 1)); |
| 1037 | __m256i errMin2 = _mm256_min_epu32(errMin0, errMin1); |
| 1038 | |
| 1039 | __m256i errMin3 = _mm256_shuffle_epi32(errMin2, _MM_SHUFFLE(1, 0, 3, 2)); |
| 1040 | __m256i errMin4 = _mm256_min_epu32(errMin3, errMin2); |
| 1041 | |
| 1042 | __m256i errMin5 = _mm256_permute2x128_si256(errMin4, errMin4, (0) | (0 << 4)); |
| 1043 | __m256i errMin6 = _mm256_permute2x128_si256(errMin4, errMin4, (1) | (1 << 4)); |
| 1044 | |
| 1045 | __m256i errMask0 = _mm256_cmpeq_epi32(errMin5, err0); |
| 1046 | __m256i errMask1 = _mm256_cmpeq_epi32(errMin6, err1); |
| 1047 | |
| 1048 | uint32_t mask0 = _mm256_movemask_epi8(errMask0); |
| 1049 | uint32_t mask1 = _mm256_movemask_epi8(errMask1); |
| 1050 | |
| 1051 | tidx[0] = _bit_scan_forward(mask0) >> 2; |
| 1052 | tidx[1] = _bit_scan_forward(mask1) >> 2; |
| 1053 | |
| 1054 | if ((terr[0][tidx[0]] + terr[1][tidx[1]]) >= error) |
| 1055 | { |
| 1056 | return value; |
| 1057 | } |
| 1058 | |
| 1059 | d |= tidx[0] << 26; |
| 1060 | d |= tidx[1] << 29; |
| 1061 | |
| 1062 | unsigned int t0 = tsel[tidx[0]]; |
| 1063 | unsigned int t1 = tsel[tidx[1]]; |
| 1064 | |
| 1065 | if (!rotate) |
| 1066 | { |
| 1067 | t0 &= 0xFF00FF00; |
| 1068 | t1 &= 0x00FF00FF; |
| 1069 | } |
| 1070 | else |
| 1071 | { |
| 1072 | t0 &= 0xCCCCCCCC; |
| 1073 | t1 &= 0x33333333; |
| 1074 | } |
| 1075 | |
| 1076 | // Flip selectors from sign bit |
| 1077 | unsigned int t2 = (t0 | t1) ^ 0xFFFF0000; |
| 1078 | |
| 1079 | return d | static_cast<uint64_t>(_bswap(t2)) << 32; |
| 1080 | } |
| 1081 | |
| 1082 | #endif |
| 1083 | |
| 1084 | static etcpak_force_inline void Average( const uint8_t* data, v4i* a ) |
| 1085 | { |
| 1086 | #ifdef __SSE4_1__ |
| 1087 | __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0); |
| 1088 | __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1); |
| 1089 | __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2); |
| 1090 | __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3); |
| 1091 | |
| 1092 | __m128i d0l = _mm_unpacklo_epi8(d0, _mm_setzero_si128()); |
| 1093 | __m128i d0h = _mm_unpackhi_epi8(d0, _mm_setzero_si128()); |
| 1094 | __m128i d1l = _mm_unpacklo_epi8(d1, _mm_setzero_si128()); |
| 1095 | __m128i d1h = _mm_unpackhi_epi8(d1, _mm_setzero_si128()); |
| 1096 | __m128i d2l = _mm_unpacklo_epi8(d2, _mm_setzero_si128()); |
| 1097 | __m128i d2h = _mm_unpackhi_epi8(d2, _mm_setzero_si128()); |
| 1098 | __m128i d3l = _mm_unpacklo_epi8(d3, _mm_setzero_si128()); |
| 1099 | __m128i d3h = _mm_unpackhi_epi8(d3, _mm_setzero_si128()); |
| 1100 | |
| 1101 | __m128i sum0 = _mm_add_epi16(d0l, d1l); |
| 1102 | __m128i sum1 = _mm_add_epi16(d0h, d1h); |
| 1103 | __m128i sum2 = _mm_add_epi16(d2l, d3l); |
| 1104 | __m128i sum3 = _mm_add_epi16(d2h, d3h); |
| 1105 | |
| 1106 | __m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128()); |
| 1107 | __m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128()); |
| 1108 | __m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128()); |
| 1109 | __m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128()); |
| 1110 | __m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128()); |
| 1111 | __m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128()); |
| 1112 | __m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128()); |
| 1113 | __m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128()); |
| 1114 | |
| 1115 | __m128i b0 = _mm_add_epi32(sum0l, sum0h); |
| 1116 | __m128i b1 = _mm_add_epi32(sum1l, sum1h); |
| 1117 | __m128i b2 = _mm_add_epi32(sum2l, sum2h); |
| 1118 | __m128i b3 = _mm_add_epi32(sum3l, sum3h); |
| 1119 | |
| 1120 | __m128i a0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b2, b3), _mm_set1_epi32(4)), 3); |
| 1121 | __m128i a1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b1), _mm_set1_epi32(4)), 3); |
| 1122 | __m128i a2 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b1, b3), _mm_set1_epi32(4)), 3); |
| 1123 | __m128i a3 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b2), _mm_set1_epi32(4)), 3); |
| 1124 | |
| 1125 | _mm_storeu_si128((__m128i*)&a[0], _mm_packus_epi32(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a1, _MM_SHUFFLE(3, 0, 1, 2)))); |
| 1126 | _mm_storeu_si128((__m128i*)&a[2], _mm_packus_epi32(_mm_shuffle_epi32(a2, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a3, _MM_SHUFFLE(3, 0, 1, 2)))); |
| 1127 | #elif defined __ARM_NEON |
| 1128 | uint8x16x2_t t0 = vzipq_u8(vld1q_u8(data + 0), uint8x16_t()); |
| 1129 | uint8x16x2_t t1 = vzipq_u8(vld1q_u8(data + 16), uint8x16_t()); |
| 1130 | uint8x16x2_t t2 = vzipq_u8(vld1q_u8(data + 32), uint8x16_t()); |
| 1131 | uint8x16x2_t t3 = vzipq_u8(vld1q_u8(data + 48), uint8x16_t()); |
| 1132 | |
| 1133 | uint16x8x2_t d0 = { vreinterpretq_u16_u8(t0.val[0]), vreinterpretq_u16_u8(t0.val[1]) }; |
| 1134 | uint16x8x2_t d1 = { vreinterpretq_u16_u8(t1.val[0]), vreinterpretq_u16_u8(t1.val[1]) }; |
| 1135 | uint16x8x2_t d2 = { vreinterpretq_u16_u8(t2.val[0]), vreinterpretq_u16_u8(t2.val[1]) }; |
| 1136 | uint16x8x2_t d3 = { vreinterpretq_u16_u8(t3.val[0]), vreinterpretq_u16_u8(t3.val[1]) }; |
| 1137 | |
| 1138 | uint16x8x2_t s0 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[0] ), vreinterpretq_s16_u16( d1.val[0] ) ) ), uint16x8_t()); |
| 1139 | uint16x8x2_t s1 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[1] ), vreinterpretq_s16_u16( d1.val[1] ) ) ), uint16x8_t()); |
| 1140 | uint16x8x2_t s2 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[0] ), vreinterpretq_s16_u16( d3.val[0] ) ) ), uint16x8_t()); |
| 1141 | uint16x8x2_t s3 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[1] ), vreinterpretq_s16_u16( d3.val[1] ) ) ), uint16x8_t()); |
| 1142 | |
| 1143 | uint32x4x2_t sum0 = { vreinterpretq_u32_u16(s0.val[0]), vreinterpretq_u32_u16(s0.val[1]) }; |
| 1144 | uint32x4x2_t sum1 = { vreinterpretq_u32_u16(s1.val[0]), vreinterpretq_u32_u16(s1.val[1]) }; |
| 1145 | uint32x4x2_t sum2 = { vreinterpretq_u32_u16(s2.val[0]), vreinterpretq_u32_u16(s2.val[1]) }; |
| 1146 | uint32x4x2_t sum3 = { vreinterpretq_u32_u16(s3.val[0]), vreinterpretq_u32_u16(s3.val[1]) }; |
| 1147 | |
| 1148 | uint32x4_t b0 = vaddq_u32(sum0.val[0], sum0.val[1]); |
| 1149 | uint32x4_t b1 = vaddq_u32(sum1.val[0], sum1.val[1]); |
| 1150 | uint32x4_t b2 = vaddq_u32(sum2.val[0], sum2.val[1]); |
| 1151 | uint32x4_t b3 = vaddq_u32(sum3.val[0], sum3.val[1]); |
| 1152 | |
| 1153 | uint32x4_t a0 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b2, b3), vdupq_n_u32(4)), 3); |
| 1154 | uint32x4_t a1 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b0, b1), vdupq_n_u32(4)), 3); |
| 1155 | uint32x4_t a2 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b1, b3), vdupq_n_u32(4)), 3); |
| 1156 | uint32x4_t a3 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b0, b2), vdupq_n_u32(4)), 3); |
| 1157 | |
| 1158 | uint16x8_t o0 = vcombine_u16(vqmovun_s32(vreinterpretq_s32_u32( a0 )), vqmovun_s32(vreinterpretq_s32_u32( a1 ))); |
| 1159 | uint16x8_t o1 = vcombine_u16(vqmovun_s32(vreinterpretq_s32_u32( a2 )), vqmovun_s32(vreinterpretq_s32_u32( a3 ))); |
| 1160 | |
| 1161 | a[0] = v4i{o0[2], o0[1], o0[0], 0}; |
| 1162 | a[1] = v4i{o0[6], o0[5], o0[4], 0}; |
| 1163 | a[2] = v4i{o1[2], o1[1], o1[0], 0}; |
| 1164 | a[3] = v4i{o1[6], o1[5], o1[4], 0}; |
| 1165 | #else |
| 1166 | uint32_t r[4]; |
| 1167 | uint32_t g[4]; |
| 1168 | uint32_t b[4]; |
| 1169 | |
| 1170 | memset(r, 0, sizeof(r)); |
| 1171 | memset(g, 0, sizeof(g)); |
| 1172 | memset(b, 0, sizeof(b)); |
| 1173 | |
| 1174 | for( int j=0; j<4; j++ ) |
| 1175 | { |
| 1176 | for( int i=0; i<4; i++ ) |
| 1177 | { |
| 1178 | int index = (j & 2) + (i >> 1); |
| 1179 | b[index] += *data++; |
| 1180 | g[index] += *data++; |
| 1181 | r[index] += *data++; |
| 1182 | data++; |
| 1183 | } |
| 1184 | } |
| 1185 | |
| 1186 | a[0] = v4i{ uint16_t( (r[2] + r[3] + 4) / 8 ), uint16_t( (g[2] + g[3] + 4) / 8 ), uint16_t( (b[2] + b[3] + 4) / 8 ), 0}; |
| 1187 | a[1] = v4i{ uint16_t( (r[0] + r[1] + 4) / 8 ), uint16_t( (g[0] + g[1] + 4) / 8 ), uint16_t( (b[0] + b[1] + 4) / 8 ), 0}; |
| 1188 | a[2] = v4i{ uint16_t( (r[1] + r[3] + 4) / 8 ), uint16_t( (g[1] + g[3] + 4) / 8 ), uint16_t( (b[1] + b[3] + 4) / 8 ), 0}; |
| 1189 | a[3] = v4i{ uint16_t( (r[0] + r[2] + 4) / 8 ), uint16_t( (g[0] + g[2] + 4) / 8 ), uint16_t( (b[0] + b[2] + 4) / 8 ), 0}; |
| 1190 | #endif |
| 1191 | } |
| 1192 | |
| 1193 | static etcpak_force_inline void CalcErrorBlock( const uint8_t* data, unsigned int err[4][4] ) |
| 1194 | { |
| 1195 | #ifdef __SSE4_1__ |
| 1196 | __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0); |
| 1197 | __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1); |
| 1198 | __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2); |
| 1199 | __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3); |
| 1200 | |
| 1201 | __m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF)); |
| 1202 | __m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF)); |
| 1203 | __m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF)); |
| 1204 | __m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF)); |
| 1205 | |
| 1206 | __m128i d0l = _mm_unpacklo_epi8(dm0, _mm_setzero_si128()); |
| 1207 | __m128i d0h = _mm_unpackhi_epi8(dm0, _mm_setzero_si128()); |
| 1208 | __m128i d1l = _mm_unpacklo_epi8(dm1, _mm_setzero_si128()); |
| 1209 | __m128i d1h = _mm_unpackhi_epi8(dm1, _mm_setzero_si128()); |
| 1210 | __m128i d2l = _mm_unpacklo_epi8(dm2, _mm_setzero_si128()); |
| 1211 | __m128i d2h = _mm_unpackhi_epi8(dm2, _mm_setzero_si128()); |
| 1212 | __m128i d3l = _mm_unpacklo_epi8(dm3, _mm_setzero_si128()); |
| 1213 | __m128i d3h = _mm_unpackhi_epi8(dm3, _mm_setzero_si128()); |
| 1214 | |
| 1215 | __m128i sum0 = _mm_add_epi16(d0l, d1l); |
| 1216 | __m128i sum1 = _mm_add_epi16(d0h, d1h); |
| 1217 | __m128i sum2 = _mm_add_epi16(d2l, d3l); |
| 1218 | __m128i sum3 = _mm_add_epi16(d2h, d3h); |
| 1219 | |
| 1220 | __m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128()); |
| 1221 | __m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128()); |
| 1222 | __m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128()); |
| 1223 | __m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128()); |
| 1224 | __m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128()); |
| 1225 | __m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128()); |
| 1226 | __m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128()); |
| 1227 | __m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128()); |
| 1228 | |
| 1229 | __m128i b0 = _mm_add_epi32(sum0l, sum0h); |
| 1230 | __m128i b1 = _mm_add_epi32(sum1l, sum1h); |
| 1231 | __m128i b2 = _mm_add_epi32(sum2l, sum2h); |
| 1232 | __m128i b3 = _mm_add_epi32(sum3l, sum3h); |
| 1233 | |
| 1234 | __m128i a0 = _mm_add_epi32(b2, b3); |
| 1235 | __m128i a1 = _mm_add_epi32(b0, b1); |
| 1236 | __m128i a2 = _mm_add_epi32(b1, b3); |
| 1237 | __m128i a3 = _mm_add_epi32(b0, b2); |
| 1238 | |
| 1239 | _mm_storeu_si128((__m128i*)&err[0], a0); |
| 1240 | _mm_storeu_si128((__m128i*)&err[1], a1); |
| 1241 | _mm_storeu_si128((__m128i*)&err[2], a2); |
| 1242 | _mm_storeu_si128((__m128i*)&err[3], a3); |
| 1243 | #elif defined __ARM_NEON |
| 1244 | uint8x16x2_t t0 = vzipq_u8(vld1q_u8(data + 0), uint8x16_t()); |
| 1245 | uint8x16x2_t t1 = vzipq_u8(vld1q_u8(data + 16), uint8x16_t()); |
| 1246 | uint8x16x2_t t2 = vzipq_u8(vld1q_u8(data + 32), uint8x16_t()); |
| 1247 | uint8x16x2_t t3 = vzipq_u8(vld1q_u8(data + 48), uint8x16_t()); |
| 1248 | |
| 1249 | uint16x8x2_t d0 = { vreinterpretq_u16_u8(t0.val[0]), vreinterpretq_u16_u8(t0.val[1]) }; |
| 1250 | uint16x8x2_t d1 = { vreinterpretq_u16_u8(t1.val[0]), vreinterpretq_u16_u8(t1.val[1]) }; |
| 1251 | uint16x8x2_t d2 = { vreinterpretq_u16_u8(t2.val[0]), vreinterpretq_u16_u8(t2.val[1]) }; |
| 1252 | uint16x8x2_t d3 = { vreinterpretq_u16_u8(t3.val[0]), vreinterpretq_u16_u8(t3.val[1]) }; |
| 1253 | |
| 1254 | uint16x8x2_t s0 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[0] ), vreinterpretq_s16_u16( d1.val[0] ))), uint16x8_t()); |
| 1255 | uint16x8x2_t s1 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[1] ), vreinterpretq_s16_u16( d1.val[1] ))), uint16x8_t()); |
| 1256 | uint16x8x2_t s2 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[0] ), vreinterpretq_s16_u16( d3.val[0] ))), uint16x8_t()); |
| 1257 | uint16x8x2_t s3 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[1] ), vreinterpretq_s16_u16( d3.val[1] ))), uint16x8_t()); |
| 1258 | |
| 1259 | uint32x4x2_t sum0 = { vreinterpretq_u32_u16(s0.val[0]), vreinterpretq_u32_u16(s0.val[1]) }; |
| 1260 | uint32x4x2_t sum1 = { vreinterpretq_u32_u16(s1.val[0]), vreinterpretq_u32_u16(s1.val[1]) }; |
| 1261 | uint32x4x2_t sum2 = { vreinterpretq_u32_u16(s2.val[0]), vreinterpretq_u32_u16(s2.val[1]) }; |
| 1262 | uint32x4x2_t sum3 = { vreinterpretq_u32_u16(s3.val[0]), vreinterpretq_u32_u16(s3.val[1]) }; |
| 1263 | |
| 1264 | uint32x4_t b0 = vaddq_u32(sum0.val[0], sum0.val[1]); |
| 1265 | uint32x4_t b1 = vaddq_u32(sum1.val[0], sum1.val[1]); |
| 1266 | uint32x4_t b2 = vaddq_u32(sum2.val[0], sum2.val[1]); |
| 1267 | uint32x4_t b3 = vaddq_u32(sum3.val[0], sum3.val[1]); |
| 1268 | |
| 1269 | uint32x4_t a0 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b2, b3) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) ); |
| 1270 | uint32x4_t a1 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b0, b1) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) ); |
| 1271 | uint32x4_t a2 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b1, b3) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) ); |
| 1272 | uint32x4_t a3 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b0, b2) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) ); |
| 1273 | |
| 1274 | vst1q_u32(err[0], a0); |
| 1275 | vst1q_u32(err[1], a1); |
| 1276 | vst1q_u32(err[2], a2); |
| 1277 | vst1q_u32(err[3], a3); |
| 1278 | #else |
| 1279 | unsigned int terr[4][4]; |
| 1280 | |
| 1281 | memset(terr, 0, 16 * sizeof(unsigned int)); |
| 1282 | |
| 1283 | for( int j=0; j<4; j++ ) |
| 1284 | { |
| 1285 | for( int i=0; i<4; i++ ) |
| 1286 | { |
| 1287 | int index = (j & 2) + (i >> 1); |
| 1288 | unsigned int d = *data++; |
| 1289 | terr[index][0] += d; |
| 1290 | d = *data++; |
| 1291 | terr[index][1] += d; |
| 1292 | d = *data++; |
| 1293 | terr[index][2] += d; |
| 1294 | data++; |
| 1295 | } |
| 1296 | } |
| 1297 | |
| 1298 | for( int i=0; i<3; i++ ) |
| 1299 | { |
| 1300 | err[0][i] = terr[2][i] + terr[3][i]; |
| 1301 | err[1][i] = terr[0][i] + terr[1][i]; |
| 1302 | err[2][i] = terr[1][i] + terr[3][i]; |
| 1303 | err[3][i] = terr[0][i] + terr[2][i]; |
| 1304 | } |
| 1305 | for( int i=0; i<4; i++ ) |
| 1306 | { |
| 1307 | err[i][3] = 0; |
| 1308 | } |
| 1309 | #endif |
| 1310 | } |
| 1311 | |
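| | // Orders candidate averages by squared error: for the 8 pixels of a half-block, |
| | // sum|p-avg|^2 = sum|p|^2 - 2*sum(p).avg + 8*|avg|^2. The first term is identical for all |
| | // candidates, so only the last two are evaluated, offset by a constant to stay non-negative. |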
| 1312 | static etcpak_force_inline unsigned int CalcError( const unsigned int block[4], const v4i& average ) |
| 1313 | { |
| 1314 | unsigned int err = 0x3FFFFFFF; // Big value to prevent negative values, but small enough to prevent overflow |
| 1315 | err -= block[0] * 2 * average[2]; |
| 1316 | err -= block[1] * 2 * average[1]; |
| 1317 | err -= block[2] * 2 * average[0]; |
| 1318 | err += 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) ); |
| 1319 | return err; |
| 1320 | } |
| 1321 | |
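| | // Quantizes the raw averages: a[4..7] hold RGB555 versions with one color of each pair clamped |
| | // so the pair can be encoded as base color plus signed 3-bit delta (differential mode), while |
| | // a[0..3] are re-quantized to RGB444 (individual mode); all values are expanded back to 8 bits. |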
| 1322 | static etcpak_force_inline void ProcessAverages( v4i* a ) |
| 1323 | { |
| 1324 | #ifdef __SSE4_1__ |
| 1325 | for( int i=0; i<2; i++ ) |
| 1326 | { |
| 1327 | __m128i d = _mm_loadu_si128((__m128i*)a[i*2].data()); |
| 1328 | |
| 1329 | __m128i t = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(31)), _mm_set1_epi16(128)); |
| 1330 | |
| 1331 | __m128i c = _mm_srli_epi16(_mm_add_epi16(t, _mm_srli_epi16(t, 8)), 8); |
| 1332 | |
| 1333 | __m128i c1 = _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2)); |
| 1334 | __m128i diff = _mm_sub_epi16(c, c1); |
| 1335 | diff = _mm_max_epi16(diff, _mm_set1_epi16(-4)); |
| 1336 | diff = _mm_min_epi16(diff, _mm_set1_epi16(3)); |
| 1337 | |
| 1338 | __m128i co = _mm_add_epi16(c1, diff); |
| 1339 | |
| 1340 | c = _mm_blend_epi16(co, c, 0xF0); |
| 1341 | |
| 1342 | __m128i a0 = _mm_or_si128(_mm_slli_epi16(c, 3), _mm_srli_epi16(c, 2)); |
| 1343 | |
| 1344 | _mm_storeu_si128((__m128i*)a[4+i*2].data(), a0); |
| 1345 | } |
| 1346 | |
| 1347 | for( int i=0; i<2; i++ ) |
| 1348 | { |
| 1349 | __m128i d = _mm_loadu_si128((__m128i*)a[i*2].data()); |
| 1350 | |
| 1351 | __m128i t0 = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(15)), _mm_set1_epi16(128)); |
| 1352 | __m128i t1 = _mm_srli_epi16(_mm_add_epi16(t0, _mm_srli_epi16(t0, 8)), 8); |
| 1353 | |
| 1354 | __m128i t2 = _mm_or_si128(t1, _mm_slli_epi16(t1, 4)); |
| 1355 | |
| 1356 | _mm_storeu_si128((__m128i*)a[i*2].data(), t2); |
| 1357 | } |
| 1358 | #elif defined __ARM_NEON |
| 1359 | for( int i=0; i<2; i++ ) |
| 1360 | { |
| 1361 | int16x8_t d = vld1q_s16((int16_t*)&a[i*2]); |
| 1362 | int16x8_t t = vaddq_s16(vmulq_s16(d, vdupq_n_s16(31)), vdupq_n_s16(128)); |
| 1363 | int16x8_t c = vshrq_n_s16(vaddq_s16(t, vshrq_n_s16(t, 8)), 8); |
| 1364 | |
| 1365 | int16x8_t c1 = vcombine_s16(vget_high_s16(c), vget_high_s16(c)); |
| 1366 | int16x8_t diff = vsubq_s16(c, c1); |
| 1367 | diff = vmaxq_s16(diff, vdupq_n_s16(-4)); |
| 1368 | diff = vminq_s16(diff, vdupq_n_s16(3)); |
| 1369 | |
| 1370 | int16x8_t co = vaddq_s16(c1, diff); |
| 1371 | |
| 1372 | c = vcombine_s16(vget_low_s16(co), vget_high_s16(c)); |
| 1373 | |
| 1374 | int16x8_t a0 = vorrq_s16(vshlq_n_s16(c, 3), vshrq_n_s16(c, 2)); |
| 1375 | |
| 1376 | vst1q_s16((int16_t*)&a[4+i*2], a0); |
| 1377 | } |
| 1378 | |
| 1379 | for( int i=0; i<2; i++ ) |
| 1380 | { |
| 1381 | int16x8_t d = vld1q_s16((int16_t*)&a[i*2]); |
| 1382 | |
| 1383 | int16x8_t t0 = vaddq_s16(vmulq_s16(d, vdupq_n_s16(15)), vdupq_n_s16(128)); |
| 1384 | int16x8_t t1 = vshrq_n_s16(vaddq_s16(t0, vshrq_n_s16(t0, 8)), 8); |
| 1385 | |
| 1386 | int16x8_t t2 = vorrq_s16(t1, vshlq_n_s16(t1, 4)); |
| 1387 | |
| 1388 | vst1q_s16((int16_t*)&a[i*2], t2); |
| 1389 | } |
| 1390 | #else |
| 1391 | for( int i=0; i<2; i++ ) |
| 1392 | { |
| 1393 | for( int j=0; j<3; j++ ) |
| 1394 | { |
| 1395 | int32_t c1 = mul8bit( a[i*2+1][j], 31 ); |
| 1396 | int32_t c2 = mul8bit( a[i*2][j], 31 ); |
| 1397 | |
| 1398 | int32_t diff = c2 - c1; |
| 1399 | if( diff > 3 ) diff = 3; |
| 1400 | else if( diff < -4 ) diff = -4; |
| 1401 | |
| 1402 | int32_t co = c1 + diff; |
| 1403 | |
| 1404 | a[5+i*2][j] = ( c1 << 3 ) | ( c1 >> 2 ); |
| 1405 | a[4+i*2][j] = ( co << 3 ) | ( co >> 2 ); |
| 1406 | } |
| 1407 | } |
| 1408 | |
| 1409 | for( int i=0; i<4; i++ ) |
| 1410 | { |
| 1411 | a[i][0] = g_avg2[mul8bit( a[i][0], 15 )]; |
| 1412 | a[i][1] = g_avg2[mul8bit( a[i][1], 15 )]; |
| 1413 | a[i][2] = g_avg2[mul8bit( a[i][2], 15 )]; |
| 1414 | } |
| 1415 | #endif |
| 1416 | } |
| 1417 | |
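| | // Packs the selected pair of averages into the low 32 bits of the block: the split/mode index |
| | // goes to bits 24+, individual mode stores two RGB444 colors per channel, differential mode |
| | // stores an RGB555 base color plus a signed 3-bit delta. |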
| 1418 | static etcpak_force_inline void EncodeAverages( uint64_t& _d, const v4i* a, size_t idx ) |
| 1419 | { |
| 1420 | auto d = _d; |
| 1421 | d |= ( idx << 24 ); |
| 1422 | size_t base = idx << 1; |
| 1423 | |
| 1424 | if( ( idx & 0x2 ) == 0 ) |
| 1425 | { |
| 1426 | for( int i=0; i<3; i++ ) |
| 1427 | { |
| 1428 | d |= uint64_t( a[base+0][i] >> 4 ) << ( i*8 ); |
| 1429 | d |= uint64_t( a[base+1][i] >> 4 ) << ( i*8 + 4 ); |
| 1430 | } |
| 1431 | } |
| 1432 | else |
| 1433 | { |
| 1434 | for( int i=0; i<3; i++ ) |
| 1435 | { |
| 1436 | d |= uint64_t( a[base+1][i] & 0xF8 ) << ( i*8 ); |
| 1437 | int32_t c = ( ( a[base+0][i] & 0xF8 ) - ( a[base+1][i] & 0xF8 ) ) >> 3; |
| 1438 | c &= ~0xFFFFFFF8; |
| 1439 | d |= ((uint64_t)c) << ( i*8 ); |
| 1440 | } |
| 1441 | } |
| 1442 | _d = d; |
| 1443 | } |
| 1444 | |
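| | // Returns a ready-made block if all 16 pixels have the same color, 0 otherwise. |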
| 1445 | static etcpak_force_inline uint64_t CheckSolid( const uint8_t* src ) |
| 1446 | { |
| 1447 | #ifdef __SSE4_1__ |
| 1448 | __m128i d0 = _mm_loadu_si128(((__m128i*)src) + 0); |
| 1449 | __m128i d1 = _mm_loadu_si128(((__m128i*)src) + 1); |
| 1450 | __m128i d2 = _mm_loadu_si128(((__m128i*)src) + 2); |
| 1451 | __m128i d3 = _mm_loadu_si128(((__m128i*)src) + 3); |
| 1452 | |
| 1453 | __m128i c = _mm_shuffle_epi32(d0, _MM_SHUFFLE(0, 0, 0, 0)); |
| 1454 | |
| 1455 | __m128i c0 = _mm_cmpeq_epi8(d0, c); |
| 1456 | __m128i c1 = _mm_cmpeq_epi8(d1, c); |
| 1457 | __m128i c2 = _mm_cmpeq_epi8(d2, c); |
| 1458 | __m128i c3 = _mm_cmpeq_epi8(d3, c); |
| 1459 | |
| 1460 | __m128i m0 = _mm_and_si128(c0, c1); |
| 1461 | __m128i m1 = _mm_and_si128(c2, c3); |
| 1462 | __m128i m = _mm_and_si128(m0, m1); |
| 1463 | |
| 1464 | if (!_mm_testc_si128(m, _mm_set1_epi32(-1))) |
| 1465 | { |
| 1466 | return 0; |
| 1467 | } |
| 1468 | #elif defined __ARM_NEON |
| 1469 | int32x4_t d0 = vld1q_s32((int32_t*)src + 0); |
| 1470 | int32x4_t d1 = vld1q_s32((int32_t*)src + 4); |
| 1471 | int32x4_t d2 = vld1q_s32((int32_t*)src + 8); |
| 1472 | int32x4_t d3 = vld1q_s32((int32_t*)src + 12); |
| 1473 | |
| 1474 | int32x4_t c = vdupq_n_s32(d0[0]); |
| 1475 | |
| 1476 | int32x4_t c0 = vreinterpretq_s32_u32(vceqq_s32(d0, c)); |
| 1477 | int32x4_t c1 = vreinterpretq_s32_u32(vceqq_s32(d1, c)); |
| 1478 | int32x4_t c2 = vreinterpretq_s32_u32(vceqq_s32(d2, c)); |
| 1479 | int32x4_t c3 = vreinterpretq_s32_u32(vceqq_s32(d3, c)); |
| 1480 | |
| 1481 | int32x4_t m0 = vandq_s32(c0, c1); |
| 1482 | int32x4_t m1 = vandq_s32(c2, c3); |
| 1483 | int64x2_t m = vreinterpretq_s64_s32(vandq_s32(m0, m1)); |
| 1484 | |
| 1485 | if (m[0] != -1 || m[1] != -1) |
| 1486 | { |
| 1487 | return 0; |
| 1488 | } |
| 1489 | #else |
| 1490 | const uint8_t* ptr = src + 4; |
| 1491 | for( int i=1; i<16; i++ ) |
| 1492 | { |
| 1493 | if( memcmp( src, ptr, 4 ) != 0 ) |
| 1494 | { |
| 1495 | return 0; |
| 1496 | } |
| 1497 | ptr += 4; |
| 1498 | } |
| 1499 | #endif |
| 1500 | return 0x02000000 | |
| 1501 | ( (unsigned int)( src[0] & 0xF8 ) << 16 ) | |
| 1502 | ( (unsigned int)( src[1] & 0xF8 ) << 8 ) | |
| 1503 | ( (unsigned int)( src[2] & 0xF8 ) ); |
| 1504 | } |
| 1505 | |
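| | // Computes the quantized averages and accumulates an error estimate for each of the four |
| | // encoding candidates: err[0..1] cover the two split orientations in individual (RGB444) mode, |
| | // err[2..3] the same splits in differential (RGB555) mode. |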
| 1506 | static etcpak_force_inline void PrepareAverages( v4i a[8], const uint8_t* src, unsigned int err[4] ) |
| 1507 | { |
| 1508 | Average( src, a ); |
| 1509 | ProcessAverages( a ); |
| 1510 | |
| 1511 | unsigned int errblock[4][4]; |
| 1512 | CalcErrorBlock( src, errblock ); |
| 1513 | |
| 1514 | for( int i=0; i<4; i++ ) |
| 1515 | { |
| 1516 | err[i/2] += CalcError( errblock[i], a[i] ); |
| 1517 | err[2+i/2] += CalcError( errblock[i], a[i+4] ); |
| 1518 | } |
| 1519 | } |
| 1520 | |
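| | // Reference selector search: projects each pixel's difference from its half-block average onto |
| | // luma (weights 77/151/28), then for every ETC1 modifier table picks the closest of the four |
| | // modifiers, accumulating squared error per table in terr and the selector index in tsel. |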
| 1521 | static etcpak_force_inline void FindBestFit( uint64_t terr[2][8], uint16_t tsel[16][8], v4i a[8], const uint32_t* id, const uint8_t* data ) |
| 1522 | { |
| 1523 | for( size_t i=0; i<16; i++ ) |
| 1524 | { |
| 1525 | uint16_t* sel = tsel[i]; |
| 1526 | unsigned int bid = id[i]; |
| 1527 | uint64_t* ter = terr[bid%2]; |
| 1528 | |
| 1529 | uint8_t b = *data++; |
| 1530 | uint8_t g = *data++; |
| 1531 | uint8_t r = *data++; |
| 1532 | data++; |
| 1533 | |
| 1534 | int dr = a[bid][0] - r; |
| 1535 | int dg = a[bid][1] - g; |
| 1536 | int db = a[bid][2] - b; |
| 1537 | |
| 1538 | #ifdef __SSE4_1__ |
| 1539 | // Reference implementation |
| 1540 | |
| 1541 | __m128i pix = _mm_set1_epi32(dr * 77 + dg * 151 + db * 28); |
| 1542 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
| 1543 | __m128i error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[0])); |
| 1544 | __m128i error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[1])); |
| 1545 | __m128i error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[0])); |
| 1546 | __m128i error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[1])); |
| 1547 | |
| 1548 | __m128i index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1)); |
| 1549 | __m128i minError0 = _mm_min_epi32(error0, error1); |
| 1550 | |
| 1551 | __m128i index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2)); |
| 1552 | __m128i minError1 = _mm_min_epi32(error2, error3); |
| 1553 | |
| 1554 | __m128i minIndex0 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0)); |
| 1555 | __m128i minError = _mm_min_epi32(minError0, minError1); |
| 1556 | |
| 1557 | // Squaring the minimum error to produce correct values when adding |
| 1558 | __m128i minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0)); |
| 1559 | __m128i squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow); |
| 1560 | squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0)); |
| 1561 | _mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow); |
| 1562 | __m128i minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2)); |
| 1563 | __m128i squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh); |
| 1564 | squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1)); |
| 1565 | _mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh); |
| 1566 | |
| 1567 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
| 1568 | error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[2])); |
| 1569 | error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[3])); |
| 1570 | error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[2])); |
| 1571 | error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[3])); |
| 1572 | |
| 1573 | index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1)); |
| 1574 | minError0 = _mm_min_epi32(error0, error1); |
| 1575 | |
| 1576 | index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2)); |
| 1577 | minError1 = _mm_min_epi32(error2, error3); |
| 1578 | |
| 1579 | __m128i minIndex1 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0)); |
| 1580 | minError = _mm_min_epi32(minError0, minError1); |
| 1581 | |
| 1582 | // Squaring the minimum error to produce correct values when adding |
| 1583 | minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0)); |
| 1584 | squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow); |
| 1585 | squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 2)); |
| 1586 | _mm_storeu_si128(((__m128i*)ter) + 2, squareErrorLow); |
| 1587 | minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2)); |
| 1588 | squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh); |
| 1589 | squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 3)); |
| 1590 | _mm_storeu_si128(((__m128i*)ter) + 3, squareErrorHigh); |
| 1591 | __m128i minIndex = _mm_packs_epi32(minIndex0, minIndex1); |
| 1592 | _mm_storeu_si128((__m128i*)sel, minIndex); |
| 1593 | #elif defined __ARM_NEON |
| 1594 | int32x4_t pix = vdupq_n_s32(dr * 77 + dg * 151 + db * 28); |
| 1595 | |
| 1596 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
| 1597 | uint32x4_t error0 = vreinterpretq_u32_s32(vabsq_s32(vaddq_s32(pix, g_table256_NEON[0]))); |
| 1598 | uint32x4_t error1 = vreinterpretq_u32_s32(vabsq_s32(vaddq_s32(pix, g_table256_NEON[1]))); |
| 1599 | uint32x4_t error2 = vreinterpretq_u32_s32(vabsq_s32(vsubq_s32(pix, g_table256_NEON[0]))); |
| 1600 | uint32x4_t error3 = vreinterpretq_u32_s32(vabsq_s32(vsubq_s32(pix, g_table256_NEON[1]))); |
| 1601 | |
| 1602 | uint32x4_t index0 = vandq_u32(vcltq_u32(error1, error0), vdupq_n_u32(1)); |
| 1603 | uint32x4_t minError0 = vminq_u32(error0, error1); |
| 1604 | |
| 1605 | uint32x4_t index1 = vreinterpretq_u32_s32(vsubq_s32(vdupq_n_s32(2), vreinterpretq_s32_u32(vcltq_u32(error3, error2)))); |
| 1606 | uint32x4_t minError1 = vminq_u32(error2, error3); |
| 1607 | |
| 1608 | uint32x4_t blendMask = vcltq_u32(minError1, minError0); |
| 1609 | uint32x4_t minIndex0 = vorrq_u32(vbicq_u32(index0, blendMask), vandq_u32(index1, blendMask)); |
| 1610 | uint32x4_t minError = vminq_u32(minError0, minError1); |
| 1611 | |
| 1612 | // Squaring the minimum error to produce correct values when adding |
| 1613 | uint32x4_t squareErrorLow = vmulq_u32(minError, minError); |
| 1614 | uint32x4_t squareErrorHigh = vshrq_n_u32(vreinterpretq_u32_s32(vqdmulhq_s32(vreinterpretq_s32_u32(minError), vreinterpretq_s32_u32(minError))), 1); |
| 1615 | uint32x4x2_t squareErrorZip = vzipq_u32(squareErrorLow, squareErrorHigh); |
| 1616 | uint64x2x2_t squareError = { vreinterpretq_u64_u32(squareErrorZip.val[0]), vreinterpretq_u64_u32(squareErrorZip.val[1]) }; |
| 1617 | squareError.val[0] = vaddq_u64(squareError.val[0], vld1q_u64(ter + 0)); |
| 1618 | squareError.val[1] = vaddq_u64(squareError.val[1], vld1q_u64(ter + 2)); |
| 1619 | vst1q_u64(ter + 0, squareError.val[0]); |
| 1620 | vst1q_u64(ter + 2, squareError.val[1]); |
| 1621 | |
| 1622 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
| 1623 | error0 = vreinterpretq_u32_s32( vabsq_s32(vaddq_s32(pix, g_table256_NEON[2]))); |
| 1624 | error1 = vreinterpretq_u32_s32( vabsq_s32(vaddq_s32(pix, g_table256_NEON[3]))); |
| 1625 | error2 = vreinterpretq_u32_s32( vabsq_s32(vsubq_s32(pix, g_table256_NEON[2]))); |
| 1626 | error3 = vreinterpretq_u32_s32( vabsq_s32(vsubq_s32(pix, g_table256_NEON[3]))); |
| 1627 | |
| 1628 | index0 = vandq_u32(vcltq_u32(error1, error0), vdupq_n_u32(1)); |
| 1629 | minError0 = vminq_u32(error0, error1); |
| 1630 | |
| 1631 | index1 = vreinterpretq_u32_s32( vsubq_s32(vdupq_n_s32(2), vreinterpretq_s32_u32(vcltq_u32(error3, error2))) ); |
| 1632 | minError1 = vminq_u32(error2, error3); |
| 1633 | |
| 1634 | blendMask = vcltq_u32(minError1, minError0); |
| 1635 | uint32x4_t minIndex1 = vorrq_u32(vbicq_u32(index0, blendMask), vandq_u32(index1, blendMask)); |
| 1636 | minError = vminq_u32(minError0, minError1); |
| 1637 | |
| 1638 | // Squaring the minimum error to produce correct values when adding |
| 1639 | squareErrorLow = vmulq_u32(minError, minError); |
| 1640 | squareErrorHigh = vshrq_n_u32(vreinterpretq_u32_s32( vqdmulhq_s32(vreinterpretq_s32_u32(minError), vreinterpretq_s32_u32(minError)) ), 1 ); |
| 1641 | squareErrorZip = vzipq_u32(squareErrorLow, squareErrorHigh); |
| 1642 | squareError.val[0] = vaddq_u64(vreinterpretq_u64_u32( squareErrorZip.val[0] ), vld1q_u64(ter + 4)); |
| 1643 | squareError.val[1] = vaddq_u64(vreinterpretq_u64_u32( squareErrorZip.val[1] ), vld1q_u64(ter + 6)); |
| 1644 | vst1q_u64(ter + 4, squareError.val[0]); |
| 1645 | vst1q_u64(ter + 6, squareError.val[1]); |
| 1646 | |
| 1647 | uint16x8_t minIndex = vcombine_u16(vqmovn_u32(minIndex0), vqmovn_u32(minIndex1)); |
| 1648 | vst1q_u16(sel, minIndex); |
| 1649 | #else |
| 1650 | int pix = dr * 77 + dg * 151 + db * 28; |
| 1651 | |
| 1652 | for( int t=0; t<8; t++ ) |
| 1653 | { |
| 1654 | const int64_t* tab = g_table256[t]; |
| 1655 | unsigned int idx = 0; |
| 1656 | uint64_t err = sq( tab[0] + pix ); |
| 1657 | for( int j=1; j<4; j++ ) |
| 1658 | { |
| 1659 | uint64_t local = sq( tab[j] + pix ); |
| 1660 | if( local < err ) |
| 1661 | { |
| 1662 | err = local; |
| 1663 | idx = j; |
| 1664 | } |
| 1665 | } |
| 1666 | *sel++ = idx; |
| 1667 | *ter++ += err; |
| 1668 | } |
| 1669 | #endif |
| 1670 | } |
| 1671 | } |
| 1672 | |
| 1673 | #if defined __SSE4_1__ || defined __ARM_NEON |
| 1674 | // Non-reference implementation, but faster. Produces the same results as the AVX2 version |
| 1675 | static etcpak_force_inline void FindBestFit( uint32_t terr[2][8], uint16_t tsel[16][8], v4i a[8], const uint32_t* id, const uint8_t* data ) |
| 1676 | { |
| 1677 | for( size_t i=0; i<16; i++ ) |
| 1678 | { |
| 1679 | uint16_t* sel = tsel[i]; |
| 1680 | unsigned int bid = id[i]; |
| 1681 | uint32_t* ter = terr[bid%2]; |
| 1682 | |
| 1683 | uint8_t b = *data++; |
| 1684 | uint8_t g = *data++; |
| 1685 | uint8_t r = *data++; |
| 1686 | data++; |
| 1687 | |
| 1688 | int dr = a[bid][0] - r; |
| 1689 | int dg = a[bid][1] - g; |
| 1690 | int db = a[bid][2] - b; |
| 1691 | |
| 1692 | #ifdef __SSE4_1__ |
| 1693 | // The scaling values are divided by two and rounded so that the differences fit into a signed 16-bit range |
| 1694 | // This produces slightly different results, but is significantly faster |
| 1695 | __m128i pixel = _mm_set1_epi16(dr * 38 + dg * 76 + db * 14); |
| 1696 | __m128i pix = _mm_abs_epi16(pixel); |
| 1697 | |
| 1698 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
| 1699 | // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. |
| 1700 | __m128i error0 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[0])); |
| 1701 | __m128i error1 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[1])); |
| 1702 | |
| 1703 | __m128i index = _mm_and_si128(_mm_cmplt_epi16(error1, error0), _mm_set1_epi16(1)); |
| 1704 | __m128i minError = _mm_min_epi16(error0, error1); |
| 1705 | |
| 1706 | // Exploit the symmetry of the selector table and use the sign bit |
| 1707 | // This produces slightly different results, but is needed to match the AVX2 implementation |
| 1708 | __m128i indexBit = _mm_andnot_si128(_mm_srli_epi16(pixel, 15), _mm_set1_epi8(-1)); |
| 1709 | __m128i minIndex = _mm_or_si128(index, _mm_add_epi16(indexBit, indexBit)); |
| 1710 | |
| 1711 | // Squaring the minimum error to produce correct values when adding |
| 1712 | __m128i squareErrorLo = _mm_mullo_epi16(minError, minError); |
| 1713 | __m128i squareErrorHi = _mm_mulhi_epi16(minError, minError); |
| 1714 | |
| 1715 | __m128i squareErrorLow = _mm_unpacklo_epi16(squareErrorLo, squareErrorHi); |
| 1716 | __m128i squareErrorHigh = _mm_unpackhi_epi16(squareErrorLo, squareErrorHi); |
| 1717 | |
| 1718 | squareErrorLow = _mm_add_epi32(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0)); |
| 1719 | _mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow); |
| 1720 | squareErrorHigh = _mm_add_epi32(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1)); |
| 1721 | _mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh); |
| 1722 | |
| 1723 | _mm_storeu_si128((__m128i*)sel, minIndex); |
| 1724 | #elif defined __ARM_NEON |
| 1725 | int16x8_t pixel = vdupq_n_s16( dr * 38 + dg * 76 + db * 14 ); |
| 1726 | int16x8_t pix = vabsq_s16( pixel ); |
| 1727 | |
| 1728 | int16x8_t error0 = vabsq_s16( vsubq_s16( pix, g_table128_NEON[0] ) ); |
| 1729 | int16x8_t error1 = vabsq_s16( vsubq_s16( pix, g_table128_NEON[1] ) ); |
| 1730 | |
| 1731 | int16x8_t index = vandq_s16( vreinterpretq_s16_u16( vcltq_s16( error1, error0 ) ), vdupq_n_s16( 1 ) ); |
| 1732 | int16x8_t minError = vminq_s16( error0, error1 ); |
| 1733 | |
| 1734 | int16x8_t indexBit = vandq_s16( vmvnq_s16( vshrq_n_s16( pixel, 15 ) ), vdupq_n_s16( -1 ) ); |
| 1735 | int16x8_t minIndex = vorrq_s16( index, vaddq_s16( indexBit, indexBit ) ); |
| 1736 | |
| 1737 | int16x4_t minErrorLow = vget_low_s16( minError ); |
| 1738 | int16x4_t minErrorHigh = vget_high_s16( minError ); |
| 1739 | |
| 1740 | int32x4_t squareErrorLow = vmull_s16( minErrorLow, minErrorLow ); |
| 1741 | int32x4_t squareErrorHigh = vmull_s16( minErrorHigh, minErrorHigh ); |
| 1742 | |
| 1743 | int32x4_t squareErrorSumLow = vaddq_s32( squareErrorLow, vld1q_s32( (int32_t*)ter ) ); |
| 1744 | int32x4_t squareErrorSumHigh = vaddq_s32( squareErrorHigh, vld1q_s32( (int32_t*)ter + 4 ) ); |
| 1745 | |
| 1746 | vst1q_s32( (int32_t*)ter, squareErrorSumLow ); |
| 1747 | vst1q_s32( (int32_t*)ter + 4, squareErrorSumHigh ); |
| 1748 | |
| 1749 | vst1q_s16( (int16_t*)sel, minIndex ); |
| 1750 | #endif |
| 1751 | } |
| 1752 | } |
| 1753 | #endif |
| 1754 | |
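| | // Quantization helpers for the planar fit: map a fixed-point color estimate clamped to 0..1023 |
| | // to the 6-bit (red/blue) and 7-bit (green) values of the ETC2 planar encoding. |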
| 1755 | static etcpak_force_inline uint8_t convert6(float f) |
| 1756 | { |
| 1757 | int i = (std::min(std::max(static_cast<int>(f), 0), 1023) - 15) >> 1; |
| 1758 | return (i + 11 - ((i + 11) >> 7) - ((i + 4) >> 7)) >> 3; |
| 1759 | } |
| 1760 | |
| 1761 | static etcpak_force_inline uint8_t convert7(float f) |
| 1762 | { |
| 1763 | int i = (std::min(std::max(static_cast<int>(f), 0), 1023) - 15) >> 1; |
| 1764 | return (i + 9 - ((i + 9) >> 8) - ((i + 6) >> 8)) >> 2; |
| 1765 | } |
| 1766 | |
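| | // ETC2 planar mode: fits a linear color gradient over the 4x4 block, derives the three corner |
| | // colors (RGBO origin, RGBH horizontal, RGBV vertical), packs them as R6G7B6 and, when heuristics |
| | // are enabled and the block was not already classified as planar, also returns the luma-weighted |
| | // reconstruction error used for mode selection. |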
| 1767 | static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar( const uint8_t* src, const uint8_t mode, bool useHeuristics ) |
| 1768 | { |
| 1769 | int32_t r = 0; |
| 1770 | int32_t g = 0; |
| 1771 | int32_t b = 0; |
| 1772 | |
| 1773 | for( int i = 0; i < 16; ++i ) |
| 1774 | { |
| 1775 | b += src[i * 4 + 0]; |
| 1776 | g += src[i * 4 + 1]; |
| 1777 | r += src[i * 4 + 2]; |
| 1778 | } |
| 1779 | |
| 1780 | int32_t difRyz = 0; |
| 1781 | int32_t difGyz = 0; |
| 1782 | int32_t difByz = 0; |
| 1783 | int32_t difRxz = 0; |
| 1784 | int32_t difGxz = 0; |
| 1785 | int32_t difBxz = 0; |
| 1786 | |
| 1787 | const int32_t scaling[] = { -255, -85, 85, 255 }; |
| 1788 | |
| 1789 | for (int i = 0; i < 16; ++i) |
| 1790 | { |
| 1791 | int32_t difB = (static_cast<int>(src[i * 4 + 0]) << 4) - b; |
| 1792 | int32_t difG = (static_cast<int>(src[i * 4 + 1]) << 4) - g; |
| 1793 | int32_t difR = (static_cast<int>(src[i * 4 + 2]) << 4) - r; |
| 1794 | |
| 1795 | difRyz += difR * scaling[i % 4]; |
| 1796 | difGyz += difG * scaling[i % 4]; |
| 1797 | difByz += difB * scaling[i % 4]; |
| 1798 | |
| 1799 | difRxz += difR * scaling[i / 4]; |
| 1800 | difGxz += difG * scaling[i / 4]; |
| 1801 | difBxz += difB * scaling[i / 4]; |
| 1802 | } |
| 1803 | |
| 1804 | const float scale = -4.0f / ((255 * 255 * 8.0f + 85 * 85 * 8.0f) * 16.0f); |
| 1805 | |
| 1806 | float aR = difRxz * scale; |
| 1807 | float aG = difGxz * scale; |
| 1808 | float aB = difBxz * scale; |
| 1809 | |
| 1810 | float bR = difRyz * scale; |
| 1811 | float bG = difGyz * scale; |
| 1812 | float bB = difByz * scale; |
| 1813 | |
| 1814 | float dR = r * (4.0f / 16.0f); |
| 1815 | float dG = g * (4.0f / 16.0f); |
| 1816 | float dB = b * (4.0f / 16.0f); |
| 1817 | |
| 1818 | // calculating the three colors RGBO, RGBH, and RGBV. RGB = df - af * x - bf * y; |
| 1819 | float cofR = std::fma(aR, 255.0f, std::fma(bR, 255.0f, dR)); |
| 1820 | float cofG = std::fma(aG, 255.0f, std::fma(bG, 255.0f, dG)); |
| 1821 | float cofB = std::fma(aB, 255.0f, std::fma(bB, 255.0f, dB)); |
| 1822 | float chfR = std::fma(aR, -425.0f, std::fma(bR, 255.0f, dR)); |
| 1823 | float chfG = std::fma(aG, -425.0f, std::fma(bG, 255.0f, dG)); |
| 1824 | float chfB = std::fma(aB, -425.0f, std::fma(bB, 255.0f, dB)); |
| 1825 | float cvfR = std::fma(aR, 255.0f, std::fma(bR, -425.0f, dR)); |
| 1826 | float cvfG = std::fma(aG, 255.0f, std::fma(bG, -425.0f, dG)); |
| 1827 | float cvfB = std::fma(aB, 255.0f, std::fma(bB, -425.0f, dB)); |
| 1828 | |
| 1829 | // convert to r6g7b6 |
| 1830 | int32_t coR = convert6(cofR); |
| 1831 | int32_t coG = convert7(cofG); |
| 1832 | int32_t coB = convert6(cofB); |
| 1833 | int32_t chR = convert6(chfR); |
| 1834 | int32_t chG = convert7(chfG); |
| 1835 | int32_t chB = convert6(chfB); |
| 1836 | int32_t cvR = convert6(cvfR); |
| 1837 | int32_t cvG = convert7(cvfG); |
| 1838 | int32_t cvB = convert6(cvfB); |
| 1839 | |
| 1840 | // Error calculation |
| 1841 | uint64_t error = 0; |
| 1842 | if( ModePlanar != mode && useHeuristics ) |
| 1843 | { |
| 1844 | auto ro0 = coR; |
| 1845 | auto go0 = coG; |
| 1846 | auto bo0 = coB; |
| 1847 | auto ro1 = ( ro0 >> 4 ) | ( ro0 << 2 ); |
| 1848 | auto go1 = ( go0 >> 6 ) | ( go0 << 1 ); |
| 1849 | auto bo1 = ( bo0 >> 4 ) | ( bo0 << 2 ); |
| 1850 | auto ro2 = ( ro1 << 2 ) + 2; |
| 1851 | auto go2 = ( go1 << 2 ) + 2; |
| 1852 | auto bo2 = ( bo1 << 2 ) + 2; |
| 1853 | |
| 1854 | auto rh0 = chR; |
| 1855 | auto gh0 = chG; |
| 1856 | auto bh0 = chB; |
| 1857 | auto rh1 = ( rh0 >> 4 ) | ( rh0 << 2 ); |
| 1858 | auto gh1 = ( gh0 >> 6 ) | ( gh0 << 1 ); |
| 1859 | auto bh1 = ( bh0 >> 4 ) | ( bh0 << 2 ); |
| 1860 | |
| 1861 | auto rh2 = rh1 - ro1; |
| 1862 | auto gh2 = gh1 - go1; |
| 1863 | auto bh2 = bh1 - bo1; |
| 1864 | |
| 1865 | auto rv0 = cvR; |
| 1866 | auto gv0 = cvG; |
| 1867 | auto bv0 = cvB; |
| 1868 | auto rv1 = ( rv0 >> 4 ) | ( rv0 << 2 ); |
| 1869 | auto gv1 = ( gv0 >> 6 ) | ( gv0 << 1 ); |
| 1870 | auto bv1 = ( bv0 >> 4 ) | ( bv0 << 2 ); |
| 1871 | |
| 1872 | auto rv2 = rv1 - ro1; |
| 1873 | auto gv2 = gv1 - go1; |
| 1874 | auto bv2 = bv1 - bo1; |
| 1875 | for( int i = 0; i < 16; ++i ) |
| 1876 | { |
| 1877 | int32_t cR = clampu8( ( rh2 * ( i / 4 ) + rv2 * ( i % 4 ) + ro2 ) >> 2 ); |
| 1878 | int32_t cG = clampu8( ( gh2 * ( i / 4 ) + gv2 * ( i % 4 ) + go2 ) >> 2 ); |
| 1879 | int32_t cB = clampu8( ( bh2 * ( i / 4 ) + bv2 * ( i % 4 ) + bo2 ) >> 2 ); |
| 1880 | |
| 1881 | int32_t difB = static_cast<int>( src[i * 4 + 0] ) - cB; |
| 1882 | int32_t difG = static_cast<int>( src[i * 4 + 1] ) - cG; |
| 1883 | int32_t difR = static_cast<int>( src[i * 4 + 2] ) - cR; |
| 1884 | |
| 1885 | int32_t dif = difR * 38 + difG * 76 + difB * 14; |
| 1886 | |
| 1887 | error += dif * dif; |
| 1888 | } |
| 1889 | } |
| 1890 | |
| 1891 | // pack the origin (O), horizontal (H) and vertical (V) colors into the 64-bit planar block |
| 1892 | uint32_t rgbv = cvB | ( cvG << 6 ) | ( cvR << 13 ); |
| 1893 | uint32_t rgbh = chB | ( chG << 6 ) | ( chR << 13 ); |
| 1894 | uint32_t hi = rgbv | ( ( rgbh & 0x1FFF ) << 19 ); |
| 1895 | uint32_t lo = ( chR & 0x1 ) | 0x2 | ( ( chR << 1 ) & 0x7C ); |
| 1896 | lo |= ( ( coB & 0x07 ) << 7 ) | ( ( coB & 0x18 ) << 8 ) | ( ( coB & 0x20 ) << 11 ); |
| 1897 | lo |= ( ( coG & 0x3F ) << 17 ) | ( ( coG & 0x40 ) << 18 ); |
| 1898 | lo |= coR << 25; |
| 1899 | |
| 1900 | const auto idx = ( coR & 0x20 ) | ( ( coG & 0x20 ) >> 1 ) | ( ( coB & 0x1E ) >> 1 ); |
| 1901 | |
| 1902 | lo |= g_flags[idx]; |
| 1903 | |
| 1904 | uint64_t result = static_cast<uint32_t>( _bswap( lo ) ); |
| 1905 | result |= static_cast<uint64_t>( static_cast<uint32_t>( _bswap( hi ) ) ) << 32; |
| 1906 | |
| 1907 | return std::make_pair( result, error ); |
| 1908 | } |
| 1909 | |
| 1910 | #ifdef __ARM_NEON |
| 1911 | |
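| | // NEON helpers for the planar fit: DifXZ/DifYZ accumulate the {-255,-85,85,255}-weighted |
| | // differences along the two block axes, SumWide broadcasts a channel sum across all lanes. |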
| 1912 | static etcpak_force_inline int32x2_t Planar_NEON_DifXZ( int16x8_t dif_lo, int16x8_t dif_hi ) |
| 1913 | { |
| 1914 | int32x4_t dif0 = vmull_n_s16( vget_low_s16( dif_lo ), -255 ); |
| 1915 | int32x4_t dif1 = vmull_n_s16( vget_high_s16( dif_lo ), -85 ); |
| 1916 | int32x4_t dif2 = vmull_n_s16( vget_low_s16( dif_hi ), 85 ); |
| 1917 | int32x4_t dif3 = vmull_n_s16( vget_high_s16( dif_hi ), 255 ); |
| 1918 | int32x4_t dif4 = vaddq_s32( vaddq_s32( dif0, dif1 ), vaddq_s32( dif2, dif3 ) ); |
| 1919 | |
| 1920 | #ifndef __aarch64__ |
| 1921 | int32x2_t dif5 = vpadd_s32( vget_low_s32( dif4 ), vget_high_s32( dif4 ) ); |
| 1922 | return vpadd_s32( dif5, dif5 ); |
| 1923 | #else |
| 1924 | return vdup_n_s32( vaddvq_s32( dif4 ) ); |
| 1925 | #endif |
| 1926 | } |
| 1927 | |
| 1928 | static etcpak_force_inline int32x2_t Planar_NEON_DifYZ( int16x8_t dif_lo, int16x8_t dif_hi ) |
| 1929 | { |
| 1930 | int16x4_t scaling = { -255, -85, 85, 255 }; |
| 1931 | int32x4_t dif0 = vmull_s16( vget_low_s16( dif_lo ), scaling ); |
| 1932 | int32x4_t dif1 = vmull_s16( vget_high_s16( dif_lo ), scaling ); |
| 1933 | int32x4_t dif2 = vmull_s16( vget_low_s16( dif_hi ), scaling ); |
| 1934 | int32x4_t dif3 = vmull_s16( vget_high_s16( dif_hi ), scaling ); |
| 1935 | int32x4_t dif4 = vaddq_s32( vaddq_s32( dif0, dif1 ), vaddq_s32( dif2, dif3 ) ); |
| 1936 | |
| 1937 | #ifndef __aarch64__ |
| 1938 | int32x2_t dif5 = vpadd_s32( vget_low_s32( dif4 ), vget_high_s32( dif4 ) ); |
| 1939 | return vpadd_s32( dif5, dif5 ); |
| 1940 | #else |
| 1941 | return vdup_n_s32( vaddvq_s32( dif4 ) ); |
| 1942 | #endif |
| 1943 | } |
| 1944 | |
| 1945 | static etcpak_force_inline int16x8_t Planar_NEON_SumWide( uint8x16_t src ) |
| 1946 | { |
| 1947 | uint16x8_t accu8 = vpaddlq_u8( src ); |
| 1948 | #ifndef __aarch64__ |
| 1949 | uint16x4_t accu4 = vpadd_u16( vget_low_u16( accu8 ), vget_high_u16( accu8 ) ); |
| 1950 | uint16x4_t accu2 = vpadd_u16( accu4, accu4 ); |
| 1951 | uint16x4_t accu1 = vpadd_u16( accu2, accu2 ); |
| 1952 | return vreinterpretq_s16_u16( vcombine_u16( accu1, accu1 ) ); |
| 1953 | #else |
| 1954 | return vdupq_n_s16( vaddvq_u16( accu8 ) ); |
| 1955 | #endif |
| 1956 | } |
| 1957 | |
| 1958 | static etcpak_force_inline int16x8_t convert6_NEON( int32x4_t lo, int32x4_t hi ) |
| 1959 | { |
| 1960 | uint16x8_t x = vcombine_u16( vqmovun_s32( lo ), vqmovun_s32( hi ) ); |
| 1961 | int16x8_t i = vreinterpretq_s16_u16( vshrq_n_u16( vqshlq_n_u16( x, 6 ), 6) ); // clamp 0-1023 |
| 1962 | i = vhsubq_s16( i, vdupq_n_s16( 15 ) ); |
| 1963 | |
| 1964 | int16x8_t ip11 = vaddq_s16( i, vdupq_n_s16( 11 ) ); |
| 1965 | int16x8_t ip4 = vaddq_s16( i, vdupq_n_s16( 4 ) ); |
| 1966 | |
| 1967 | return vshrq_n_s16( vsubq_s16( vsubq_s16( ip11, vshrq_n_s16( ip11, 7 ) ), vshrq_n_s16( ip4, 7) ), 3 ); |
| 1968 | } |
| 1969 | |
| 1970 | static etcpak_force_inline int16x4_t convert7_NEON( int32x4_t x ) |
| 1971 | { |
| 1972 | int16x4_t i = vreinterpret_s16_u16( vshr_n_u16( vqshl_n_u16( vqmovun_s32( x ), 6 ), 6 ) ); // clamp 0-1023 |
| 1973 | i = vhsub_s16( i, vdup_n_s16( 15 ) ); |
| 1974 | |
| 1975 | int16x4_t p9 = vadd_s16( i, vdup_n_s16( 9 ) ); |
| 1976 | int16x4_t p6 = vadd_s16( i, vdup_n_s16( 6 ) ); |
| 1977 | return vshr_n_s16( vsub_s16( vsub_s16( p9, vshr_n_s16( p9, 8 ) ), vshr_n_s16( p6, 8 ) ), 2 ); |
| 1978 | } |
| 1979 | |
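| | // NEON implementation of Planar(): same gradient fit, quantization and packing as the scalar |
| | // version above, with the heuristic error evaluated in vector arithmetic. |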
| 1980 | static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar_NEON( const uint8_t* src, const uint8_t mode, bool useHeuristics ) |
| 1981 | { |
| 1982 | uint8x16x4_t srcBlock = vld4q_u8( src ); |
| 1983 | |
| 1984 | int16x8_t bSumWide = Planar_NEON_SumWide( srcBlock.val[0] ); |
| 1985 | int16x8_t gSumWide = Planar_NEON_SumWide( srcBlock.val[1] ); |
| 1986 | int16x8_t rSumWide = Planar_NEON_SumWide( srcBlock.val[2] ); |
| 1987 | |
| 1988 | int16x8_t dif_R_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[2] ), 4) ), rSumWide ); |
| 1989 | int16x8_t dif_R_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[2] ), 4) ), rSumWide ); |
| 1990 | |
| 1991 | int16x8_t dif_G_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[1] ), 4 ) ), gSumWide ); |
| 1992 | int16x8_t dif_G_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[1] ), 4 ) ), gSumWide ); |
| 1993 | |
| 1994 | int16x8_t dif_B_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[0] ), 4) ), bSumWide ); |
| 1995 | int16x8_t dif_B_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[0] ), 4) ), bSumWide ); |
| 1996 | |
| 1997 | int32x2x2_t dif_xz_z = vzip_s32( vzip_s32( Planar_NEON_DifXZ( dif_B_lo, dif_B_hi ), Planar_NEON_DifXZ( dif_R_lo, dif_R_hi ) ).val[0], Planar_NEON_DifXZ( dif_G_lo, dif_G_hi ) ); |
| 1998 | int32x4_t dif_xz = vcombine_s32( dif_xz_z.val[0], dif_xz_z.val[1] ); |
| 1999 | int32x2x2_t dif_yz_z = vzip_s32( vzip_s32( Planar_NEON_DifYZ( dif_B_lo, dif_B_hi ), Planar_NEON_DifYZ( dif_R_lo, dif_R_hi ) ).val[0], Planar_NEON_DifYZ( dif_G_lo, dif_G_hi ) ); |
| 2000 | int32x4_t dif_yz = vcombine_s32( dif_yz_z.val[0], dif_yz_z.val[1] ); |
| 2001 | |
| 2002 | const float fscale = -4.0f / ( (255 * 255 * 8.0f + 85 * 85 * 8.0f ) * 16.0f ); |
| 2003 | float32x4_t fa = vmulq_n_f32( vcvtq_f32_s32( dif_xz ), fscale ); |
| 2004 | float32x4_t fb = vmulq_n_f32( vcvtq_f32_s32( dif_yz ), fscale ); |
| 2005 | int16x4_t bgrgSum = vzip_s16( vzip_s16( vget_low_s16( bSumWide ), vget_low_s16( rSumWide ) ).val[0], vget_low_s16( gSumWide ) ).val[0]; |
| 2006 | float32x4_t fd = vmulq_n_f32( vcvtq_f32_s32( vmovl_s16( bgrgSum ) ), 4.0f / 16.0f); |
| 2007 | |
| 2008 | float32x4_t cof = vmlaq_n_f32( vmlaq_n_f32( fd, fb, 255.0f ), fa, 255.0f ); |
| 2009 | float32x4_t chf = vmlaq_n_f32( vmlaq_n_f32( fd, fb, 255.0f ), fa, -425.0f ); |
| 2010 | float32x4_t cvf = vmlaq_n_f32( vmlaq_n_f32( fd, fb, -425.0f ), fa, 255.0f ); |
| 2011 | |
| 2012 | int32x4_t coi = vcvtq_s32_f32( cof ); |
| 2013 | int32x4_t chi = vcvtq_s32_f32( chf ); |
| 2014 | int32x4_t cvi = vcvtq_s32_f32( cvf ); |
| 2015 | |
| 2016 | int32x4x2_t tr_hv = vtrnq_s32( chi, cvi ); |
| 2017 | int32x4x2_t tr_o = vtrnq_s32( coi, coi ); |
| 2018 | |
| 2019 | int16x8_t c_hvoo_br_6 = convert6_NEON( tr_hv.val[0], tr_o.val[0] ); |
| 2020 | int16x4_t c_hvox_g_7 = convert7_NEON( vcombine_s32( vget_low_s32( tr_hv.val[1] ), vget_low_s32( tr_o.val[1] ) ) ); |
| 2021 | int16x8_t c_hvoo_br_8 = vorrq_s16( vshrq_n_s16( c_hvoo_br_6, 4 ), vshlq_n_s16( c_hvoo_br_6, 2 ) ); |
| 2022 | int16x4_t c_hvox_g_8 = vorr_s16( vshr_n_s16( c_hvox_g_7, 6 ), vshl_n_s16( c_hvox_g_7, 1 ) ); |
| 2023 | |
| 2024 | uint64_t error = 0; |
| 2025 | if( mode != ModePlanar && useHeuristics ) |
| 2026 | { |
| 2027 | int16x4_t rec_gxbr_o = vext_s16( c_hvox_g_8, vget_high_s16( c_hvoo_br_8 ), 3 ); |
| 2028 | |
| 2029 | rec_gxbr_o = vadd_s16( vshl_n_s16( rec_gxbr_o, 2 ), vdup_n_s16( 2 ) ); |
| 2030 | int16x8_t rec_ro_wide = vdupq_lane_s16( rec_gxbr_o, 3 ); |
| 2031 | int16x8_t rec_go_wide = vdupq_lane_s16( rec_gxbr_o, 0 ); |
| 2032 | int16x8_t rec_bo_wide = vdupq_lane_s16( rec_gxbr_o, 1 ); |
| 2033 | |
| 2034 | int16x4_t br_hv2 = vsub_s16( vget_low_s16( c_hvoo_br_8 ), vget_high_s16( c_hvoo_br_8 ) ); |
| 2035 | int16x4_t gg_hv2 = vsub_s16( c_hvox_g_8, vdup_lane_s16( c_hvox_g_8, 2 ) ); |
| 2036 | |
| 2037 | int16x8_t scaleh_lo = { 0, 0, 0, 0, 1, 1, 1, 1 }; |
| 2038 | int16x8_t scaleh_hi = { 2, 2, 2, 2, 3, 3, 3, 3 }; |
| 2039 | int16x8_t scalev = { 0, 1, 2, 3, 0, 1, 2, 3 }; |
| 2040 | |
| 2041 | int16x8_t rec_r_1 = vmlaq_lane_s16( rec_ro_wide, scalev, br_hv2, 3 ); |
| 2042 | int16x8_t rec_r_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_r_1, scaleh_lo, br_hv2, 2 ), 2 ) ) ); |
| 2043 | int16x8_t rec_r_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_r_1, scaleh_hi, br_hv2, 2 ), 2 ) ) ); |
| 2044 | |
| 2045 | int16x8_t rec_b_1 = vmlaq_lane_s16( rec_bo_wide, scalev, br_hv2, 1 ); |
| 2046 | int16x8_t rec_b_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_b_1, scaleh_lo, br_hv2, 0 ), 2 ) ) ); |
| 2047 | int16x8_t rec_b_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_b_1, scaleh_hi, br_hv2, 0 ), 2 ) ) ); |
| 2048 | |
| 2049 | int16x8_t rec_g_1 = vmlaq_lane_s16( rec_go_wide, scalev, gg_hv2, 1 ); |
| 2050 | int16x8_t rec_g_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_g_1, scaleh_lo, gg_hv2, 0 ), 2 ) ) ); |
| 2051 | int16x8_t rec_g_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_g_1, scaleh_hi, gg_hv2, 0 ), 2 ) ) ); |
| 2052 | |
| 2053 | int16x8_t dif_r_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[2] ) ) ), rec_r_lo ); |
| 2054 | int16x8_t dif_r_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[2] ) ) ), rec_r_hi ); |
| 2055 | |
| 2056 | int16x8_t dif_g_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[1] ) ) ), rec_g_lo ); |
| 2057 | int16x8_t dif_g_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[1] ) ) ), rec_g_hi ); |
| 2058 | |
| 2059 | int16x8_t dif_b_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[0] ) ) ), rec_b_lo ); |
| 2060 | int16x8_t dif_b_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[0] ) ) ), rec_b_hi ); |
| 2061 | |
| 2062 | int16x8_t dif_lo = vmlaq_n_s16( vmlaq_n_s16( vmulq_n_s16( dif_r_lo, 38 ), dif_g_lo, 76 ), dif_b_lo, 14 ); |
| 2063 | int16x8_t dif_hi = vmlaq_n_s16( vmlaq_n_s16( vmulq_n_s16( dif_r_hi, 38 ), dif_g_hi, 76 ), dif_b_hi, 14 ); |
| 2064 | |
| 2065 | int16x4_t tmpDif = vget_low_s16( dif_lo ); |
| 2066 | int32x4_t difsq_0 = vmull_s16( tmpDif, tmpDif ); |
| 2067 | tmpDif = vget_high_s16( dif_lo ); |
| 2068 | int32x4_t difsq_1 = vmull_s16( tmpDif, tmpDif ); |
| 2069 | tmpDif = vget_low_s16( dif_hi ); |
| 2070 | int32x4_t difsq_2 = vmull_s16( tmpDif, tmpDif ); |
| 2071 | tmpDif = vget_high_s16( dif_hi ); |
| 2072 | int32x4_t difsq_3 = vmull_s16( tmpDif, tmpDif ); |
| 2073 | |
| 2074 | uint32x4_t difsq_5 = vaddq_u32( vreinterpretq_u32_s32( difsq_0 ), vreinterpretq_u32_s32( difsq_1 ) ); |
| 2075 | uint32x4_t difsq_6 = vaddq_u32( vreinterpretq_u32_s32( difsq_2 ), vreinterpretq_u32_s32( difsq_3 ) ); |
| 2076 | |
| 2077 | uint64x2_t difsq_7 = vaddl_u32( vget_low_u32( difsq_5 ), vget_high_u32( difsq_5 ) ); |
| 2078 | uint64x2_t difsq_8 = vaddl_u32( vget_low_u32( difsq_6 ), vget_high_u32( difsq_6 ) ); |
| 2079 | |
| 2080 | uint64x2_t difsq_9 = vaddq_u64( difsq_7, difsq_8 ); |
| 2081 | |
| 2082 | #ifdef __aarch64__ |
| 2083 | error = vaddvq_u64( difsq_9 ); |
| 2084 | #else |
| 2085 | error = vgetq_lane_u64( difsq_9, 0 ) + vgetq_lane_u64( difsq_9, 1 ); |
| 2086 | #endif |
| 2087 | } |
| 2088 | |
| 2089 | int32_t coR = c_hvoo_br_6[6]; |
| 2090 | int32_t coG = c_hvox_g_7[2]; |
| 2091 | int32_t coB = c_hvoo_br_6[4]; |
| 2092 | |
| 2093 | int32_t chR = c_hvoo_br_6[2]; |
| 2094 | int32_t chG = c_hvox_g_7[0]; |
| 2095 | int32_t chB = c_hvoo_br_6[0]; |
| 2096 | |
| 2097 | int32_t cvR = c_hvoo_br_6[3]; |
| 2098 | int32_t cvG = c_hvox_g_7[1]; |
| 2099 | int32_t cvB = c_hvoo_br_6[1]; |
| 2100 | |
| 2101 | uint32_t rgbv = cvB | ( cvG << 6 ) | ( cvR << 13 ); |
| 2102 | uint32_t rgbh = chB | ( chG << 6 ) | ( chR << 13 ); |
| 2103 | uint32_t hi = rgbv | ( ( rgbh & 0x1FFF ) << 19 ); |
| 2104 | uint32_t lo = ( chR & 0x1 ) | 0x2 | ( ( chR << 1 ) & 0x7C ); |
| 2105 | lo |= ( ( coB & 0x07 ) << 7 ) | ( ( coB & 0x18 ) << 8 ) | ( ( coB & 0x20 ) << 11 ); |
| 2106 | lo |= ( ( coG & 0x3F ) << 17 ) | ( ( coG & 0x40 ) << 18 ); |
| 2107 | lo |= coR << 25; |
| 2108 | |
| 2109 | const auto idx = ( coR & 0x20 ) | ( ( coG & 0x20 ) >> 1 ) | ( ( coB & 0x1E ) >> 1 ); |
| 2110 | |
| 2111 | lo |= g_flags[idx]; |
| 2112 | |
| 2113 | uint64_t result = static_cast<uint32_t>( _bswap(lo) ); |
| 2114 | result |= static_cast<uint64_t>( static_cast<uint32_t>( _bswap( hi ) ) ) << 32; |
| 2115 | |
| 2116 | return std::make_pair( result, error ); |
| 2117 | } |
| 2118 | |
| 2119 | #endif |
| 2120 | |
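| | // Evaluates the T-/H-mode distance candidates starting at startDist: for each distance it builds |
| | // the four paint colors, assigns every pixel the closest one by luma-weighted difference, and |
| | // keeps the distance and 2-bit indices with the smallest summed squared error. The search stops |
| | // early once the best distance found is two steps behind the current candidate. |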
| 2121 | #ifdef __AVX2__ |
| 2122 | uint32_t calculateErrorTH( bool tMode, uint8_t( colorsRGB444 )[2][3], uint8_t& dist, uint32_t& pixIndices, uint8_t startDist, __m128i r8, __m128i g8, __m128i b8 ) |
| 2123 | #else |
| 2124 | uint32_t calculateErrorTH( bool tMode, uint8_t* src, uint8_t( colorsRGB444 )[2][3], uint8_t& dist, uint32_t& pixIndices, uint8_t startDist ) |
| 2125 | #endif |
| 2126 | { |
| 2127 | uint32_t blockErr = 0, bestBlockErr = MaxError; |
| 2128 | |
| 2129 | uint32_t pixColors; |
| 2130 | uint8_t possibleColors[4][3]; |
| 2131 | uint8_t colors[2][3]; |
| 2132 | |
| 2133 | decompressColor( colorsRGB444, colors ); |
| 2134 | |
| 2135 | #ifdef __AVX2__ |
| 2136 | __m128i reverseMask = _mm_set_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 ); |
| 2137 | #endif |
| 2138 | |
| 2139 | // test distances |
| 2140 | for( uint8_t d = startDist; d < 8; ++d ) |
| 2141 | { |
| 2142 | if( d >= 2 && dist == d - 2 ) break; |
| 2143 | |
| 2144 | blockErr = 0; |
| 2145 | pixColors = 0; |
| 2146 | |
| 2147 | if( tMode ) |
| 2148 | { |
| 2149 | calculatePaintColors59T( d, colors, possibleColors ); |
| 2150 | } |
| 2151 | else |
| 2152 | { |
| 2153 | calculatePaintColors58H( d, colors, possibleColors ); |
| 2154 | } |
| 2155 | |
| 2156 | #ifdef __AVX2__ |
| 2157 | // RGB ordering |
| 2158 | __m128i b8Rev = _mm_shuffle_epi8( b8, reverseMask ); |
| 2159 | __m128i g8Rev = _mm_shuffle_epi8( g8, reverseMask ); |
| 2160 | __m128i r8Rev = _mm_shuffle_epi8( r8, reverseMask ); |
| 2161 | |
| 2162 | // extends the three 128-bit RGB vectors into 256-bit vectors (16-bit lanes) for the error comparisons |
| 2163 | static const __m128i zero = _mm_setzero_si128(); |
| 2164 | __m128i b8Lo = _mm_unpacklo_epi8( b8Rev, zero ); |
| 2165 | __m128i g8Lo = _mm_unpacklo_epi8( g8Rev, zero ); |
| 2166 | __m128i r8Lo = _mm_unpacklo_epi8( r8Rev, zero ); |
| 2167 | __m128i b8Hi = _mm_unpackhi_epi8( b8Rev, zero ); |
| 2168 | __m128i g8Hi = _mm_unpackhi_epi8( g8Rev, zero ); |
| 2169 | __m128i r8Hi = _mm_unpackhi_epi8( r8Rev, zero ); |
| 2170 | |
| 2171 | __m256i b8 = _mm256_set_m128i( b8Hi, b8Lo ); |
| 2172 | __m256i g8 = _mm256_set_m128i( g8Hi, g8Lo ); |
| 2173 | __m256i r8 = _mm256_set_m128i( r8Hi, r8Lo ); |
| 2174 | |
| 2175 | // calculates the differences between the pixel colors and the first palette color |
| 2176 | __m256i diffb = _mm256_abs_epi16( _mm256_sub_epi16( b8, _mm256_set1_epi16( possibleColors[0][B] ) ) ); |
| 2177 | __m256i diffg = _mm256_abs_epi16( _mm256_sub_epi16( g8, _mm256_set1_epi16( possibleColors[0][G] ) ) ); |
| 2178 | __m256i diffr = _mm256_abs_epi16( _mm256_sub_epi16( r8, _mm256_set1_epi16( possibleColors[0][R] ) ) ); |
| 2179 | |
| 2180 | // luma-based error calculations |
| 2181 | static const __m256i bWeight = _mm256_set1_epi16( 14 ); |
| 2182 | static const __m256i gWeight = _mm256_set1_epi16( 76 ); |
| 2183 | static const __m256i rWeight = _mm256_set1_epi16( 38 ); |
| 2184 | |
| 2185 | diffb = _mm256_mullo_epi16( diffb, bWeight ); |
| 2186 | diffg = _mm256_mullo_epi16( diffg, gWeight ); |
| 2187 | diffr = _mm256_mullo_epi16( diffr, rWeight ); |
| 2188 | |
| 2189 | // obtains the error with the current palette color |
| 2190 | __m256i lowestPixErr = _mm256_add_epi16( _mm256_add_epi16( diffb, diffg ), diffr ); |
| 2191 | |
| 2192 | // error calculations with the remaining three palette colors |
| 2193 | static const uint32_t masks[4] = { 0, 0x55555555, 0xAAAAAAAA, 0xFFFFFFFF }; |
| 2194 | for( uint8_t c = 1; c < 4; c++ ) |
| 2195 | { |
| 2196 | __m256i diffb = _mm256_abs_epi16( _mm256_sub_epi16( b8, _mm256_set1_epi16( possibleColors[c][B] ) ) ); |
| 2197 | __m256i diffg = _mm256_abs_epi16( _mm256_sub_epi16( g8, _mm256_set1_epi16( possibleColors[c][G] ) ) ); |
| 2198 | __m256i diffr = _mm256_abs_epi16( _mm256_sub_epi16( r8, _mm256_set1_epi16( possibleColors[c][R] ) ) ); |
| 2199 | |
| 2200 | diffb = _mm256_mullo_epi16( diffb, bWeight ); |
| 2201 | diffg = _mm256_mullo_epi16( diffg, gWeight ); |
| 2202 | diffr = _mm256_mullo_epi16( diffr, rWeight ); |
| 2203 | |
| 2204 | // error comparison with the previous best color |
| 2205 | __m256i pixErrors = _mm256_add_epi16( _mm256_add_epi16( diffb, diffg ), diffr ); |
| 2206 | __m256i minErr = _mm256_min_epu16( lowestPixErr, pixErrors ); |
| 2207 | __m256i cmpRes = _mm256_cmpeq_epi16( pixErrors, minErr ); |
| 2208 | lowestPixErr = minErr; |
| 2209 | |
| 2210 | // update pixel colors |
| 2211 | uint32_t updPixColors = _mm256_movemask_epi8( cmpRes ); |
| 2212 | uint32_t prevPixColors = pixColors & ~updPixColors; |
| 2213 | uint32_t mskPixColors = masks[c] & updPixColors; |
| 2214 | pixColors = prevPixColors | mskPixColors; |
| 2215 | } |
| 2216 | |
| 2217 | // accumulate the block error |
| 2218 | alignas( 32 ) uint16_t pixErr16[16] = { 0, }; |
| 2219 | _mm256_storeu_si256( (__m256i*)pixErr16, lowestPixErr ); |
| 2220 | for( uint8_t p = 0; p < 16; p++ ) |
| 2221 | { |
| 2222 | blockErr += (int)( pixErr16[p] ) * pixErr16[p]; |
| 2223 | } |
| 2224 | #else |
| 2225 | for( size_t y = 0; y < 4; ++y ) |
| 2226 | { |
| 2227 | for( size_t x = 0; x < 4; ++x ) |
| 2228 | { |
| 2229 | uint32_t bestPixErr = MaxError; |
| 2230 | pixColors <<= 2; // Make room for next value |
| 2231 | |
| 2232 | // Loop possible block colors |
| 2233 | for( uint8_t c = 0; c < 4; ++c ) |
| 2234 | { |
| 2235 | int diff[3]; |
| 2236 | diff[R] = src[4 * ( x * 4 + y ) + R] - possibleColors[c][R]; |
| 2237 | diff[G] = src[4 * ( x * 4 + y ) + G] - possibleColors[c][G]; |
| 2238 | diff[B] = src[4 * ( x * 4 + y ) + B] - possibleColors[c][B]; |
| 2239 | |
| 2240 | const uint32_t err = 38 * abs( diff[R] ) + 76 * abs( diff[G] ) + 14 * abs( diff[B] ); |
| 2241 | uint32_t pixErr = err * err; |
| 2242 | |
| 2243 | // Choose best error |
| 2244 | if( pixErr < bestPixErr ) |
| 2245 | { |
| 2246 | bestPixErr = pixErr; |
| 2247 | pixColors ^= ( pixColors & 3 ); // Reset the first two bits |
| 2248 | pixColors |= c; |
| 2249 | } |
| 2250 | } |
| 2251 | blockErr += bestPixErr; |
| 2252 | } |
| 2253 | } |
| 2254 | #endif |
| 2255 | |
| 2256 | if( blockErr < bestBlockErr ) |
| 2257 | { |
| 2258 | bestBlockErr = blockErr; |
| 2259 | dist = d; |
| 2260 | pixIndices = pixColors; |
| 2261 | } |
| 2262 | } |
| 2263 | |
| 2264 | return bestBlockErr; |
| 2265 | } |
| 2266 | |
| 2267 | |
| 2268 | // main T-/H-mode compression function |
| 2269 | #ifdef __AVX2__ |
| 2270 | uint32_t compressBlockTH( uint8_t* src, Luma& l, uint32_t& compressed1, uint32_t& compressed2, bool& tMode, __m128i r8, __m128i g8, __m128i b8 ) |
| 2271 | #else |
| 2272 | uint32_t compressBlockTH( uint8_t *src, Luma& l, uint32_t& compressed1, uint32_t& compressed2, bool &tMode ) |
| 2273 | #endif |
| 2274 | { |
| 2275 | #ifdef __AVX2__ |
| 2276 | alignas( 8 ) uint8_t luma[16] = { 0, }; |
| 2277 | _mm_storeu_si128( (__m128i*)luma, l.luma8 ); |
| 2278 | #elif defined __ARM_NEON && defined __aarch64__ |
| 2279 | alignas( 8 ) uint8_t luma[16] = { 0 }; |
| 2280 | vst1q_u8( luma, l.luma8 ); |
| 2281 | #else |
| 2282 | uint8_t* luma = l.val; |
| 2283 | #endif |
| 2284 | |
| 2285 | uint8_t pixIdx[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; |
| 2286 | |
| 2287 | // 1) sorts the pairs of (luma, pix_idx) |
| 2288 | insertionSort( luma, pixIdx ); |
| 2289 | |
| 2290 | // 2) finds the split point with the minimum combined left+right luma range |
| 2291 | uint8_t minSumRangeIdx = 0; |
| 2292 | uint16_t minSumRangeValue; |
| 2293 | uint16_t sum; |
| 2294 | static const uint8_t diffBonus[15] = {8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 8}; |
| 2295 | const int16_t temp = luma[15] - luma[0]; |
| 2296 | |
| 2297 | minSumRangeValue = luma[15] - luma[1] + diffBonus[0]; |
| 2298 | for( uint8_t i = 1; i < 14; i++ ) |
| 2299 | { |
| 2300 | sum = temp - luma[i+1] + luma[i] + diffBonus[i]; |
| 2301 | if( minSumRangeValue > sum ) |
| 2302 | { |
| 2303 | minSumRangeValue = sum; |
| 2304 | minSumRangeIdx = i; |
| 2305 | } |
| 2306 | } |
| 2307 | |
| 2308 | sum = luma[14] - luma[0] + diffBonus[14]; |
| 2309 | if( minSumRangeValue > sum ) |
| 2310 | { |
| 2311 | minSumRangeValue = sum; |
| 2312 | minSumRangeIdx = 14; |
| 2313 | } |
| 2314 | uint8_t lRange, rRange; |
| 2315 | |
| 2316 | lRange = luma[minSumRangeIdx] - luma[0]; |
| 2317 | rRange = luma[15] - luma[minSumRangeIdx + 1]; |
| 2318 | |
| 2319 | // 3) picks T-mode when one luma range is at least twice the other, otherwise H-mode |
| 2320 | bool swap = false; |
| 2321 | if( lRange >= rRange ) |
| 2322 | { |
| 2323 | if( lRange >= rRange * 2 ) |
| 2324 | { |
| 2325 | swap = true; |
| 2326 | tMode = true; |
| 2327 | } |
| 2328 | } |
| 2329 | else |
| 2330 | { |
| 2331 | if( lRange * 2 <= rRange ) tMode = true; |
| 2332 | } |
| 2333 | // 4) calculates the two base colors |
| 2334 | uint8_t rangeIdx[4] = { pixIdx[0], pixIdx[minSumRangeIdx], pixIdx[minSumRangeIdx + 1], pixIdx[15] }; |
| 2335 | |
| 2336 | uint16_t r[4], g[4], b[4]; |
| 2337 | for( uint8_t i = 0; i < 4; ++i ) |
| 2338 | { |
| 2339 | uint8_t idx = rangeIdx[i] * 4; |
| 2340 | b[i] = src[idx]; |
| 2341 | g[i] = src[idx + 1]; |
| 2342 | r[i] = src[idx + 2]; |
| 2343 | } |
| 2344 | |
| 2345 | uint8_t mid_rgb[2][3]; |
| 2346 | if( swap ) |
| 2347 | { |
| 2348 | mid_rgb[1][B] = ( b[0] + b[1] ) / 2; |
| 2349 | mid_rgb[1][G] = ( g[0] + g[1] ) / 2; |
| 2350 | mid_rgb[1][R] = ( r[0] + r[1] ) / 2; |
| 2351 | |
| 2352 | uint16_t sum_rgb[3] = { 0, 0, 0 }; |
| 2353 | for( uint8_t i = minSumRangeIdx + 1; i < 16; i++ ) |
| 2354 | { |
| 2355 | uint8_t idx = pixIdx[i] * 4; |
| 2356 | sum_rgb[B] += src[idx]; |
| 2357 | sum_rgb[G] += src[idx + 1]; |
| 2358 | sum_rgb[R] += src[idx + 2]; |
| 2359 | } |
| 2360 | const uint8_t temp = 15 - minSumRangeIdx; |
| 2361 | mid_rgb[0][B] = sum_rgb[B] / temp; |
| 2362 | mid_rgb[0][G] = sum_rgb[G] / temp; |
| 2363 | mid_rgb[0][R] = sum_rgb[R] / temp; |
| 2364 | } |
| 2365 | else |
| 2366 | { |
| 2367 | mid_rgb[0][B] = (b[0] + b[1]) / 2; |
| 2368 | mid_rgb[0][G] = (g[0] + g[1]) / 2; |
| 2369 | mid_rgb[0][R] = (r[0] + r[1]) / 2; |
| 2370 | if( tMode ) |
| 2371 | { |
| 2372 | uint16_t sum_rgb[3] = { 0, 0, 0 }; |
| 2373 | for( uint8_t i = minSumRangeIdx + 1; i < 16; i++ ) |
| 2374 | { |
| 2375 | uint8_t idx = pixIdx[i] * 4; |
| 2376 | sum_rgb[B] += src[idx]; |
| 2377 | sum_rgb[G] += src[idx + 1]; |
| 2378 | sum_rgb[R] += src[idx + 2]; |
| 2379 | } |
| 2380 | const uint8_t temp = 15 - minSumRangeIdx; |
| 2381 | mid_rgb[1][B] = sum_rgb[B] / temp; |
| 2382 | mid_rgb[1][G] = sum_rgb[G] / temp; |
| 2383 | mid_rgb[1][R] = sum_rgb[R] / temp; |
| 2384 | } |
| 2385 | else |
| 2386 | { |
| 2387 | mid_rgb[1][B] = (b[2] + b[3]) / 2; |
| 2388 | mid_rgb[1][G] = (g[2] + g[3]) / 2; |
| 2389 | mid_rgb[1][R] = (r[2] + r[3]) / 2; |
| 2390 | } |
| 2391 | } |
| 2392 | |
| 2393 | // 5) sets the start distance index |
| 2394 | uint32_t startDistCandidate; |
| 2395 | uint32_t avgDist; |
| 2396 | if( tMode ) |
| 2397 | { |
| 2398 | if( swap ) |
| 2399 | { |
| 2400 | avgDist = ( b[1] - b[0] + g[1] - g[0] + r[1] - r[0] ) / 6; |
| 2401 | } |
| 2402 | else |
| 2403 | { |
| 2404 | avgDist = ( b[3] - b[2] + g[3] - g[2] + r[3] - r[2] ) / 6; |
| 2405 | } |
| 2406 | } |
| 2407 | else |
| 2408 | { |
| 2409 | avgDist = ( b[1] - b[0] + g[1] - g[0] + r[1] - r[0] + b[3] - b[2] + g[3] - g[2] + r[3] - r[2] ) / 12; |
| 2410 | } |
| 2411 | |
| 2412 | if( avgDist <= 16) |
| 2413 | { |
| 2414 | startDistCandidate = 0; |
| 2415 | } |
| 2416 | else if( avgDist <= 23 ) |
| 2417 | { |
| 2418 | startDistCandidate = 1; |
| 2419 | } |
| 2420 | else if( avgDist <= 32 ) |
| 2421 | { |
| 2422 | startDistCandidate = 2; |
| 2423 | } |
| 2424 | else if( avgDist <= 41 ) |
| 2425 | { |
| 2426 | startDistCandidate = 3; |
| 2427 | } |
| 2428 | else |
| 2429 | { |
| 2430 | startDistCandidate = 4; |
| 2431 | } |
| 2432 | |
| 2433 | uint32_t bestErr = MaxError; |
| 2434 | uint32_t bestPixIndices; |
| 2435 | uint8_t bestDist = 10; |
| 2436 | uint8_t colorsRGB444[2][3]; |
| 2437 | compressColor( mid_rgb, colorsRGB444, tMode ); |
| 2438 | compressed1 = 0; |
| 2439 | |
| 2440 | // 6) finds the best candidate with the lowest error |
| 2441 | #ifdef __AVX2__ |
| 2442 | // Vectorized ver |
| 2443 | bestErr = calculateErrorTH( tMode, colorsRGB444, bestDist, bestPixIndices, startDistCandidate, r8, g8, b8 ); |
| 2444 | #else |
| 2445 | // Scalar ver |
| 2446 | bestErr = calculateErrorTH( tMode, src, colorsRGB444, bestDist, bestPixIndices, startDistCandidate ); |
| 2447 | #endif |
| 2448 | |
| 2449 | // 7) outputs the final T or H block |
| 2450 | if( tMode ) |
| 2451 | { |
| 2452 | // Put the compress params into the compression block |
| 2453 | compressed1 |= ( colorsRGB444[0][R] & 0xf ) << 23; |
| 2454 | compressed1 |= ( colorsRGB444[0][G] & 0xf ) << 19; |
| 2455 | compressed1 |= ( colorsRGB444[0][B] ) << 15; |
| 2456 | compressed1 |= ( colorsRGB444[1][R] ) << 11; |
| 2457 | compressed1 |= ( colorsRGB444[1][G] ) << 7; |
| 2458 | compressed1 |= ( colorsRGB444[1][B] ) << 3; |
| 2459 | compressed1 |= bestDist & 0x7; |
| 2460 | } |
| 2461 | else |
| 2462 | { |
| 2463 | int bestRGB444ColPacked[2]; |
| 2464 | bestRGB444ColPacked[0] = (colorsRGB444[0][R] << 8) + (colorsRGB444[0][G] << 4) + colorsRGB444[0][B]; |
| 2465 | bestRGB444ColPacked[1] = (colorsRGB444[1][R] << 8) + (colorsRGB444[1][G] << 4) + colorsRGB444[1][B]; |
| 2466 | if( ( bestRGB444ColPacked[0] >= bestRGB444ColPacked[1] ) ^ ( ( bestDist & 1 ) == 1 ) ) |
| 2467 | { |
| 2468 | swapColors( colorsRGB444 ); |
| 2469 | // Reshuffle pixel indices to exchange C1 with C3, and C2 with C4 |
| 2470 | bestPixIndices = ( 0x55555555 & bestPixIndices ) | ( 0xaaaaaaaa & ( ~bestPixIndices ) ); |
| 2471 | } |
| 2472 | |
| 2473 | // Put the compress params into the compression block |
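// working H layout (before stuff58bits): R0 bits 25..22, G0 21..18, B0 17..14,
// R1 13..10, G1 9..6, B1 5..2, top two distance bits 1..0; the distance LSB is
// carried by the ordering of the two colours, which the swap above enforces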
| 2474 | compressed1 |= ( colorsRGB444[0][R] & 0xf ) << 22; |
| 2475 | compressed1 |= ( colorsRGB444[0][G] & 0xf ) << 18; |
| 2476 | compressed1 |= ( colorsRGB444[0][B] & 0xf ) << 14; |
| 2477 | compressed1 |= ( colorsRGB444[1][R] & 0xf ) << 10; |
| 2478 | compressed1 |= ( colorsRGB444[1][G] & 0xf ) << 6; |
| 2479 | compressed1 |= ( colorsRGB444[1][B] & 0xf ) << 2; |
| 2480 | compressed1 |= ( bestDist >> 1 ) & 0x3; |
| 2481 | } |
| 2482 | |
| 2483 | bestPixIndices = indexConversion( bestPixIndices ); |
// all 32 pixel-index bits form the second word
compressed2 = bestPixIndices;
| 2486 | |
| 2487 | return bestErr; |
| 2488 | } |
| 2489 | //#endif |
| 2490 | |
| 2491 | template<class T, class S> |
| 2492 | static etcpak_force_inline uint64_t EncodeSelectors( uint64_t d, const T terr[2][8], const S tsel[16][8], const uint32_t* id, const uint64_t value, const uint64_t error) |
| 2493 | { |
| 2494 | size_t tidx[2]; |
| 2495 | tidx[0] = GetLeastError( terr[0], 8 ); |
| 2496 | tidx[1] = GetLeastError( terr[1], 8 ); |
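
// if the best ETC1 selector error cannot beat the error of the candidate block
// passed in (planar or T/H result), keep that candidate unchanged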
| 2497 | |
| 2498 | if ((terr[0][tidx[0]] + terr[1][tidx[1]]) >= error) |
| 2499 | { |
| 2500 | return value; |
| 2501 | } |
| 2502 | |
| 2503 | d |= tidx[0] << 26; |
| 2504 | d |= tidx[1] << 29; |
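// per-pixel 2-bit selectors: LSBs land in bits 32..47, MSBs in bits 48..63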
| 2505 | for( int i=0; i<16; i++ ) |
| 2506 | { |
| 2507 | uint64_t t = tsel[i][tidx[id[i]%2]]; |
| 2508 | d |= ( t & 0x1 ) << ( i + 32 ); |
| 2509 | d |= ( t & 0x2 ) << ( i + 47 ); |
| 2510 | } |
| 2511 | |
| 2512 | return FixByteOrder(d); |
| 2513 | } |
| 2514 | |
| 2515 | } |
| 2516 | |
| 2517 | static etcpak_force_inline uint64_t ProcessRGB( const uint8_t* src ) |
| 2518 | { |
| 2519 | #ifdef __AVX2__ |
| 2520 | uint64_t d = CheckSolid_AVX2( src ); |
| 2521 | if( d != 0 ) return d; |
| 2522 | |
| 2523 | alignas(32) v4i a[8]; |
| 2524 | |
| 2525 | __m128i err0 = PrepareAverages_AVX2( a, src ); |
| 2526 | |
| 2527 | // Get index of minimum error (err0) |
| 2528 | __m128i err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(2, 3, 0, 1)); |
| 2529 | __m128i errMin0 = _mm_min_epu32(err0, err1); |
| 2530 | |
| 2531 | __m128i errMin1 = _mm_shuffle_epi32(errMin0, _MM_SHUFFLE(1, 0, 3, 2)); |
| 2532 | __m128i errMin2 = _mm_min_epu32(errMin1, errMin0); |
| 2533 | |
| 2534 | __m128i errMask = _mm_cmpeq_epi32(errMin2, err0); |
| 2535 | |
| 2536 | uint32_t mask = _mm_movemask_epi8(errMask); |
| 2537 | |
| 2538 | uint32_t idx = _bit_scan_forward(mask) >> 2; |
| 2539 | |
| 2540 | d |= EncodeAverages_AVX2( a, idx ); |
| 2541 | |
| 2542 | alignas(32) uint32_t terr[2][8] = {}; |
| 2543 | alignas(32) uint32_t tsel[8]; |
| 2544 | |
| 2545 | if ((idx == 0) || (idx == 2)) |
| 2546 | { |
| 2547 | FindBestFit_4x2_AVX2( terr, tsel, a, idx * 2, src ); |
| 2548 | } |
| 2549 | else |
| 2550 | { |
| 2551 | FindBestFit_2x4_AVX2( terr, tsel, a, idx * 2, src ); |
| 2552 | } |
| 2553 | |
| 2554 | return EncodeSelectors_AVX2( d, terr, tsel, (idx % 2) == 1 ); |
| 2555 | #else |
| 2556 | uint64_t d = CheckSolid( src ); |
| 2557 | if( d != 0 ) return d; |
| 2558 | |
| 2559 | v4i a[8]; |
| 2560 | unsigned int err[4] = {}; |
| 2561 | PrepareAverages( a, src, err ); |
| 2562 | size_t idx = GetLeastError( err, 4 ); |
| 2563 | EncodeAverages( d, a, idx ); |
| 2564 | |
| 2565 | #if ( defined __SSE4_1__ || defined __ARM_NEON ) && !defined REFERENCE_IMPLEMENTATION |
| 2566 | uint32_t terr[2][8] = {}; |
| 2567 | #else |
| 2568 | uint64_t terr[2][8] = {}; |
| 2569 | #endif |
| 2570 | uint16_t tsel[16][8]; |
| 2571 | auto id = g_id[idx]; |
| 2572 | FindBestFit( terr, tsel, a, id, src ); |
| 2573 | |
| 2574 | return FixByteOrder( EncodeSelectors( d, terr, tsel, id ) ); |
| 2575 | #endif |
| 2576 | } |
| 2577 | |
| 2578 | #ifdef __AVX2__ |
| 2579 | // horizontal min/max functions. https://stackoverflow.com/questions/22256525/horizontal-minimum-and-maximum-using-sse |
// if this fails to build with GCC, set -march in CFLAGS to a specific CPU (e.g. skylake) so the required SSE4.1/BMI intrinsics are enabled.
| 2581 | static inline int16_t hMax( __m128i buffer, uint8_t& idx ) |
| 2582 | { |
| 2583 | __m128i tmp1 = _mm_sub_epi8( _mm_set1_epi8( (char)( 255 ) ), buffer ); |
| 2584 | __m128i tmp2 = _mm_min_epu8( tmp1, _mm_srli_epi16( tmp1, 8 ) ); |
| 2585 | __m128i tmp3 = _mm_minpos_epu16( tmp2 ); |
| 2586 | uint8_t result = 255 - (uint8_t)_mm_cvtsi128_si32( tmp3 ); |
| 2587 | __m128i mask = _mm_cmpeq_epi8( buffer, _mm_set1_epi8( result ) ); |
| 2588 | idx = _tzcnt_u32( _mm_movemask_epi8( mask ) ); |
| 2589 | |
| 2590 | return result; |
| 2591 | } |
| 2592 | #elif defined __ARM_NEON && defined __aarch64__ |
| 2593 | static inline int16_t hMax( uint8x16_t buffer, uint8_t& idx ) |
| 2594 | { |
| 2595 | const uint8_t max = vmaxvq_u8( buffer ); |
| 2596 | const uint16x8_t vmax = vdupq_n_u16( max ); |
| 2597 | uint8x16x2_t buff_wide = vzipq_u8( buffer, uint8x16_t() ); |
| 2598 | uint16x8_t lowbuf16 = vreinterpretq_u16_u8( buff_wide.val[0] ); |
| 2599 | uint16x8_t hibuf16 = vreinterpretq_u16_u8( buff_wide.val[1] ); |
| 2600 | uint16x8_t low_eqmask = vceqq_u16( lowbuf16, vmax ); |
| 2601 | uint16x8_t hi_eqmask = vceqq_u16( hibuf16, vmax ); |
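
// widen the byte-wise equality mask to 16-bit lanes, AND with one-hot position masks and
// sum the lanes with pairwise adds; the lowest set bit of the result is the index of the maximum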
| 2602 | |
| 2603 | static const uint16_t mask_lsb[] = { |
| 2604 | 0x1, 0x2, 0x4, 0x8, |
| 2605 | 0x10, 0x20, 0x40, 0x80 }; |
| 2606 | |
| 2607 | static const uint16_t mask_msb[] = { |
| 2608 | 0x100, 0x200, 0x400, 0x800, |
| 2609 | 0x1000, 0x2000, 0x4000, 0x8000 }; |
| 2610 | |
| 2611 | uint16x8_t vmask_lsb = vld1q_u16( mask_lsb ); |
| 2612 | uint16x8_t vmask_msb = vld1q_u16( mask_msb ); |
| 2613 | uint16x8_t pos_lsb = vandq_u16( vmask_lsb, low_eqmask ); |
| 2614 | uint16x8_t pos_msb = vandq_u16( vmask_msb, hi_eqmask ); |
| 2615 | pos_lsb = vpaddq_u16( pos_lsb, pos_lsb ); |
| 2616 | pos_lsb = vpaddq_u16( pos_lsb, pos_lsb ); |
| 2617 | pos_lsb = vpaddq_u16( pos_lsb, pos_lsb ); |
| 2618 | uint64_t idx_lane1 = vgetq_lane_u64( vreinterpretq_u64_u16( pos_lsb ), 0 ); |
| 2619 | pos_msb = vpaddq_u16( pos_msb, pos_msb ); |
| 2620 | pos_msb = vpaddq_u16( pos_msb, pos_msb ); |
| 2621 | pos_msb = vpaddq_u16( pos_msb, pos_msb ); |
| 2622 | uint32_t idx_lane2 = vgetq_lane_u32( vreinterpretq_u32_u16( pos_msb ), 0 ); |
| 2623 | idx = idx_lane1 != 0 ? __builtin_ctz( idx_lane1 ) : __builtin_ctz( idx_lane2 ); |
| 2624 | |
| 2625 | return max; |
| 2626 | } |
| 2627 | #endif |
| 2628 | |
| 2629 | #ifdef __AVX2__ |
| 2630 | static inline int16_t hMin( __m128i buffer, uint8_t& idx ) |
| 2631 | { |
| 2632 | __m128i tmp2 = _mm_min_epu8( buffer, _mm_srli_epi16( buffer, 8 ) ); |
| 2633 | __m128i tmp3 = _mm_minpos_epu16( tmp2 ); |
| 2634 | uint8_t result = (uint8_t)_mm_cvtsi128_si32( tmp3 ); |
| 2635 | __m128i mask = _mm_cmpeq_epi8( buffer, _mm_set1_epi8( result ) ); |
| 2636 | idx = _tzcnt_u32( _mm_movemask_epi8( mask ) ); |
| 2637 | return result; |
| 2638 | } |
| 2639 | #elif defined __ARM_NEON && defined __aarch64__ |
| 2640 | static inline int16_t hMin( uint8x16_t buffer, uint8_t& idx ) |
| 2641 | { |
| 2642 | const uint8_t min = vminvq_u8( buffer ); |
| 2643 | const uint16x8_t vmin = vdupq_n_u16( min ); |
| 2644 | uint8x16x2_t buff_wide = vzipq_u8( buffer, uint8x16_t() ); |
| 2645 | uint16x8_t lowbuf16 = vreinterpretq_u16_u8( buff_wide.val[0] ); |
| 2646 | uint16x8_t hibuf16 = vreinterpretq_u16_u8( buff_wide.val[1] ); |
| 2647 | uint16x8_t low_eqmask = vceqq_u16( lowbuf16, vmin ); |
| 2648 | uint16x8_t hi_eqmask = vceqq_u16( hibuf16, vmin ); |
| 2649 | |
| 2650 | static const uint16_t mask_lsb[] = { |
| 2651 | 0x1, 0x2, 0x4, 0x8, |
| 2652 | 0x10, 0x20, 0x40, 0x80 }; |
| 2653 | |
| 2654 | static const uint16_t mask_msb[] = { |
| 2655 | 0x100, 0x200, 0x400, 0x800, |
| 2656 | 0x1000, 0x2000, 0x4000, 0x8000 }; |
| 2657 | |
| 2658 | uint16x8_t vmask_lsb = vld1q_u16( mask_lsb ); |
| 2659 | uint16x8_t vmask_msb = vld1q_u16( mask_msb ); |
| 2660 | uint16x8_t pos_lsb = vandq_u16( vmask_lsb, low_eqmask ); |
| 2661 | uint16x8_t pos_msb = vandq_u16( vmask_msb, hi_eqmask ); |
| 2662 | pos_lsb = vpaddq_u16( pos_lsb, pos_lsb ); |
| 2663 | pos_lsb = vpaddq_u16( pos_lsb, pos_lsb ); |
| 2664 | pos_lsb = vpaddq_u16( pos_lsb, pos_lsb ); |
| 2665 | uint64_t idx_lane1 = vgetq_lane_u64( vreinterpretq_u64_u16( pos_lsb ), 0 ); |
| 2666 | pos_msb = vpaddq_u16( pos_msb, pos_msb ); |
| 2667 | pos_msb = vpaddq_u16( pos_msb, pos_msb ); |
| 2668 | pos_msb = vpaddq_u16( pos_msb, pos_msb ); |
| 2669 | uint32_t idx_lane2 = vgetq_lane_u32( vreinterpretq_u32_u16( pos_msb ), 0 ); |
| 2670 | idx = idx_lane1 != 0 ? __builtin_ctz( idx_lane1 ) : __builtin_ctz( idx_lane2 ); |
| 2671 | |
| 2672 | return min; |
| 2673 | } |
| 2674 | #endif |
| 2675 | |
| 2676 | // During search it is not convenient to store the bits the way they are stored in the |
| 2677 | // file format. Hence, after search, it is converted to this format. |
| 2678 | // NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved. |
| 2679 | static inline void stuff59bits( unsigned int thumbT59W1, unsigned int thumbT59W2, unsigned int& thumbTW1, unsigned int& thumbTW2 ) |
| 2680 | { |
| 2681 | // Put bits in twotimer configuration for 59 (red overflows) |
| 2682 | // |
| 2683 | // Go from this bit layout: |
| 2684 | // |
| 2685 | // |63 62 61 60 59|58 57 56 55|54 53 52 51|50 49 48 47|46 45 44 43|42 41 40 39|38 37 36 35|34 33 32| |
| 2686 | // |----empty-----|---red 0---|--green 0--|--blue 0---|---red 1---|--green 1--|--blue 1---|--dist--| |
| 2687 | // |
| 2688 | // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| |
| 2689 | // |----------------------------------------index bits---------------------------------------------| |
| 2690 | // |
| 2691 | // |
| 2692 | // To this: |
| 2693 | // |
| 2694 | // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 |
| 2695 | // ----------------------------------------------------------------------------------------------- |
| 2696 | // |// // //|R0a |//|R0b |G0 |B0 |R1 |G1 |B1 |da |df|db| |
| 2697 | // ----------------------------------------------------------------------------------------------- |
| 2698 | // |
| 2699 | // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| |
| 2700 | // |----------------------------------------index bits---------------------------------------------| |
| 2701 | // |
| 2702 | // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 |
| 2703 | // ----------------------------------------------------------------------------------------------- |
| 2704 | // | base col1 | dcol 2 | base col1 | dcol 2 | base col 1 | dcol 2 | table | table |df|fp| |
| 2705 | // | R1' (5 bits) | dR2 | G1' (5 bits) | dG2 | B1' (5 bits) | dB2 | cw 1 | cw 2 |bt|bt| |
| 2706 | // ------------------------------------------------------------------------------------------------ |
| 2707 | |
| 2708 | uint8_t R0a; |
| 2709 | uint8_t bit, a, b, c, d, bits; |
| 2710 | |
| 2711 | R0a = ( thumbT59W1 >> 25 ) & 0x3; |
| 2712 | |
| 2713 | // Fix middle part |
| 2714 | thumbTW1 = thumbT59W1 << 1; |
| 2715 | // Fix R0a (top two bits of R0) |
| 2716 | thumbTW1 = ( thumbTW1 & ~( 0x3 << 27 ) ) | ( ( R0a & 0x3 ) << 27 ); |
| 2717 | // Fix db (lowest bit of d) |
| 2718 | thumbTW1 = ( thumbTW1 & ~0x1 ) | ( thumbT59W1 & 0x1 ); |
| 2719 | |
| 2720 | // Make sure that red overflows: |
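// (T mode is signalled by making the red channel of the differential base colour overflow;
// the otherwise-unused bits 31..29 and 26 are forced below to guarantee that)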
| 2721 | a = ( thumbTW1 >> 28 ) & 0x1; |
| 2722 | b = ( thumbTW1 >> 27 ) & 0x1; |
| 2723 | c = ( thumbTW1 >> 25 ) & 0x1; |
| 2724 | d = ( thumbTW1 >> 24 ) & 0x1; |
| 2725 | |
// The following abcd bit sequences should be padded with ones: 0111, 1010, 1011, 1101, 1110, 1111
| 2727 | // The following logical expression checks for the presence of any of those: |
| 2728 | bit = ( a & c ) | ( !a & b & c & d ) | ( a & b & !c & d ); |
| 2729 | bits = 0xf * bit; |
| 2730 | thumbTW1 = ( thumbTW1 & ~( 0x7 << 29 ) ) | ( bits & 0x7 ) << 29; |
| 2731 | thumbTW1 = ( thumbTW1 & ~( 0x1 << 26 ) ) | ( !bit & 0x1 ) << 26; |
| 2732 | |
| 2733 | // Set diffbit |
| 2734 | thumbTW1 = ( thumbTW1 & ~0x2 ) | 0x2; |
| 2735 | thumbTW2 = thumbT59W2; |
| 2736 | } |
| 2737 | |
| 2738 | // During search it is not convenient to store the bits the way they are stored in the |
| 2739 | // file format. Hence, after search, it is converted to this format. |
| 2740 | // NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved. |
| 2741 | static inline void stuff58bits( unsigned int thumbH58W1, unsigned int thumbH58W2, unsigned int& thumbHW1, unsigned int& thumbHW2 ) |
| 2742 | { |
| 2743 | // Put bits in twotimer configuration for 58 (red doesn't overflow, green does) |
| 2744 | // |
| 2745 | // Go from this bit layout: |
| 2746 | // |
| 2747 | // |
| 2748 | // |63 62 61 60 59 58|57 56 55 54|53 52 51 50|49 48 47 46|45 44 43 42|41 40 39 38|37 36 35 34|33 32| |
| 2749 | // |-------empty-----|---red 0---|--green 0--|--blue 0---|---red 1---|--green 1--|--blue 1---|d2 d1| |
| 2750 | // |
| 2751 | // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| |
| 2752 | // |---------------------------------------index bits----------------------------------------------| |
| 2753 | // |
| 2754 | // To this: |
| 2755 | // |
| 2756 | // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 |
| 2757 | // ----------------------------------------------------------------------------------------------- |
// |//|R0 |G0 |// // //|G0|B0|//|B0b |R1 |G1 |B1 |d2|df|d1|
| 2759 | // ----------------------------------------------------------------------------------------------- |
| 2760 | // |
| 2761 | // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| |
| 2762 | // |---------------------------------------index bits----------------------------------------------| |
| 2763 | // |
| 2764 | // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 |
| 2765 | // ----------------------------------------------------------------------------------------------- |
| 2766 | // | base col1 | dcol 2 | base col1 | dcol 2 | base col 1 | dcol 2 | table | table |df|fp| |
| 2767 | // | R1' (5 bits) | dR2 | G1' (5 bits) | dG2 | B1' (5 bits) | dB2 | cw 1 | cw 2 |bt|bt| |
| 2768 | // ----------------------------------------------------------------------------------------------- |
| 2769 | // |
| 2770 | // |
| 2771 | // Thus, what we are really doing is going from this bit layout: |
| 2772 | // |
| 2773 | // |
| 2774 | // |63 62 61 60 59 58|57 56 55 54 53 52 51|50 49|48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33|32 | |
| 2775 | // |-------empty-----|part0---------------|part1|part2------------------------------------------|part3| |
| 2776 | // |
| 2777 | // To this: |
| 2778 | // |
| 2779 | // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 |
| 2780 | // --------------------------------------------------------------------------------------------------| |
| 2781 | // |//|part0 |// // //|part1|//|part2 |df|part3| |
| 2782 | // --------------------------------------------------------------------------------------------------| |
| 2783 | |
| 2784 | unsigned int part0, part1, part2, part3; |
| 2785 | uint8_t bit, a, b, c, d, bits; |
| 2786 | |
| 2787 | // move parts |
| 2788 | part0 = ( thumbH58W1 >> 19 ) & 0x7f; |
| 2789 | part1 = ( thumbH58W1 >> 17 ) & 0x3; |
| 2790 | part2 = ( thumbH58W1 >> 1 ) & 0xffff; |
| 2791 | part3 = thumbH58W1 & 0x1; |
| 2792 | thumbHW1 = 0; |
| 2793 | thumbHW1 = ( thumbHW1 & ~( 0x7f << 24 ) ) | ( ( part0 & 0x7f ) << 24 ); |
| 2794 | thumbHW1 = ( thumbHW1 & ~( 0x3 << 19 ) ) | ( ( part1 & 0x3 ) << 19 ); |
| 2795 | thumbHW1 = ( thumbHW1 & ~( 0xffff << 2 ) ) | ( ( part2 & 0xffff ) << 2 ); |
| 2796 | thumbHW1 = ( thumbHW1 & ~0x1 ) | ( part3 & 0x1 ); |
| 2797 | |
| 2798 | // Make sure that red does not overflow: |
| 2799 | bit = ( thumbHW1 >> 30 ) & 0x1; |
| 2800 | thumbHW1 = ( thumbHW1 & ~( 0x1 << 31 ) ) | ( ( !bit & 0x1 ) << 31 ); |
| 2801 | |
| 2802 | // Make sure that green overflows: |
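// (H mode is signalled by the red channel NOT overflowing while the green channel does)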
| 2803 | a = ( thumbHW1 >> 20 ) & 0x1; |
| 2804 | b = ( thumbHW1 >> 19 ) & 0x1; |
| 2805 | c = ( thumbHW1 >> 17 ) & 0x1; |
| 2806 | d = ( thumbHW1 >> 16 ) & 0x1; |
// The following abcd bit sequences should be padded with ones: 0111, 1010, 1011, 1101, 1110, 1111
| 2808 | // The following logical expression checks for the presence of any of those: |
| 2809 | bit = ( a & c ) | ( !a & b & c & d ) | ( a & b & !c & d ); |
| 2810 | bits = 0xf * bit; |
| 2811 | thumbHW1 = ( thumbHW1 & ~( 0x7 << 21 ) ) | ( ( bits & 0x7 ) << 21 ); |
| 2812 | thumbHW1 = ( thumbHW1 & ~( 0x1 << 18 ) ) | ( ( !bit & 0x1 ) << 18 ); |
| 2813 | |
| 2814 | // Set diffbit |
| 2815 | thumbHW1 = ( thumbHW1 & ~0x2 ) | 0x2; |
| 2816 | thumbHW2 = thumbH58W2; |
| 2817 | } |
| 2818 | |
| 2819 | #if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__) |
| 2820 | static etcpak_force_inline Channels GetChannels( const uint8_t* src ) |
| 2821 | { |
| 2822 | Channels ch; |
| 2823 | #ifdef __AVX2__ |
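// deinterleave the 16 interleaved pixels into one 16-byte vector per colour channel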
| 2824 | __m128i d0 = _mm_loadu_si128( ( (__m128i*)src ) + 0 ); |
| 2825 | __m128i d1 = _mm_loadu_si128( ( (__m128i*)src ) + 1 ); |
| 2826 | __m128i d2 = _mm_loadu_si128( ( (__m128i*)src ) + 2 ); |
| 2827 | __m128i d3 = _mm_loadu_si128( ( (__m128i*)src ) + 3 ); |
| 2828 | |
| 2829 | __m128i rgb0 = _mm_shuffle_epi8( d0, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) ); |
| 2830 | __m128i rgb1 = _mm_shuffle_epi8( d1, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) ); |
| 2831 | __m128i rgb2 = _mm_shuffle_epi8( d2, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) ); |
| 2832 | __m128i rgb3 = _mm_shuffle_epi8( d3, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) ); |
| 2833 | |
| 2834 | __m128i rg0 = _mm_unpacklo_epi32( rgb0, rgb1 ); |
| 2835 | __m128i rg1 = _mm_unpacklo_epi32( rgb2, rgb3 ); |
| 2836 | __m128i b0 = _mm_unpackhi_epi32( rgb0, rgb1 ); |
| 2837 | __m128i b1 = _mm_unpackhi_epi32( rgb2, rgb3 ); |
| 2838 | |
| 2839 | // swap channels |
| 2840 | ch.b8 = _mm_unpacklo_epi64( rg0, rg1 ); |
| 2841 | ch.g8 = _mm_unpackhi_epi64( rg0, rg1 ); |
| 2842 | ch.r8 = _mm_unpacklo_epi64( b0, b1 ); |
| 2843 | #elif defined __ARM_NEON && defined __aarch64__ |
| 2844 | //load pixel data into 4 rows |
| 2845 | uint8x16_t px0 = vld1q_u8( src + 0 ); |
| 2846 | uint8x16_t px1 = vld1q_u8( src + 16 ); |
| 2847 | uint8x16_t px2 = vld1q_u8( src + 32 ); |
| 2848 | uint8x16_t px3 = vld1q_u8( src + 48 ); |
| 2849 | |
| 2850 | uint8x16x2_t px0z1 = vzipq_u8( px0, px1 ); |
| 2851 | uint8x16x2_t px2z3 = vzipq_u8( px2, px3 ); |
| 2852 | uint8x16x2_t px01 = vzipq_u8( px0z1.val[0], px0z1.val[1] ); |
| 2853 | uint8x16x2_t rgb01 = vzipq_u8( px01.val[0], px01.val[1] ); |
| 2854 | uint8x16x2_t px23 = vzipq_u8( px2z3.val[0], px2z3.val[1] ); |
| 2855 | uint8x16x2_t rgb23 = vzipq_u8( px23.val[0], px23.val[1] ); |
| 2856 | |
| 2857 | uint8x16_t rr = vreinterpretq_u8_u64( vzip1q_u64( vreinterpretq_u64_u8( rgb01.val[0] ), vreinterpretq_u64_u8( rgb23.val[0] ) ) ); |
| 2858 | uint8x16_t gg = vreinterpretq_u8_u64( vzip2q_u64( vreinterpretq_u64_u8( rgb01.val[0] ), vreinterpretq_u64_u8( rgb23.val[0] ) ) ); |
| 2859 | uint8x16_t bb = vreinterpretq_u8_u64( vzip1q_u64( vreinterpretq_u64_u8( rgb01.val[1] ), vreinterpretq_u64_u8( rgb23.val[1] ) ) ); |
| 2860 | |
| 2861 | uint8x16x2_t red = vzipq_u8( rr, uint8x16_t() ); |
| 2862 | uint8x16x2_t grn = vzipq_u8( gg, uint8x16_t() ); |
| 2863 | uint8x16x2_t blu = vzipq_u8( bb, uint8x16_t() ); |
| 2864 | ch.r = red; |
| 2865 | ch.b = blu; |
| 2866 | ch.g = grn; |
| 2867 | #endif |
| 2868 | return ch; |
| 2869 | } |
| 2870 | #endif |
| 2871 | |
| 2872 | #if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__) |
| 2873 | static etcpak_force_inline void CalculateLuma( Channels& ch, Luma& luma ) |
| 2874 | #else |
| 2875 | static etcpak_force_inline void CalculateLuma( const uint8_t* src, Luma& luma ) |
| 2876 | #endif |
| 2877 | { |
| 2878 | #ifdef __AVX2__ |
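// fixed-point luma: ( 14*b + 76*g + 38*r ) >> 7, i.e. roughly 0.11/0.59/0.30 weights scaled by 128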
| 2879 | __m256i b16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.b8 ), _mm256_set1_epi16( 14 ) ); |
| 2880 | __m256i g16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.g8 ), _mm256_set1_epi16( 76 ) ); |
| 2881 | __m256i r16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.r8 ), _mm256_set1_epi16( 38 ) ); |
| 2882 | |
| 2883 | __m256i luma_16bit = _mm256_add_epi16( _mm256_add_epi16( g16_luma, r16_luma ), b16_luma ); |
| 2884 | __m256i luma_8bit_m256i = _mm256_srli_epi16( luma_16bit, 7 ); |
| 2885 | __m128i luma_8bit_lo = _mm256_extractf128_si256( luma_8bit_m256i, 0 ); |
| 2886 | __m128i luma_8bit_hi = _mm256_extractf128_si256( luma_8bit_m256i, 1 ); |
| 2887 | |
| 2888 | static const __m128i interleaving_mask_lo = _mm_set_epi8( 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0 ); |
| 2889 | static const __m128i interleaving_mask_hi = _mm_set_epi8( 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1 ); |
| 2890 | __m128i luma_8bit_lo_moved = _mm_shuffle_epi8( luma_8bit_lo, interleaving_mask_lo ); |
| 2891 | __m128i luma_8bit_hi_moved = _mm_shuffle_epi8( luma_8bit_hi, interleaving_mask_hi ); |
| 2892 | __m128i luma_8bit = _mm_or_si128( luma_8bit_hi_moved, luma_8bit_lo_moved ); |
| 2893 | luma.luma8 = luma_8bit; |
| 2894 | |
| 2895 | // min/max calculation |
| 2896 | luma.min = hMin( luma_8bit, luma.minIdx ) * 0.00392156f; |
| 2897 | luma.max = hMax( luma_8bit, luma.maxIdx ) * 0.00392156f; |
| 2898 | #elif defined __ARM_NEON && defined __aarch64__ |
//apply the per-channel luma weights
| 2900 | uint16x8_t red0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.r.val[0] ), 14 ); |
| 2901 | uint16x8_t red1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.r.val[1] ), 14 ); |
| 2902 | uint16x8_t grn0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.g.val[0] ), 76 ); |
| 2903 | uint16x8_t grn1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.g.val[1] ), 76 ); |
| 2904 | uint16x8_t blu0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.b.val[0] ), 38 ); |
| 2905 | uint16x8_t blu1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.b.val[1] ), 38 ); |
| 2906 | |
| 2907 | //calculate luma for rows 0,1 and 2,3 |
| 2908 | uint16x8_t lum_r01 = vaddq_u16( vaddq_u16( red0, grn0 ), blu0 ); |
| 2909 | uint16x8_t lum_r23 = vaddq_u16( vaddq_u16( red1, grn1 ), blu1 ); |
| 2910 | |
| 2911 | //divide luma values with right shift and narrow results to 8bit |
| 2912 | uint8x8_t lum_r01_d = vshrn_n_u16( lum_r01, 7 ); |
uint8x8_t lum_r23_d = vshrn_n_u16( lum_r23, 7 );

luma.luma8 = vcombine_u8( lum_r01_d, lum_r23_d );
| 2916 | //find min and max luma value |
| 2917 | luma.min = hMin( luma.luma8, luma.minIdx ) * 0.00392156f; |
| 2918 | luma.max = hMax( luma.luma8, luma.maxIdx ) * 0.00392156f; |
| 2919 | #else |
| 2920 | for( int i = 0; i < 16; ++i ) |
| 2921 | { |
luma.val[i] = ( src[i * 4 + 2] * 76 + src[i * 4 + 1] * 150 + src[i * 4] * 28 ) / 254; // fixed-point luma; the 76/150/28 weights over 254 approximate 0.30/0.59/0.11
| 2923 | if( luma.min > luma.val[i] ) |
| 2924 | { |
| 2925 | luma.min = luma.val[i]; |
| 2926 | luma.minIdx = i; |
| 2927 | } |
| 2928 | if( luma.max < luma.val[i] ) |
| 2929 | { |
| 2930 | luma.max = luma.val[i]; |
| 2931 | luma.maxIdx = i; |
| 2932 | } |
| 2933 | } |
| 2934 | #endif |
| 2935 | } |
| 2936 | |
| 2937 | static etcpak_force_inline uint8_t SelectModeETC2( const Luma& luma ) |
| 2938 | { |
#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__) // these paths already produce luma in [0,1]
| 2940 | const float lumaRange = ( luma.max - luma.min ); |
| 2941 | #else |
| 2942 | const float lumaRange = ( luma.max - luma.min ) * ( 1.f / 255.f ); |
| 2943 | #endif |
| 2944 | // filters a very-low-contrast block |
| 2945 | if( lumaRange <= ecmd_threshold[0] ) |
| 2946 | { |
| 2947 | return ModePlanar; |
| 2948 | } |
| 2949 | // checks whether a pair of the corner pixels in a block has the min/max luma values; |
| 2950 | // if so, the ETC2 planar mode is enabled, and otherwise, the ETC1 mode is enabled |
| 2951 | else if( lumaRange <= ecmd_threshold[1] ) |
| 2952 | { |
| 2953 | #ifdef __AVX2__ |
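// corner_pair packs the diagonal index pairs (0,15), (3,12), (12,3), (15,0); comparing each
// 16-bit lane against the packed (maxIdx, minIdx) bytes flags the case where the min and max
// luma pixels sit on opposite corners of the block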
| 2954 | static const __m128i corner_pair = _mm_set_epi8( 1, 1, 1, 1, 1, 1, 1, 1, 0, 15, 3, 12, 12, 3, 15, 0 ); |
| 2955 | __m128i current_max_min = _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, luma.minIdx, luma.maxIdx, luma.minIdx, luma.maxIdx, luma.minIdx, luma.maxIdx, luma.minIdx, luma.maxIdx ); |
| 2956 | |
| 2957 | __m128i max_min_result = _mm_cmpeq_epi16( corner_pair, current_max_min ); |
| 2958 | |
| 2959 | int mask = _mm_movemask_epi8( max_min_result ); |
| 2960 | if( mask ) |
| 2961 | { |
| 2962 | return ModePlanar; |
| 2963 | } |
| 2964 | #else |
| 2965 | // check whether a pair of the corner pixels in a block has the min/max luma values; |
| 2966 | // if so, the ETC2 planar mode is enabled. |
| 2967 | if( ( luma.minIdx == 0 && luma.maxIdx == 15 ) || |
| 2968 | ( luma.minIdx == 15 && luma.maxIdx == 0 ) || |
| 2969 | ( luma.minIdx == 3 && luma.maxIdx == 12 ) || |
| 2970 | ( luma.minIdx == 12 && luma.maxIdx == 3 ) ) |
| 2971 | { |
| 2972 | return ModePlanar; |
| 2973 | } |
| 2974 | #endif |
| 2975 | } |
| 2976 | // filters a high-contrast block for checking both ETC1 mode and the ETC2 T/H mode |
| 2977 | else if( lumaRange >= ecmd_threshold[2] ) |
| 2978 | { |
| 2979 | return ModeTH; |
| 2980 | } |
| 2981 | return ModeUndecided; |
| 2982 | } |
| 2983 | |
| 2984 | static etcpak_force_inline uint64_t ProcessRGB_ETC2( const uint8_t* src, bool useHeuristics ) |
| 2985 | { |
| 2986 | #ifdef __AVX2__ |
| 2987 | uint64_t d = CheckSolid_AVX2( src ); |
| 2988 | if( d != 0 ) return d; |
| 2989 | #else |
| 2990 | uint64_t d = CheckSolid( src ); |
| 2991 | if (d != 0) return d; |
| 2992 | #endif |
| 2993 | |
| 2994 | uint8_t mode = ModeUndecided; |
| 2995 | Luma luma; |
| 2996 | #ifdef __AVX2__ |
| 2997 | Channels ch = GetChannels( src ); |
| 2998 | if( useHeuristics ) |
| 2999 | { |
| 3000 | CalculateLuma( ch, luma ); |
| 3001 | mode = SelectModeETC2( luma ); |
| 3002 | } |
| 3003 | |
| 3004 | auto plane = Planar_AVX2( ch, mode, useHeuristics ); |
| 3005 | if( useHeuristics && mode == ModePlanar ) return plane.plane; |
| 3006 | |
| 3007 | alignas( 32 ) v4i a[8]; |
| 3008 | __m128i err0 = PrepareAverages_AVX2( a, plane.sum4 ); |
| 3009 | |
| 3010 | // Get index of minimum error (err0) |
| 3011 | __m128i err1 = _mm_shuffle_epi32( err0, _MM_SHUFFLE( 2, 3, 0, 1 ) ); |
| 3012 | __m128i errMin0 = _mm_min_epu32(err0, err1); |
| 3013 | |
| 3014 | __m128i errMin1 = _mm_shuffle_epi32( errMin0, _MM_SHUFFLE( 1, 0, 3, 2 ) ); |
| 3015 | __m128i errMin2 = _mm_min_epu32( errMin1, errMin0 ); |
| 3016 | |
| 3017 | __m128i errMask = _mm_cmpeq_epi32( errMin2, err0 ); |
| 3018 | |
| 3019 | uint32_t mask = _mm_movemask_epi8( errMask ); |
| 3020 | |
| 3021 | size_t idx = _bit_scan_forward( mask ) >> 2; |
| 3022 | |
| 3023 | d = EncodeAverages_AVX2( a, idx ); |
| 3024 | |
| 3025 | alignas(32) uint32_t terr[2][8] = {}; |
| 3026 | alignas(32) uint32_t tsel[8]; |
| 3027 | |
| 3028 | if ((idx == 0) || (idx == 2)) |
| 3029 | { |
| 3030 | FindBestFit_4x2_AVX2( terr, tsel, a, idx * 2, src ); |
| 3031 | } |
| 3032 | else |
| 3033 | { |
| 3034 | FindBestFit_2x4_AVX2( terr, tsel, a, idx * 2, src ); |
| 3035 | } |
| 3036 | |
| 3037 | if( useHeuristics ) |
| 3038 | { |
| 3039 | if( mode == ModeTH ) |
| 3040 | { |
| 3041 | uint64_t result = 0; |
| 3042 | uint64_t error = 0; |
| 3043 | uint32_t compressed[4] = { 0, 0, 0, 0 }; |
| 3044 | bool tMode = false; |
| 3045 | |
| 3046 | error = compressBlockTH( (uint8_t*)src, luma, compressed[0], compressed[1], tMode, ch.r8, ch.g8, ch.b8 ); |
| 3047 | if( tMode ) |
| 3048 | { |
| 3049 | stuff59bits( compressed[0], compressed[1], compressed[2], compressed[3] ); |
| 3050 | } |
| 3051 | else |
| 3052 | { |
| 3053 | stuff58bits( compressed[0], compressed[1], compressed[2], compressed[3] ); |
| 3054 | } |
| 3055 | |
| 3056 | result = (uint32_t)_bswap( compressed[2] ); |
| 3057 | result |= static_cast<uint64_t>( _bswap( compressed[3] ) ) << 32; |
| 3058 | |
| 3059 | plane.plane = result; |
| 3060 | plane.error = error; |
| 3061 | } |
| 3062 | else |
| 3063 | { |
| 3064 | plane.plane = 0; |
| 3065 | plane.error = MaxError; |
| 3066 | } |
| 3067 | } |
| 3068 | |
| 3069 | return EncodeSelectors_AVX2( d, terr, tsel, ( idx % 2 ) == 1, plane.plane, plane.error ); |
| 3070 | #else |
| 3071 | if( useHeuristics ) |
| 3072 | { |
| 3073 | #if defined __ARM_NEON && defined __aarch64__ |
| 3074 | Channels ch = GetChannels( src ); |
| 3075 | CalculateLuma( ch, luma ); |
| 3076 | #else |
| 3077 | CalculateLuma( src, luma ); |
| 3078 | #endif |
| 3079 | mode = SelectModeETC2( luma ); |
| 3080 | } |
| 3081 | #ifdef __ARM_NEON |
| 3082 | auto result = Planar_NEON( src, mode, useHeuristics ); |
| 3083 | #else |
| 3084 | auto result = Planar( src, mode, useHeuristics ); |
| 3085 | #endif |
| 3086 | if( result.second == 0 ) return result.first; |
| 3087 | |
| 3088 | v4i a[8]; |
| 3089 | unsigned int err[4] = {}; |
| 3090 | PrepareAverages( a, src, err ); |
| 3091 | size_t idx = GetLeastError( err, 4 ); |
| 3092 | EncodeAverages( d, a, idx ); |
| 3093 | |
| 3094 | #if ( defined __SSE4_1__ || defined __ARM_NEON ) && !defined REFERENCE_IMPLEMENTATION |
| 3095 | uint32_t terr[2][8] = {}; |
| 3096 | #else |
| 3097 | uint64_t terr[2][8] = {}; |
| 3098 | #endif |
| 3099 | uint16_t tsel[16][8]; |
| 3100 | auto id = g_id[idx]; |
| 3101 | FindBestFit( terr, tsel, a, id, src ); |
| 3102 | |
| 3103 | if( useHeuristics ) |
| 3104 | { |
| 3105 | if( mode == ModeTH ) |
| 3106 | { |
| 3107 | uint32_t compressed[4] = { 0, 0, 0, 0 }; |
| 3108 | bool tMode = false; |
| 3109 | |
| 3110 | result.second = compressBlockTH( (uint8_t*)src, luma, compressed[0], compressed[1], tMode ); |
| 3111 | if( tMode ) |
| 3112 | { |
| 3113 | stuff59bits( compressed[0], compressed[1], compressed[2], compressed[3] ); |
| 3114 | } |
| 3115 | else |
| 3116 | { |
| 3117 | stuff58bits( compressed[0], compressed[1], compressed[2], compressed[3] ); |
| 3118 | } |
| 3119 | |
| 3120 | result.first = (uint32_t)_bswap( compressed[2] ); |
| 3121 | result.first |= static_cast<uint64_t>( _bswap( compressed[3] ) ) << 32; |
| 3122 | } |
| 3123 | else |
| 3124 | { |
| 3125 | result.first = 0; |
| 3126 | result.second = MaxError; |
| 3127 | } |
| 3128 | } |
| 3129 | |
| 3130 | return EncodeSelectors( d, terr, tsel, id, result.first, result.second ); |
| 3131 | #endif |
| 3132 | } |
| 3133 | |
| 3134 | #ifdef __SSE4_1__ |
| 3135 | template<int K> |
| 3136 | static etcpak_force_inline __m128i Widen( const __m128i src ) |
| 3137 | { |
| 3138 | static_assert( K >= 0 && K <= 7, "Index out of range" ); |
| 3139 | |
| 3140 | __m128i tmp; |
| 3141 | switch( K ) |
| 3142 | { |
| 3143 | case 0: |
| 3144 | tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 0, 0, 0, 0 ) ); |
| 3145 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) ); |
| 3146 | case 1: |
| 3147 | tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 1, 1, 1, 1 ) ); |
| 3148 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) ); |
| 3149 | case 2: |
| 3150 | tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
| 3151 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) ); |
| 3152 | case 3: |
| 3153 | tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 3, 3, 3, 3 ) ); |
| 3154 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) ); |
| 3155 | case 4: |
| 3156 | tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 0, 0, 0, 0 ) ); |
| 3157 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
| 3158 | case 5: |
| 3159 | tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 1, 1, 1, 1 ) ); |
| 3160 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
| 3161 | case 6: |
| 3162 | tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
| 3163 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
| 3164 | case 7: |
| 3165 | tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 3, 3, 3, 3 ) ); |
| 3166 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
| 3167 | } |
| 3168 | } |
| 3169 | |
| 3170 | static etcpak_force_inline int GetMulSel( int sel ) |
| 3171 | { |
| 3172 | switch( sel ) |
| 3173 | { |
| 3174 | case 0: |
| 3175 | return 0; |
| 3176 | case 1: |
| 3177 | case 2: |
| 3178 | case 3: |
| 3179 | return 1; |
| 3180 | case 4: |
| 3181 | return 2; |
| 3182 | case 5: |
| 3183 | case 6: |
| 3184 | case 7: |
| 3185 | return 3; |
| 3186 | case 8: |
| 3187 | case 9: |
| 3188 | case 10: |
| 3189 | case 11: |
| 3190 | case 12: |
| 3191 | case 13: |
| 3192 | return 4; |
| 3193 | case 14: |
| 3194 | case 15: |
| 3195 | return 5; |
| 3196 | } |
| 3197 | } |
| 3198 | |
| 3199 | #endif |
| 3200 | |
| 3201 | #ifdef __ARM_NEON |
| 3202 | |
| 3203 | static constexpr etcpak_force_inline int GetMulSel(int sel) |
| 3204 | { |
| 3205 | return ( sel < 1 ) ? 0 : ( sel < 4 ) ? 1 : ( sel < 5 ) ? 2 : ( sel < 8 ) ? 3 : ( sel < 14 ) ? 4 : 5; |
| 3206 | } |
| 3207 | |
| 3208 | static constexpr int ClampConstant( int x, int min, int max ) |
| 3209 | { |
| 3210 | return x < min ? min : x > max ? max : x; |
| 3211 | } |
| 3212 | |
| 3213 | template <int Index> |
| 3214 | etcpak_force_inline static uint16x8_t ErrorProbe_EAC_NEON( uint8x8_t recVal, uint8x16_t alphaBlock ) |
| 3215 | { |
| 3216 | uint8x8_t srcValWide; |
| 3217 | #ifndef __aarch64__ |
| 3218 | if( Index < 8 ) |
| 3219 | srcValWide = vdup_lane_u8( vget_low_u8( alphaBlock ), ClampConstant( Index, 0, 7 ) ); |
| 3220 | else |
| 3221 | srcValWide = vdup_lane_u8( vget_high_u8( alphaBlock ), ClampConstant( Index - 8, 0, 7 ) ); |
| 3222 | #else |
| 3223 | srcValWide = vdup_laneq_u8( alphaBlock, Index ); |
| 3224 | #endif |
| 3225 | |
| 3226 | uint8x8_t deltaVal = vabd_u8( srcValWide, recVal ); |
| 3227 | return vmull_u8( deltaVal, deltaVal ); |
| 3228 | } |
| 3229 | |
| 3230 | etcpak_force_inline static uint16_t MinError_EAC_NEON( uint16x8_t errProbe ) |
| 3231 | { |
| 3232 | #ifndef __aarch64__ |
| 3233 | uint16x4_t tmpErr = vpmin_u16( vget_low_u16( errProbe ), vget_high_u16( errProbe ) ); |
| 3234 | tmpErr = vpmin_u16( tmpErr, tmpErr ); |
| 3235 | return vpmin_u16( tmpErr, tmpErr )[0]; |
| 3236 | #else |
| 3237 | return vminvq_u16( errProbe ); |
| 3238 | #endif |
| 3239 | } |
| 3240 | |
| 3241 | template <int Index> |
| 3242 | etcpak_force_inline static uint64_t MinErrorIndex_EAC_NEON( uint8x8_t recVal, uint8x16_t alphaBlock ) |
| 3243 | { |
| 3244 | uint16x8_t errProbe = ErrorProbe_EAC_NEON<Index>( recVal, alphaBlock ); |
| 3245 | uint16x8_t minErrMask = vceqq_u16( errProbe, vdupq_n_u16( MinError_EAC_NEON( errProbe ) ) ); |
| 3246 | uint64_t idx = __builtin_ctzll( vget_lane_u64( vreinterpret_u64_u8( vqmovn_u16( minErrMask ) ), 0 ) ); |
| 3247 | idx >>= 3; |
| 3248 | idx <<= 45 - Index * 3; |
| 3249 | |
| 3250 | return idx; |
| 3251 | } |
| 3252 | |
| 3253 | template <int Index> |
| 3254 | etcpak_force_inline static int16x8_t WidenMultiplier_EAC_NEON( int16x8_t multipliers ) |
| 3255 | { |
| 3256 | constexpr int Lane = GetMulSel( Index ); |
| 3257 | #ifndef __aarch64__ |
| 3258 | if( Lane < 4 ) |
| 3259 | return vdupq_lane_s16( vget_low_s16( multipliers ), ClampConstant( Lane, 0, 3 ) ); |
| 3260 | else |
| 3261 | return vdupq_lane_s16( vget_high_s16( multipliers ), ClampConstant( Lane - 4, 0, 3 ) ); |
| 3262 | #else |
| 3263 | return vdupq_laneq_s16( multipliers, Lane ); |
| 3264 | #endif |
| 3265 | } |
| 3266 | |
| 3267 | #endif |
| 3268 | |
| 3269 | static etcpak_force_inline uint64_t ProcessAlpha_ETC2( const uint8_t* src ) |
| 3270 | { |
| 3271 | #if defined __SSE4_1__ |
| 3272 | // Check solid |
| 3273 | __m128i s = _mm_loadu_si128( (__m128i*)src ); |
| 3274 | __m128i solidCmp = _mm_set1_epi8( src[0] ); |
| 3275 | __m128i cmpRes = _mm_cmpeq_epi8( s, solidCmp ); |
| 3276 | if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) ) |
| 3277 | { |
| 3278 | return src[0]; |
| 3279 | } |
| 3280 | |
| 3281 | // Calculate min, max |
| 3282 | __m128i s1 = _mm_shuffle_epi32( s, _MM_SHUFFLE( 2, 3, 0, 1 ) ); |
| 3283 | __m128i max1 = _mm_max_epu8( s, s1 ); |
| 3284 | __m128i min1 = _mm_min_epu8( s, s1 ); |
| 3285 | __m128i smax2 = _mm_shuffle_epi32( max1, _MM_SHUFFLE( 0, 0, 2, 2 ) ); |
| 3286 | __m128i smin2 = _mm_shuffle_epi32( min1, _MM_SHUFFLE( 0, 0, 2, 2 ) ); |
| 3287 | __m128i max2 = _mm_max_epu8( max1, smax2 ); |
| 3288 | __m128i min2 = _mm_min_epu8( min1, smin2 ); |
| 3289 | __m128i smax3 = _mm_alignr_epi8( max2, max2, 2 ); |
| 3290 | __m128i smin3 = _mm_alignr_epi8( min2, min2, 2 ); |
| 3291 | __m128i max3 = _mm_max_epu8( max2, smax3 ); |
| 3292 | __m128i min3 = _mm_min_epu8( min2, smin3 ); |
| 3293 | __m128i smax4 = _mm_alignr_epi8( max3, max3, 1 ); |
| 3294 | __m128i smin4 = _mm_alignr_epi8( min3, min3, 1 ); |
| 3295 | __m128i max = _mm_max_epu8( max3, smax4 ); |
| 3296 | __m128i min = _mm_min_epu8( min3, smin4 ); |
| 3297 | __m128i max16 = _mm_unpacklo_epi8( max, _mm_setzero_si128() ); |
| 3298 | __m128i min16 = _mm_unpacklo_epi8( min, _mm_setzero_si128() ); |
| 3299 | |
| 3300 | // src range, mid |
| 3301 | __m128i srcRange = _mm_sub_epi16( max16, min16 ); |
| 3302 | __m128i srcRangeHalf = _mm_srli_epi16( srcRange, 1 ); |
| 3303 | __m128i srcMid = _mm_add_epi16( min16, srcRangeHalf ); |
| 3304 | |
// multiplier candidates: one 16-bit lane per g_alphaRange_SIMD entry, derived from the block's alpha range
| 3306 | __m128i mul1 = _mm_mulhi_epi16( srcRange, g_alphaRange_SIMD ); |
| 3307 | __m128i mul = _mm_add_epi16( mul1, _mm_set1_epi16( 1 ) ); |
| 3308 | |
| 3309 | // wide source |
| 3310 | __m128i s16_1 = _mm_shuffle_epi32( s, _MM_SHUFFLE( 3, 2, 3, 2 ) ); |
| 3311 | __m128i s16[2] = { _mm_unpacklo_epi8( s, _mm_setzero_si128() ), _mm_unpacklo_epi8( s16_1, _mm_setzero_si128() ) }; |
| 3312 | |
| 3313 | __m128i sr[16] = { |
| 3314 | Widen<0>( s16[0] ), |
| 3315 | Widen<1>( s16[0] ), |
| 3316 | Widen<2>( s16[0] ), |
| 3317 | Widen<3>( s16[0] ), |
| 3318 | Widen<4>( s16[0] ), |
| 3319 | Widen<5>( s16[0] ), |
| 3320 | Widen<6>( s16[0] ), |
| 3321 | Widen<7>( s16[0] ), |
| 3322 | Widen<0>( s16[1] ), |
| 3323 | Widen<1>( s16[1] ), |
| 3324 | Widen<2>( s16[1] ), |
| 3325 | Widen<3>( s16[1] ), |
| 3326 | Widen<4>( s16[1] ), |
| 3327 | Widen<5>( s16[1] ), |
| 3328 | Widen<6>( s16[1] ), |
| 3329 | Widen<7>( s16[1] ) |
| 3330 | }; |
| 3331 | |
| 3332 | #ifdef __AVX2__ |
| 3333 | __m256i srcRangeWide = _mm256_broadcastsi128_si256( srcRange ); |
| 3334 | __m256i srcMidWide = _mm256_broadcastsi128_si256( srcMid ); |
| 3335 | |
| 3336 | __m256i mulWide1 = _mm256_mulhi_epi16( srcRangeWide, g_alphaRange_AVX ); |
| 3337 | __m256i mulWide = _mm256_add_epi16( mulWide1, _mm256_set1_epi16( 1 ) ); |
| 3338 | |
| 3339 | __m256i modMul[8] = { |
| 3340 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[0] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[0] ) ) ), _mm256_setzero_si256() ), |
| 3341 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[1] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[1] ) ) ), _mm256_setzero_si256() ), |
| 3342 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[2] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[2] ) ) ), _mm256_setzero_si256() ), |
| 3343 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[3] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[3] ) ) ), _mm256_setzero_si256() ), |
| 3344 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[4] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[4] ) ) ), _mm256_setzero_si256() ), |
| 3345 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[5] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[5] ) ) ), _mm256_setzero_si256() ), |
| 3346 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[6] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[6] ) ) ), _mm256_setzero_si256() ), |
| 3347 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[7] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[7] ) ) ), _mm256_setzero_si256() ), |
| 3348 | }; |
| 3349 | |
| 3350 | // find selector |
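// one 16-bit lane per range/table candidate: for every pixel take the squared distance to the
// closest of the 8 reconstruction values in modMul[], and accumulate the per-candidate totals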
| 3351 | __m256i mulErr = _mm256_setzero_si256(); |
| 3352 | for( int j=0; j<16; j++ ) |
| 3353 | { |
| 3354 | __m256i s16Wide = _mm256_broadcastsi128_si256( sr[j] ); |
| 3355 | __m256i err1, err2; |
| 3356 | |
| 3357 | err1 = _mm256_sub_epi16( s16Wide, modMul[0] ); |
| 3358 | __m256i localErr = _mm256_mullo_epi16( err1, err1 ); |
| 3359 | |
| 3360 | err1 = _mm256_sub_epi16( s16Wide, modMul[1] ); |
| 3361 | err2 = _mm256_mullo_epi16( err1, err1 ); |
| 3362 | localErr = _mm256_min_epu16( localErr, err2 ); |
| 3363 | |
| 3364 | err1 = _mm256_sub_epi16( s16Wide, modMul[2] ); |
| 3365 | err2 = _mm256_mullo_epi16( err1, err1 ); |
| 3366 | localErr = _mm256_min_epu16( localErr, err2 ); |
| 3367 | |
| 3368 | err1 = _mm256_sub_epi16( s16Wide, modMul[3] ); |
| 3369 | err2 = _mm256_mullo_epi16( err1, err1 ); |
| 3370 | localErr = _mm256_min_epu16( localErr, err2 ); |
| 3371 | |
| 3372 | err1 = _mm256_sub_epi16( s16Wide, modMul[4] ); |
| 3373 | err2 = _mm256_mullo_epi16( err1, err1 ); |
| 3374 | localErr = _mm256_min_epu16( localErr, err2 ); |
| 3375 | |
| 3376 | err1 = _mm256_sub_epi16( s16Wide, modMul[5] ); |
| 3377 | err2 = _mm256_mullo_epi16( err1, err1 ); |
| 3378 | localErr = _mm256_min_epu16( localErr, err2 ); |
| 3379 | |
| 3380 | err1 = _mm256_sub_epi16( s16Wide, modMul[6] ); |
| 3381 | err2 = _mm256_mullo_epi16( err1, err1 ); |
| 3382 | localErr = _mm256_min_epu16( localErr, err2 ); |
| 3383 | |
| 3384 | err1 = _mm256_sub_epi16( s16Wide, modMul[7] ); |
| 3385 | err2 = _mm256_mullo_epi16( err1, err1 ); |
| 3386 | localErr = _mm256_min_epu16( localErr, err2 ); |
| 3387 | |
// note: _mm256_adds_epu16 saturates at 0xFFFF rather than wrapping; since we only compare totals to find the smallest error, clamping is harmless
| 3389 | mulErr = _mm256_adds_epu16( mulErr, localErr ); |
| 3390 | } |
| 3391 | uint64_t minPos1 = _mm_cvtsi128_si64( _mm_minpos_epu16( _mm256_castsi256_si128( mulErr ) ) ); |
| 3392 | uint64_t minPos2 = _mm_cvtsi128_si64( _mm_minpos_epu16( _mm256_extracti128_si256( mulErr, 1 ) ) ); |
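// each minpos result packs (min error, lane index); pick the half with the smaller error, offsetting the upper half by 8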
| 3393 | int sel = ( ( minPos1 & 0xFFFF ) < ( minPos2 & 0xFFFF ) ) ? ( minPos1 >> 16 ) : ( 8 + ( minPos2 >> 16 ) ); |
| 3394 | |
| 3395 | __m128i recVal16; |
| 3396 | switch( sel ) |
| 3397 | { |
| 3398 | case 0: |
| 3399 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ) ), _mm_setzero_si128() ); |
| 3400 | break; |
| 3401 | case 1: |
| 3402 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ) ), _mm_setzero_si128() ); |
| 3403 | break; |
| 3404 | case 2: |
| 3405 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ) ), _mm_setzero_si128() ); |
| 3406 | break; |
| 3407 | case 3: |
| 3408 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ) ), _mm_setzero_si128() ); |
| 3409 | break; |
| 3410 | case 4: |
| 3411 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ) ), _mm_setzero_si128() ); |
| 3412 | break; |
| 3413 | case 5: |
| 3414 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ) ), _mm_setzero_si128() ); |
| 3415 | break; |
| 3416 | case 6: |
| 3417 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ) ), _mm_setzero_si128() ); |
| 3418 | break; |
| 3419 | case 7: |
| 3420 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ) ), _mm_setzero_si128() ); |
| 3421 | break; |
| 3422 | case 8: |
| 3423 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ) ), _mm_setzero_si128() ); |
| 3424 | break; |
| 3425 | case 9: |
| 3426 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ) ), _mm_setzero_si128() ); |
| 3427 | break; |
| 3428 | case 10: |
| 3429 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ) ), _mm_setzero_si128() ); |
| 3430 | break; |
| 3431 | case 11: |
| 3432 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ) ), _mm_setzero_si128() ); |
| 3433 | break; |
| 3434 | case 12: |
| 3435 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ) ), _mm_setzero_si128() ); |
| 3436 | break; |
| 3437 | case 13: |
| 3438 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ) ), _mm_setzero_si128() ); |
| 3439 | break; |
| 3440 | case 14: |
| 3441 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ) ), _mm_setzero_si128() ); |
| 3442 | break; |
| 3443 | case 15: |
| 3444 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ) ), _mm_setzero_si128() ); |
| 3445 | break; |
| 3446 | default: |
| 3447 | assert( false ); |
| 3448 | break; |
| 3449 | } |
| 3450 | #else |
| 3451 | // wide multiplier |
| 3452 | __m128i rangeMul[16] = { |
| 3453 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ) ), _mm_setzero_si128() ), |
| 3454 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ) ), _mm_setzero_si128() ), |
| 3455 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ) ), _mm_setzero_si128() ), |
| 3456 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ) ), _mm_setzero_si128() ), |
| 3457 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ) ), _mm_setzero_si128() ), |
| 3458 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ) ), _mm_setzero_si128() ), |
| 3459 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ) ), _mm_setzero_si128() ), |
| 3460 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ) ), _mm_setzero_si128() ), |
| 3461 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ) ), _mm_setzero_si128() ), |
| 3462 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ) ), _mm_setzero_si128() ), |
| 3463 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ) ), _mm_setzero_si128() ), |
| 3464 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ) ), _mm_setzero_si128() ), |
| 3465 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ) ), _mm_setzero_si128() ), |
| 3466 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ) ), _mm_setzero_si128() ), |
| 3467 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ) ), _mm_setzero_si128() ), |
| 3468 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ) ), _mm_setzero_si128() ) |
| 3469 | }; |
| 3470 | |
| 3471 | // find selector |
| 3472 | int err = std::numeric_limits<int>::max(); |
| 3473 | int sel; |
| 3474 | for( int r=0; r<16; r++ ) |
| 3475 | { |
| 3476 | __m128i err1, err2, minerr; |
| 3477 | __m128i recVal16 = rangeMul[r]; |
| 3478 | int rangeErr; |
| 3479 | |
| 3480 | err1 = _mm_sub_epi16( sr[0], recVal16 ); |
| 3481 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3482 | minerr = _mm_minpos_epu16( err2 ); |
| 3483 | rangeErr = _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3484 | |
| 3485 | err1 = _mm_sub_epi16( sr[1], recVal16 ); |
| 3486 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3487 | minerr = _mm_minpos_epu16( err2 ); |
| 3488 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3489 | |
| 3490 | err1 = _mm_sub_epi16( sr[2], recVal16 ); |
| 3491 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3492 | minerr = _mm_minpos_epu16( err2 ); |
| 3493 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3494 | |
| 3495 | err1 = _mm_sub_epi16( sr[3], recVal16 ); |
| 3496 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3497 | minerr = _mm_minpos_epu16( err2 ); |
| 3498 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3499 | |
| 3500 | err1 = _mm_sub_epi16( sr[4], recVal16 ); |
| 3501 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3502 | minerr = _mm_minpos_epu16( err2 ); |
| 3503 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3504 | |
| 3505 | err1 = _mm_sub_epi16( sr[5], recVal16 ); |
| 3506 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3507 | minerr = _mm_minpos_epu16( err2 ); |
| 3508 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3509 | |
| 3510 | err1 = _mm_sub_epi16( sr[6], recVal16 ); |
| 3511 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3512 | minerr = _mm_minpos_epu16( err2 ); |
| 3513 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3514 | |
| 3515 | err1 = _mm_sub_epi16( sr[7], recVal16 ); |
| 3516 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3517 | minerr = _mm_minpos_epu16( err2 ); |
| 3518 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3519 | |
| 3520 | err1 = _mm_sub_epi16( sr[8], recVal16 ); |
| 3521 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3522 | minerr = _mm_minpos_epu16( err2 ); |
| 3523 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3524 | |
| 3525 | err1 = _mm_sub_epi16( sr[9], recVal16 ); |
| 3526 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3527 | minerr = _mm_minpos_epu16( err2 ); |
| 3528 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3529 | |
| 3530 | err1 = _mm_sub_epi16( sr[10], recVal16 ); |
| 3531 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3532 | minerr = _mm_minpos_epu16( err2 ); |
| 3533 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3534 | |
| 3535 | err1 = _mm_sub_epi16( sr[11], recVal16 ); |
| 3536 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3537 | minerr = _mm_minpos_epu16( err2 ); |
| 3538 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3539 | |
| 3540 | err1 = _mm_sub_epi16( sr[12], recVal16 ); |
| 3541 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3542 | minerr = _mm_minpos_epu16( err2 ); |
| 3543 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3544 | |
| 3545 | err1 = _mm_sub_epi16( sr[13], recVal16 ); |
| 3546 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3547 | minerr = _mm_minpos_epu16( err2 ); |
| 3548 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3549 | |
| 3550 | err1 = _mm_sub_epi16( sr[14], recVal16 ); |
| 3551 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3552 | minerr = _mm_minpos_epu16( err2 ); |
| 3553 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3554 | |
| 3555 | err1 = _mm_sub_epi16( sr[15], recVal16 ); |
| 3556 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3557 | minerr = _mm_minpos_epu16( err2 ); |
| 3558 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
| 3559 | |
| 3560 | if( rangeErr < err ) |
| 3561 | { |
| 3562 | err = rangeErr; |
| 3563 | sel = r; |
| 3564 | if( err == 0 ) break; |
| 3565 | } |
| 3566 | } |
| 3567 | |
| 3568 | __m128i recVal16 = rangeMul[sel]; |
| 3569 | #endif |
| 3570 | |
| 3571 | // find indices |
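// per pixel, minpos against the chosen reconstruction vector yields the 3-bit selector,
// packed MSB-first (pixel 0 occupies bits 47..45, pixel 15 bits 2..0)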
| 3572 | __m128i err1, err2, minerr; |
| 3573 | uint64_t idx = 0, tmp; |
| 3574 | |
| 3575 | err1 = _mm_sub_epi16( sr[0], recVal16 ); |
| 3576 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3577 | minerr = _mm_minpos_epu16( err2 ); |
| 3578 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3579 | idx |= ( tmp >> 16 ) << 15*3; |
| 3580 | |
| 3581 | err1 = _mm_sub_epi16( sr[1], recVal16 ); |
| 3582 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3583 | minerr = _mm_minpos_epu16( err2 ); |
| 3584 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3585 | idx |= ( tmp >> 16 ) << 14*3; |
| 3586 | |
| 3587 | err1 = _mm_sub_epi16( sr[2], recVal16 ); |
| 3588 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3589 | minerr = _mm_minpos_epu16( err2 ); |
| 3590 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3591 | idx |= ( tmp >> 16 ) << 13*3; |
| 3592 | |
| 3593 | err1 = _mm_sub_epi16( sr[3], recVal16 ); |
| 3594 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3595 | minerr = _mm_minpos_epu16( err2 ); |
| 3596 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3597 | idx |= ( tmp >> 16 ) << 12*3; |
| 3598 | |
| 3599 | err1 = _mm_sub_epi16( sr[4], recVal16 ); |
| 3600 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3601 | minerr = _mm_minpos_epu16( err2 ); |
| 3602 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3603 | idx |= ( tmp >> 16 ) << 11*3; |
| 3604 | |
| 3605 | err1 = _mm_sub_epi16( sr[5], recVal16 ); |
| 3606 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3607 | minerr = _mm_minpos_epu16( err2 ); |
| 3608 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3609 | idx |= ( tmp >> 16 ) << 10*3; |
| 3610 | |
| 3611 | err1 = _mm_sub_epi16( sr[6], recVal16 ); |
| 3612 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3613 | minerr = _mm_minpos_epu16( err2 ); |
| 3614 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3615 | idx |= ( tmp >> 16 ) << 9*3; |
| 3616 | |
| 3617 | err1 = _mm_sub_epi16( sr[7], recVal16 ); |
| 3618 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3619 | minerr = _mm_minpos_epu16( err2 ); |
| 3620 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3621 | idx |= ( tmp >> 16 ) << 8*3; |
| 3622 | |
| 3623 | err1 = _mm_sub_epi16( sr[8], recVal16 ); |
| 3624 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3625 | minerr = _mm_minpos_epu16( err2 ); |
| 3626 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3627 | idx |= ( tmp >> 16 ) << 7*3; |
| 3628 | |
| 3629 | err1 = _mm_sub_epi16( sr[9], recVal16 ); |
| 3630 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3631 | minerr = _mm_minpos_epu16( err2 ); |
| 3632 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3633 | idx |= ( tmp >> 16 ) << 6*3; |
| 3634 | |
| 3635 | err1 = _mm_sub_epi16( sr[10], recVal16 ); |
| 3636 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3637 | minerr = _mm_minpos_epu16( err2 ); |
| 3638 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3639 | idx |= ( tmp >> 16 ) << 5*3; |
| 3640 | |
| 3641 | err1 = _mm_sub_epi16( sr[11], recVal16 ); |
| 3642 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3643 | minerr = _mm_minpos_epu16( err2 ); |
| 3644 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3645 | idx |= ( tmp >> 16 ) << 4*3; |
| 3646 | |
| 3647 | err1 = _mm_sub_epi16( sr[12], recVal16 ); |
| 3648 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3649 | minerr = _mm_minpos_epu16( err2 ); |
| 3650 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3651 | idx |= ( tmp >> 16 ) << 3*3; |
| 3652 | |
| 3653 | err1 = _mm_sub_epi16( sr[13], recVal16 ); |
| 3654 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3655 | minerr = _mm_minpos_epu16( err2 ); |
| 3656 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3657 | idx |= ( tmp >> 16 ) << 2*3; |
| 3658 | |
| 3659 | err1 = _mm_sub_epi16( sr[14], recVal16 ); |
| 3660 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3661 | minerr = _mm_minpos_epu16( err2 ); |
| 3662 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3663 | idx |= ( tmp >> 16 ) << 1*3; |
| 3664 | |
| 3665 | err1 = _mm_sub_epi16( sr[15], recVal16 ); |
| 3666 | err2 = _mm_mullo_epi16( err1, err1 ); |
| 3667 | minerr = _mm_minpos_epu16( err2 ); |
| 3668 | tmp = _mm_cvtsi128_si64( minerr ); |
| 3669 | idx |= ( tmp >> 16 ) << 0*3; |
| 3670 | |
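| | // assemble the 64-bit EAC alpha block: base value (the midpoint) in bits 56..63, multiplier in bits 52..55, |
| | // table selector in bits 48..51, sixteen 3-bit pixel indices in bits 0..47; the block is stored big-endian, |
| | // hence the final byte swap |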
| 3671 | uint16_t rm[8]; |
| 3672 | _mm_storeu_si128( (__m128i*)rm, mul ); |
| 3673 | uint16_t sm = (uint16_t)_mm_cvtsi128_si64( srcMid ); |
| 3674 | |
| 3675 | uint64_t d = ( uint64_t( sm ) << 56 ) | |
| 3676 | ( uint64_t( rm[GetMulSel( sel )] ) << 52 ) | |
| 3677 | ( uint64_t( sel ) << 48 ) | |
| 3678 | idx; |
| 3679 | |
| 3680 | return _bswap64( d ); |
| 3681 | #elif defined __ARM_NEON |
| 3682 | |
| 3683 | int16x8_t srcMidWide, multipliers; |
| 3684 | int srcMid; |
| 3685 | uint8x16_t srcAlphaBlock = vld1q_u8( src ); |
| 3686 | { |
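| | // fast path for blocks with a single alpha value |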
| 3687 | uint8_t ref = src[0]; |
| 3688 | uint8x16_t a0 = vdupq_n_u8( ref ); |
| 3689 | uint8x16_t r = vceqq_u8( srcAlphaBlock, a0 ); |
| 3690 | int64x2_t m = vreinterpretq_s64_u8( r ); |
| 3691 | if( m[0] == -1 && m[1] == -1 ) |
| 3692 | return ref; |
| 3693 | |
| 3694 | // compute source alpha range, midpoint, and the per-table multipliers |
| 3695 | #ifdef __aarch64__ |
| 3696 | uint8_t min = vminvq_u8( srcAlphaBlock ); |
| 3697 | uint8_t max = vmaxvq_u8( srcAlphaBlock ); |
| 3698 | uint8_t srcRange = max - min; |
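| | // fixed-point multipliers: vqdmulhq gives (g_alphaRange * srcRange * 2) >> 16, so one more shift right |
| | // plus 1 matches the scalar formula ((srcRange * g_alphaRange[r]) >> 16) + 1 used in the fallback path |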
| 3699 | multipliers = vqaddq_s16( vshrq_n_s16( vqdmulhq_n_s16( g_alphaRange_NEON, srcRange ), 1 ), vdupq_n_s16( 1 ) ); |
| 3700 | srcMid = min + srcRange / 2; |
| 3701 | srcMidWide = vdupq_n_s16( srcMid ); |
| 3702 | #else |
| 3703 | uint8x8_t vmin = vpmin_u8( vget_low_u8( srcAlphaBlock ), vget_high_u8( srcAlphaBlock ) ); |
| 3704 | vmin = vpmin_u8( vmin, vmin ); |
| 3705 | vmin = vpmin_u8( vmin, vmin ); |
| 3706 | vmin = vpmin_u8( vmin, vmin ); |
| 3707 | uint8x8_t vmax = vpmax_u8( vget_low_u8( srcAlphaBlock ), vget_high_u8( srcAlphaBlock ) ); |
| 3708 | vmax = vpmax_u8( vmax, vmax ); |
| 3709 | vmax = vpmax_u8( vmax, vmax ); |
| 3710 | vmax = vpmax_u8( vmax, vmax ); |
| 3711 | |
| 3712 | int16x8_t srcRangeWide = vreinterpretq_s16_u16( vsubl_u8( vmax, vmin ) ); |
| 3713 | multipliers = vqaddq_s16( vshrq_n_s16( vqdmulhq_s16( g_alphaRange_NEON, srcRangeWide ), 1 ), vdupq_n_s16( 1 ) ); |
| 3714 | srcMidWide = vsraq_n_s16( vreinterpretq_s16_u16( vmovl_u8( vmin ) ), srcRangeWide, 1 ); |
| 3715 | srcMid = vgetq_lane_s16( srcMidWide, 0 ); |
| 3716 | #endif |
| 3717 | } |
| 3718 | |
| 3719 | // calculate reconstructed values |
| 3720 | #define EAC_APPLY_16X( m ) m( 0 ) m( 1 ) m( 2 ) m( 3 ) m( 4 ) m( 5 ) m( 6 ) m( 7 ) m( 8 ) m( 9 ) m( 10 ) m( 11 ) m( 12 ) m( 13 ) m( 14 ) m( 15 ) |
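| | // EAC_APPLY_16X expands its argument macro for indices 0..15, fully unrolling the per-row/per-pixel work below |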
| 3721 | |
| 3722 | #define EAC_RECONSTRUCT_VALUE( n ) vqmovun_s16( vmlaq_s16( srcMidWide, g_alpha_NEON[n], WidenMultiplier_EAC_NEON<n>( multipliers ) ) ), |
| 3723 | uint8x8_t recVals[16] = { EAC_APPLY_16X( EAC_RECONSTRUCT_VALUE ) }; |
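| | // recVals[r] holds the eight reconstructed (saturated to u8) alpha candidates for table row r |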
| 3724 | |
| 3725 | // find selector |
| 3726 | int err = std::numeric_limits<int>::max(); |
| 3727 | int sel = 0; |
| 3728 | for( int r = 0; r < 16; r++ ) |
| 3729 | { |
| 3730 | uint8x8_t recVal = recVals[r]; |
| 3731 | |
| 3732 | int rangeErr = 0; |
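| | // for every source pixel, accumulate the smallest squared error against this row's eight candidates |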
| 3733 | #define EAC_ACCUMULATE_ERROR( n ) rangeErr += MinError_EAC_NEON( ErrorProbe_EAC_NEON<n>( recVal, srcAlphaBlock ) ); |
| 3734 | EAC_APPLY_16X( EAC_ACCUMULATE_ERROR ) |
| 3735 | |
| 3736 | if( rangeErr < err ) |
| 3737 | { |
| 3738 | err = rangeErr; |
| 3739 | sel = r; |
| 3740 | if( err == 0 ) break; |
| 3741 | } |
| 3742 | } |
| 3743 | |
| 3744 | // combine results |
| 3745 | uint64_t d = ( uint64_t( srcMid ) << 56 ) | |
| 3746 | ( uint64_t( multipliers[GetMulSel( sel )] ) << 52 ) | |
| 3747 | ( uint64_t( sel ) << 48); |
| 3748 | |
| 3749 | // generate indices |
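| | // each MinErrorIndex_EAC_NEON<n> ORs pixel n's best 3-bit selector into its slot of the index field |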
| 3750 | uint8x8_t recVal = recVals[sel]; |
| 3751 | #define EAC_INSERT_INDEX(n) d |= MinErrorIndex_EAC_NEON<n>( recVal, srcAlphaBlock ); |
| 3752 | EAC_APPLY_16X( EAC_INSERT_INDEX ) |
| 3753 | |
| 3754 | return _bswap64( d ); |
| 3755 | |
| 3756 | #undef EAC_APPLY_16X |
| 3757 | #undef EAC_INSERT_INDEX |
| 3758 | #undef EAC_ACCUMULATE_ERROR |
| 3759 | #undef EAC_RECONSTRUCT_VALUE |
| 3760 | |
| 3761 | #else |
| 3762 | { |
| 3763 | bool solid = true; |
| 3764 | const uint8_t* ptr = src + 1; |
| 3765 | const uint8_t ref = *src; |
| 3766 | for( int i=1; i<16; i++ ) |
| 3767 | { |
| 3768 | if( ref != *ptr++ ) |
| 3769 | { |
| 3770 | solid = false; |
| 3771 | break; |
| 3772 | } |
| 3773 | } |
| 3774 | if( solid ) |
| 3775 | { |
| 3776 | return ref; |
| 3777 | } |
| 3778 | } |
| 3779 | |
| 3780 | uint8_t min = src[0]; |
| 3781 | uint8_t max = src[0]; |
| 3782 | for( int i=1; i<16; i++ ) |
| 3783 | { |
| 3784 | if( min > src[i] ) min = src[i]; |
| 3785 | else if( max < src[i] ) max = src[i]; |
| 3786 | } |
| 3787 | int srcRange = max - min; |
| 3788 | int srcMid = min + srcRange / 2; |
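| | // scalar fallback: try all 16 multiplier/table rows, reconstructing each pixel around the midpoint and |
| | // keeping the row with the lowest total squared error; buf[r][i] caches pixel i's best index for row r |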
| 3789 | |
| 3790 | uint8_t buf[16][16]; |
| 3791 | int err = std::numeric_limits<int>::max(); |
| 3792 | int sel = 0; |
| 3793 | int selmul = 0; |
| 3794 | for( int r=0; r<16; r++ ) |
| 3795 | { |
| 3796 | int mul = ( ( srcRange * g_alphaRange[r] ) >> 16 ) + 1; |
| 3797 | |
| 3798 | int rangeErr = 0; |
| 3799 | for( int i=0; i<16; i++ ) |
| 3800 | { |
| 3801 | const auto srcVal = src[i]; |
| 3802 | |
| 3803 | int idx = 0; |
| 3804 | const auto modVal = g_alpha[r][0] * mul; |
| 3805 | const auto recVal = clampu8( srcMid + modVal ); |
| 3806 | int localErr = sq( srcVal - recVal ); |
| 3807 | |
| 3808 | if( localErr != 0 ) |
| 3809 | { |
| 3810 | for( int j=1; j<8; j++ ) |
| 3811 | { |
| 3812 | const auto modVal = g_alpha[r][j] * mul; |
| 3813 | const auto recVal = clampu8( srcMid + modVal ); |
| 3814 | const auto errProbe = sq( srcVal - recVal ); |
| 3815 | if( errProbe < localErr ) |
| 3816 | { |
| 3817 | localErr = errProbe; |
| 3818 | idx = j; |
| 3819 | } |
| 3820 | } |
| 3821 | } |
| 3822 | |
| 3823 | buf[r][i] = idx; |
| 3824 | rangeErr += localErr; |
| 3825 | } |
| 3826 | |
| 3827 | if( rangeErr < err ) |
| 3828 | { |
| 3829 | err = rangeErr; |
| 3830 | sel = r; |
| 3831 | selmul = mul; |
| 3832 | if( err == 0 ) break; |
| 3833 | } |
| 3834 | } |
| 3835 | |
| 3836 | uint64_t d = ( uint64_t( srcMid ) << 56 ) | |
| 3837 | ( uint64_t( selmul ) << 52 ) | |
| 3838 | ( uint64_t( sel ) << 48 ); |
| 3839 | |
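| | // emit the sixteen 3-bit selectors MSB-first: pixel 0 at bits 45..47 down to pixel 15 at bits 0..2 |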
| 3840 | int offset = 45; |
| 3841 | auto ptr = buf[sel]; |
| 3842 | for( int i=0; i<16; i++ ) |
| 3843 | { |
| 3844 | d |= uint64_t( *ptr++ ) << offset; |
| 3845 | offset -= 3; |
| 3846 | } |
| 3847 | |
| 3848 | return _bswap64( d ); |
| 3849 | #endif |
| 3850 | } |
| 3851 | |
| 3852 | |
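| | // Compresses the alpha channel of an RGBA image with ETC1: each 4x4 block's alpha bytes are replicated |
| | // into gray RGB values and run through the regular ETC1 block encoder. |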
| 3853 | void CompressEtc1Alpha( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ) |
| 3854 | { |
| 3855 | int w = 0; |
| 3856 | uint32_t buf[4*4]; |
| 3857 | do |
| 3858 | { |
| 3859 | #ifdef __SSE4_1__ |
| 3860 | __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); |
| 3861 | __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); |
| 3862 | __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); |
| 3863 | __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); |
| 3864 | |
| 3865 | _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); |
| 3866 | |
| 3867 | __m128i c0 = _mm_castps_si128( px0 ); |
| 3868 | __m128i c1 = _mm_castps_si128( px1 ); |
| 3869 | __m128i c2 = _mm_castps_si128( px2 ); |
| 3870 | __m128i c3 = _mm_castps_si128( px3 ); |
| 3871 | |
| 3872 | __m128i mask = _mm_setr_epi32( 0x03030303, 0x07070707, 0x0b0b0b0b, 0x0f0f0f0f ); |
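| | // replicate each pixel's alpha byte across its 32-bit lane, turning the block into grayscale input for the encoder |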
| 3873 | __m128i p0 = _mm_shuffle_epi8( c0, mask ); |
| 3874 | __m128i p1 = _mm_shuffle_epi8( c1, mask ); |
| 3875 | __m128i p2 = _mm_shuffle_epi8( c2, mask ); |
| 3876 | __m128i p3 = _mm_shuffle_epi8( c3, mask ); |
| 3877 | |
| 3878 | _mm_store_si128( (__m128i*)(buf + 0), p0 ); |
| 3879 | _mm_store_si128( (__m128i*)(buf + 4), p1 ); |
| 3880 | _mm_store_si128( (__m128i*)(buf + 8), p2 ); |
| 3881 | _mm_store_si128( (__m128i*)(buf + 12), p3 ); |
| 3882 | |
| 3883 | src += 4; |
| 3884 | #else |
| 3885 | auto ptr = buf; |
| 3886 | for( int x=0; x<4; x++ ) |
| 3887 | { |
| 3888 | unsigned int a = *src >> 24; |
| 3889 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
| 3890 | src += width; |
| 3891 | a = *src >> 24; |
| 3892 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
| 3893 | src += width; |
| 3894 | a = *src >> 24; |
| 3895 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
| 3896 | src += width; |
| 3897 | a = *src >> 24; |
| 3898 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
| 3899 | src -= width * 3 - 1; |
| 3900 | } |
| 3901 | #endif |
| 3902 | if( ++w == width/4 ) |
| 3903 | { |
| 3904 | src += width * 3; |
| 3905 | w = 0; |
| 3906 | } |
| 3907 | *dst++ = ProcessRGB( (uint8_t*)buf ); |
| 3908 | } |
| 3909 | while( --blocks ); |
| 3910 | } |
| 3911 | |
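| | // Alpha-as-gray variant for ETC2: identical block gathering to CompressEtc1Alpha, but blocks are encoded with ProcessRGB_ETC2. |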
| 3912 | void CompressEtc2Alpha( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics ) |
| 3913 | { |
| 3914 | int w = 0; |
| 3915 | uint32_t buf[4*4]; |
| 3916 | do |
| 3917 | { |
| 3918 | #ifdef __SSE4_1__ |
| 3919 | __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); |
| 3920 | __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); |
| 3921 | __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); |
| 3922 | __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); |
| 3923 | |
| 3924 | _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); |
| 3925 | |
| 3926 | __m128i c0 = _mm_castps_si128( px0 ); |
| 3927 | __m128i c1 = _mm_castps_si128( px1 ); |
| 3928 | __m128i c2 = _mm_castps_si128( px2 ); |
| 3929 | __m128i c3 = _mm_castps_si128( px3 ); |
| 3930 | |
| 3931 | __m128i mask = _mm_setr_epi32( 0x03030303, 0x07070707, 0x0b0b0b0b, 0x0f0f0f0f ); |
| 3932 | __m128i p0 = _mm_shuffle_epi8( c0, mask ); |
| 3933 | __m128i p1 = _mm_shuffle_epi8( c1, mask ); |
| 3934 | __m128i p2 = _mm_shuffle_epi8( c2, mask ); |
| 3935 | __m128i p3 = _mm_shuffle_epi8( c3, mask ); |
| 3936 | |
| 3937 | _mm_store_si128( (__m128i*)(buf + 0), p0 ); |
| 3938 | _mm_store_si128( (__m128i*)(buf + 4), p1 ); |
| 3939 | _mm_store_si128( (__m128i*)(buf + 8), p2 ); |
| 3940 | _mm_store_si128( (__m128i*)(buf + 12), p3 ); |
| 3941 | |
| 3942 | src += 4; |
| 3943 | #else |
| 3944 | auto ptr = buf; |
| 3945 | for( int x=0; x<4; x++ ) |
| 3946 | { |
| 3947 | unsigned int a = *src >> 24; |
| 3948 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
| 3949 | src += width; |
| 3950 | a = *src >> 24; |
| 3951 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
| 3952 | src += width; |
| 3953 | a = *src >> 24; |
| 3954 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
| 3955 | src += width; |
| 3956 | a = *src >> 24; |
| 3957 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
| 3958 | src -= width * 3 - 1; |
| 3959 | } |
| 3960 | #endif |
| 3961 | if( ++w == width/4 ) |
| 3962 | { |
| 3963 | src += width * 3; |
| 3964 | w = 0; |
| 3965 | } |
| 3966 | *dst++ = ProcessRGB_ETC2( (uint8_t*)buf, useHeuristics ); |
| 3967 | } |
| 3968 | while( --blocks ); |
| 3969 | } |
| 3970 | |
| 3973 | |
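| | // The compressors below walk the image in 4x4 blocks, gathering each block column-major into a local |
| | // buffer (a transpose on the SSE4.1 path) before handing it to the per-block encoder; width is the image |
| | // width in pixels and is assumed to be a multiple of 4. |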
| 3974 | void CompressEtc1Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ) |
| 3975 | { |
| 3976 | int w = 0; |
| 3977 | uint32_t buf[4*4]; |
| 3978 | do |
| 3979 | { |
| 3980 | #ifdef __SSE4_1__ |
| 3981 | __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); |
| 3982 | __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); |
| 3983 | __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); |
| 3984 | __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); |
| 3985 | |
| 3986 | _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); |
| 3987 | |
| 3988 | _mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) ); |
| 3989 | _mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) ); |
| 3990 | _mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) ); |
| 3991 | _mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) ); |
| 3992 | |
| 3993 | src += 4; |
| 3994 | #else |
| 3995 | auto ptr = buf; |
| 3996 | for( int x=0; x<4; x++ ) |
| 3997 | { |
| 3998 | *ptr++ = *src; |
| 3999 | src += width; |
| 4000 | *ptr++ = *src; |
| 4001 | src += width; |
| 4002 | *ptr++ = *src; |
| 4003 | src += width; |
| 4004 | *ptr++ = *src; |
| 4005 | src -= width * 3 - 1; |
| 4006 | } |
| 4007 | #endif |
| 4008 | if( ++w == width/4 ) |
| 4009 | { |
| 4010 | src += width * 3; |
| 4011 | w = 0; |
| 4012 | } |
| 4013 | *dst++ = ProcessRGB( (uint8_t*)buf ); |
| 4014 | } |
| 4015 | while( --blocks ); |
| 4016 | } |
| 4017 | |
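| | // Same gathering as CompressEtc1Rgb, but each block is dithered in place (DitherAvx2 on AVX2, Dither otherwise) before ETC1 encoding. |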
| 4018 | void CompressEtc1RgbDither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ) |
| 4019 | { |
| 4020 | int w = 0; |
| 4021 | uint32_t buf[4*4]; |
| 4022 | do |
| 4023 | { |
| 4024 | #ifdef __SSE4_1__ |
| 4025 | __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); |
| 4026 | __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); |
| 4027 | __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); |
| 4028 | __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); |
| 4029 | |
| 4030 | _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); |
| 4031 | |
| 4032 | # ifdef __AVX2__ |
| 4033 | DitherAvx2( (uint8_t*)buf, _mm_castps_si128( px0 ), _mm_castps_si128( px1 ), _mm_castps_si128( px2 ), _mm_castps_si128( px3 ) ); |
| 4034 | # else |
| 4035 | _mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) ); |
| 4036 | _mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) ); |
| 4037 | _mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) ); |
| 4038 | _mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) ); |
| 4039 | |
| 4040 | Dither( (uint8_t*)buf ); |
| 4041 | # endif |
| 4042 | |
| 4043 | src += 4; |
| 4044 | #else |
| 4045 | auto ptr = buf; |
| 4046 | for( int x=0; x<4; x++ ) |
| 4047 | { |
| 4048 | *ptr++ = *src; |
| 4049 | src += width; |
| 4050 | *ptr++ = *src; |
| 4051 | src += width; |
| 4052 | *ptr++ = *src; |
| 4053 | src += width; |
| 4054 | *ptr++ = *src; |
| 4055 | src -= width * 3 - 1; |
| 4056 | } |
| 4057 | #endif |
| 4058 | if( ++w == width/4 ) |
| 4059 | { |
| 4060 | src += width * 3; |
| 4061 | w = 0; |
| 4062 | } |
| 4063 | *dst++ = ProcessRGB( (uint8_t*)buf ); |
| 4064 | } |
| 4065 | while( --blocks ); |
| 4066 | } |
| 4067 | |
| 4068 | void CompressEtc2Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics ) |
| 4069 | { |
| 4070 | int w = 0; |
| 4071 | uint32_t buf[4*4]; |
| 4072 | do |
| 4073 | { |
| 4074 | #ifdef __SSE4_1__ |
| 4075 | __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); |
| 4076 | __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); |
| 4077 | __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); |
| 4078 | __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); |
| 4079 | |
| 4080 | _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); |
| 4081 | |
| 4082 | _mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) ); |
| 4083 | _mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) ); |
| 4084 | _mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) ); |
| 4085 | _mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) ); |
| 4086 | |
| 4087 | src += 4; |
| 4088 | #else |
| 4089 | auto ptr = buf; |
| 4090 | for( int x=0; x<4; x++ ) |
| 4091 | { |
| 4092 | *ptr++ = *src; |
| 4093 | src += width; |
| 4094 | *ptr++ = *src; |
| 4095 | src += width; |
| 4096 | *ptr++ = *src; |
| 4097 | src += width; |
| 4098 | *ptr++ = *src; |
| 4099 | src -= width * 3 - 1; |
| 4100 | } |
| 4101 | #endif |
| 4102 | if( ++w == width/4 ) |
| 4103 | { |
| 4104 | src += width * 3; |
| 4105 | w = 0; |
| 4106 | } |
| 4107 | *dst++ = ProcessRGB_ETC2( (uint8_t*)buf, useHeuristics ); |
| 4108 | } |
| 4109 | while( --blocks ); |
| 4110 | } |
| 4111 | |
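| | // Interleaved RGBA output: every 4x4 block emits two 64-bit words, the EAC alpha block followed by the ETC2 color block. |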
| 4112 | void CompressEtc2Rgba( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics ) |
| 4113 | { |
| 4114 | int w = 0; |
| 4115 | uint32_t rgba[4*4]; |
| 4116 | uint8_t alpha[4*4]; |
| 4117 | do |
| 4118 | { |
| 4119 | #ifdef __SSE4_1__ |
| 4120 | __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); |
| 4121 | __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); |
| 4122 | __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); |
| 4123 | __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); |
| 4124 | |
| 4125 | _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); |
| 4126 | |
| 4127 | __m128i c0 = _mm_castps_si128( px0 ); |
| 4128 | __m128i c1 = _mm_castps_si128( px1 ); |
| 4129 | __m128i c2 = _mm_castps_si128( px2 ); |
| 4130 | __m128i c3 = _mm_castps_si128( px3 ); |
| 4131 | |
| 4132 | _mm_store_si128( (__m128i*)(rgba + 0), c0 ); |
| 4133 | _mm_store_si128( (__m128i*)(rgba + 4), c1 ); |
| 4134 | _mm_store_si128( (__m128i*)(rgba + 8), c2 ); |
| 4135 | _mm_store_si128( (__m128i*)(rgba + 12), c3 ); |
| 4136 | |
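| | // gather the alpha bytes: each shuffle extracts bytes 3,7,11,15 of one column register into a different |
| | // 32-bit lane (control bytes with the high bit set zero the rest), so OR-ing the four results forms the |
| | // 16-byte alpha block |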
| 4137 | __m128i mask = _mm_setr_epi32( 0x0f0b0703, -1, -1, -1 ); |
| 4138 | |
| 4139 | __m128i a0 = _mm_shuffle_epi8( c0, mask ); |
| 4140 | __m128i a1 = _mm_shuffle_epi8( c1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) ); |
| 4141 | __m128i a2 = _mm_shuffle_epi8( c2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) ); |
| 4142 | __m128i a3 = _mm_shuffle_epi8( c3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) ); |
| 4143 | |
| 4144 | __m128i s0 = _mm_or_si128( a0, a1 ); |
| 4145 | __m128i s1 = _mm_or_si128( a2, a3 ); |
| 4146 | __m128i s2 = _mm_or_si128( s0, s1 ); |
| 4147 | |
| 4148 | _mm_store_si128( (__m128i*)alpha, s2 ); |
| 4149 | |
| 4150 | src += 4; |
| 4151 | #else |
| 4152 | auto ptr = rgba; |
| 4153 | auto ptr8 = alpha; |
| 4154 | for( int x=0; x<4; x++ ) |
| 4155 | { |
| 4156 | auto v = *src; |
| 4157 | *ptr++ = v; |
| 4158 | *ptr8++ = v >> 24; |
| 4159 | src += width; |
| 4160 | v = *src; |
| 4161 | *ptr++ = v; |
| 4162 | *ptr8++ = v >> 24; |
| 4163 | src += width; |
| 4164 | v = *src; |
| 4165 | *ptr++ = v; |
| 4166 | *ptr8++ = v >> 24; |
| 4167 | src += width; |
| 4168 | v = *src; |
| 4169 | *ptr++ = v; |
| 4170 | *ptr8++ = v >> 24; |
| 4171 | src -= width * 3 - 1; |
| 4172 | } |
| 4173 | #endif |
| 4174 | if( ++w == width/4 ) |
| 4175 | { |
| 4176 | src += width * 3; |
| 4177 | w = 0; |
| 4178 | } |
| 4179 | *dst++ = ProcessAlpha_ETC2( alpha ); |
| 4180 | *dst++ = ProcessRGB_ETC2( (uint8_t*)rgba, useHeuristics ); |
| 4181 | } |
| 4182 | while( --blocks ); |
| 4183 | } |
| 4184 | |