1 | #include <array> |
2 | #include <string.h> |
3 | #include <limits> |
4 | #ifdef __ARM_NEON |
5 | # include <arm_neon.h> |
6 | #endif |
7 | |
8 | #include "Dither.hpp" |
9 | #include "ForceInline.hpp" |
10 | #include "Math.hpp" |
11 | #include "ProcessCommon.hpp" |
12 | #include "ProcessRGB.hpp" |
13 | #include "Tables.hpp" |
14 | #include "Vector.hpp" |
15 | #if defined __SSE4_1__ || defined __AVX2__ || defined _MSC_VER |
16 | # ifdef _MSC_VER |
17 | # include <intrin.h> |
18 | # include <Windows.h> |
19 | # define _bswap(x) _byteswap_ulong(x) |
20 | # define _bswap64(x) _byteswap_uint64(x) |
21 | # else |
22 | # include <x86intrin.h> |
23 | # endif |
24 | #endif |
25 | |
26 | #ifndef _bswap |
27 | # define _bswap(x) __builtin_bswap32(x) |
28 | # define _bswap64(x) __builtin_bswap64(x) |
29 | #endif |
30 | |
31 | static const uint32_t MaxError = 1065369600; // ((38+76+14) * 255)^2 |
32 | // common T-/H-mode table |
33 | static uint8_t tableTH[8] = { 3, 6, 11, 16, 23, 32, 41, 64 }; |
34 | |
35 | // thresholds for the early compression-mode decision scheme |
36 | // default: 0.03, 0.09, and 0.38 |
37 | float ecmd_threshold[3] = { 0.03f, 0.09f, 0.38f }; |
38 | |
39 | static const uint8_t ModeUndecided = 0; |
40 | static const uint8_t ModePlanar = 0x1; |
41 | static const uint8_t ModeTH = 0x2; |
42 | |
43 | const unsigned int R = 2; |
44 | const unsigned int G = 1; |
45 | const unsigned int B = 0; |
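// Channel indices for the B, G, R component order used by the color triplets in this file.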
46 | |
47 | struct Luma |
48 | { |
49 | #ifdef __AVX2__ |
50 | float max, min; |
51 | uint8_t minIdx = 255, maxIdx = 255; |
52 | __m128i luma8; |
53 | #elif defined __ARM_NEON && defined __aarch64__ |
54 | float max, min; |
55 | uint8_t minIdx = 255, maxIdx = 255; |
56 | uint8x16_t luma8; |
57 | #else |
58 | uint8_t max = 0, min = 255, maxIdx = 0, minIdx = 0; |
59 | uint8_t val[16]; |
60 | #endif |
61 | }; |
62 | |
63 | #ifdef __AVX2__ |
64 | struct Plane |
65 | { |
66 | uint64_t plane; |
67 | uint64_t error; |
68 | __m256i sum4; |
69 | }; |
70 | #endif |
71 | |
72 | #if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__) |
73 | struct Channels |
74 | { |
75 | #ifdef __AVX2__ |
76 | __m128i r8, g8, b8; |
77 | #elif defined __ARM_NEON && defined __aarch64__ |
78 | uint8x16x2_t r, g, b; |
79 | #endif |
80 | }; |
81 | #endif |
82 | |
83 | namespace |
84 | { |
85 | static etcpak_force_inline uint8_t clamp( uint8_t min, int16_t val, uint8_t max ) |
86 | { |
87 | return val < min ? min : ( val > max ? max : val ); |
88 | } |
89 | |
90 | static etcpak_force_inline uint8_t clampMin( uint8_t min, int16_t val ) |
91 | { |
92 | return val < min ? min : val; |
93 | } |
94 | |
95 | static etcpak_force_inline uint8_t clampMax( int16_t val, uint8_t max ) |
96 | { |
97 | return val > max ? max : val; |
98 | } |
99 | |
100 | // slightly faster than std::sort |
101 | static void insertionSort( uint8_t* arr1, uint8_t* arr2 ) |
102 | { |
103 | for( uint8_t i = 1; i < 16; ++i ) |
104 | { |
105 | uint8_t value = arr1[i]; |
106 | uint8_t hole = i; |
107 | |
108 | for( ; hole > 0 && value < arr1[hole - 1]; --hole ) |
109 | { |
110 | arr1[hole] = arr1[hole - 1]; |
111 | arr2[hole] = arr2[hole - 1]; |
112 | } |
113 | arr1[hole] = value; |
114 | arr2[hole] = i; |
115 | } |
116 | } |
117 | |
// Converts indices from the |a0|a1|e0|e1|i0|i1|m0|m1|b0|b1|f0|f1|j0|j1|n0|n1|c0|c1|g0|g1|k0|k1|o0|o1|d0|d1|h0|h1|l0|l1|p0|p1| layout previously used by the T- and H-modes
// into |p0|o0|n0|m0|l0|k0|j0|i0|h0|g0|f0|e0|d0|c0|b0|a0|p1|o1|n1|m1|l1|k1|j1|i1|h1|g1|f1|e1|d1|c1|b1|a1|, which is used for all modes.
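// (For the pixel stored at LSB[x][y]/MSB[x][y], the LSB ends up in output bit 4*x + y and the MSB in bit 16 + 4*x + y.)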
120 | // NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved. |
121 | static etcpak_force_inline int indexConversion( int pixelIndices ) |
122 | { |
123 | int correctIndices = 0; |
124 | int LSB[4][4]; |
125 | int MSB[4][4]; |
126 | int shift = 0; |
127 | for( int y = 3; y >= 0; y-- ) |
128 | { |
129 | for( int x = 3; x >= 0; x-- ) |
130 | { |
131 | LSB[x][y] = ( pixelIndices >> shift ) & 1; |
132 | shift++; |
133 | MSB[x][y] = ( pixelIndices >> shift ) & 1; |
134 | shift++; |
135 | } |
136 | } |
137 | shift = 0; |
138 | for( int x = 0; x < 4; x++ ) |
139 | { |
140 | for( int y = 0; y < 4; y++ ) |
141 | { |
142 | correctIndices |= ( LSB[x][y] << shift ); |
143 | correctIndices |= ( MSB[x][y] << ( 16 + shift ) ); |
144 | shift++; |
145 | } |
146 | } |
147 | return correctIndices; |
148 | } |
149 | |
150 | // Swapping two RGB-colors |
151 | // NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved. |
152 | static etcpak_force_inline void swapColors( uint8_t( colors )[2][3] ) |
153 | { |
154 | uint8_t temp = colors[0][R]; |
155 | colors[0][R] = colors[1][R]; |
156 | colors[1][R] = temp; |
157 | |
158 | temp = colors[0][G]; |
159 | colors[0][G] = colors[1][G]; |
160 | colors[1][G] = temp; |
161 | |
162 | temp = colors[0][B]; |
163 | colors[0][B] = colors[1][B]; |
164 | colors[1][B] = temp; |
165 | } |
166 | |
167 | |
168 | // calculates quantized colors for T or H modes |
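// Components are scaled from [0,255] to the 4-bit range with 15 * ( c + 8 ) / 255, i.e. an approximately rounded 15/255 scale.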
169 | void compressColor( uint8_t( currColor )[2][3], uint8_t( quantColor )[2][3], bool t_mode ) |
170 | { |
171 | if( t_mode ) |
172 | { |
173 | quantColor[0][R] = clampMax( 15 * ( currColor[0][R] + 8 ) / 255, 15 ); |
174 | quantColor[0][G] = clampMax( 15 * ( currColor[0][G] + 8 ) / 255, 15 ); |
175 | quantColor[0][B] = clampMax( 15 * ( currColor[0][B] + 8 ) / 255, 15 ); |
176 | } |
177 | else // clamped to [1,14] to get a wider range |
178 | { |
179 | quantColor[0][R] = clamp( 1, 15 * ( currColor[0][R] + 8 ) / 255, 14 ); |
180 | quantColor[0][G] = clamp( 1, 15 * ( currColor[0][G] + 8 ) / 255, 14 ); |
181 | quantColor[0][B] = clamp( 1, 15 * ( currColor[0][B] + 8 ) / 255, 14 ); |
182 | } |
183 | |
184 | // clamped to [1,14] to get a wider range |
185 | quantColor[1][R] = clamp( 1, 15 * ( currColor[1][R] + 8 ) / 255, 14 ); |
186 | quantColor[1][G] = clamp( 1, 15 * ( currColor[1][G] + 8 ) / 255, 14 ); |
187 | quantColor[1][B] = clamp( 1, 15 * ( currColor[1][B] + 8 ) / 255, 14 ); |
188 | } |
189 | |
// The three decoding functions below come from ETCPACK v2.74 and are slightly modified.
191 | static etcpak_force_inline void decompressColor( uint8_t( colorsRGB444 )[2][3], uint8_t( colors )[2][3] ) |
192 | { |
// The color should be retrieved as:
//
// c = round( 255 / ( 2^r_bits - 1 ) ) * comp_color
//
// This is similar to bit replication.
//
// Note: this code only works for bit replication from 4 bits and up; 3 bits needs
// two copy operations.
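// Example: for 4-bit components the scale factor is exactly 255/15 = 17, so 0xA expands to ( 0xA << 4 ) | 0xA = 0xAA = 170 = 17 * 10.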
201 | colors[0][R] = ( colorsRGB444[0][R] << 4 ) | colorsRGB444[0][R]; |
202 | colors[0][G] = ( colorsRGB444[0][G] << 4 ) | colorsRGB444[0][G]; |
203 | colors[0][B] = ( colorsRGB444[0][B] << 4 ) | colorsRGB444[0][B]; |
204 | colors[1][R] = ( colorsRGB444[1][R] << 4 ) | colorsRGB444[1][R]; |
205 | colors[1][G] = ( colorsRGB444[1][G] << 4 ) | colorsRGB444[1][G]; |
206 | colors[1][B] = ( colorsRGB444[1][B] << 4 ) | colorsRGB444[1][B]; |
207 | } |
208 | |
209 | // calculates the paint colors from the block colors |
210 | // using a distance d and one of the H- or T-patterns. |
211 | static void calculatePaintColors59T( uint8_t d, uint8_t( colors )[2][3], uint8_t( pColors )[4][3] ) |
212 | { |
213 | ////////////////////////////////////////////// |
214 | // |
215 | // C3 C1 C4----C1---C2 |
216 | // | | | |
217 | // | | | |
218 | // |-------| | |
219 | // | | | |
220 | // | | | |
221 | // C4 C2 C3 |
222 | // |
223 | ////////////////////////////////////////////// |
224 | |
225 | // C4 |
226 | pColors[3][R] = clampMin( 0, colors[1][R] - tableTH[d] ); |
227 | pColors[3][G] = clampMin( 0, colors[1][G] - tableTH[d] ); |
228 | pColors[3][B] = clampMin( 0, colors[1][B] - tableTH[d] ); |
229 | |
230 | // C3 |
231 | pColors[0][R] = colors[0][R]; |
232 | pColors[0][G] = colors[0][G]; |
233 | pColors[0][B] = colors[0][B]; |
234 | // C2 |
235 | pColors[1][R] = clampMax( colors[1][R] + tableTH[d], 255 ); |
236 | pColors[1][G] = clampMax( colors[1][G] + tableTH[d], 255 ); |
237 | pColors[1][B] = clampMax( colors[1][B] + tableTH[d], 255 ); |
238 | // C1 |
239 | pColors[2][R] = colors[1][R]; |
240 | pColors[2][G] = colors[1][G]; |
241 | pColors[2][B] = colors[1][B]; |
242 | } |
243 | |
244 | static void calculatePaintColors58H( uint8_t d, uint8_t( colors )[2][3], uint8_t( pColors )[4][3] ) |
245 | { |
246 | pColors[3][R] = clampMin( 0, colors[1][R] - tableTH[d] ); |
247 | pColors[3][G] = clampMin( 0, colors[1][G] - tableTH[d] ); |
248 | pColors[3][B] = clampMin( 0, colors[1][B] - tableTH[d] ); |
249 | |
250 | // C1 |
251 | pColors[0][R] = clampMax( colors[0][R] + tableTH[d], 255 ); |
252 | pColors[0][G] = clampMax( colors[0][G] + tableTH[d], 255 ); |
253 | pColors[0][B] = clampMax( colors[0][B] + tableTH[d], 255 ); |
254 | // C2 |
255 | pColors[1][R] = clampMin( 0, colors[0][R] - tableTH[d] ); |
256 | pColors[1][G] = clampMin( 0, colors[0][G] - tableTH[d] ); |
257 | pColors[1][B] = clampMin( 0, colors[0][B] - tableTH[d] ); |
258 | // C3 |
259 | pColors[2][R] = clampMax( colors[1][R] + tableTH[d], 255 ); |
260 | pColors[2][G] = clampMax( colors[1][G] + tableTH[d], 255 ); |
261 | pColors[2][B] = clampMax( colors[1][B] + tableTH[d], 255 ); |
262 | } |
263 | |
264 | #if defined _MSC_VER && !defined __clang__ |
265 | static etcpak_force_inline unsigned long _bit_scan_forward( unsigned long mask ) |
266 | { |
267 | unsigned long ret; |
268 | _BitScanForward( &ret, mask ); |
269 | return ret; |
270 | } |
271 | #endif |
272 | |
273 | typedef std::array<uint16_t, 4> v4i; |
274 | |
275 | #ifdef __AVX2__ |
276 | static etcpak_force_inline __m256i Sum4_AVX2( const uint8_t* data) noexcept |
277 | { |
278 | __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0); |
279 | __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1); |
280 | __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2); |
281 | __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3); |
282 | |
283 | __m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF)); |
284 | __m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF)); |
285 | __m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF)); |
286 | __m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF)); |
287 | |
288 | __m256i t0 = _mm256_cvtepu8_epi16(dm0); |
289 | __m256i t1 = _mm256_cvtepu8_epi16(dm1); |
290 | __m256i t2 = _mm256_cvtepu8_epi16(dm2); |
291 | __m256i t3 = _mm256_cvtepu8_epi16(dm3); |
292 | |
293 | __m256i sum0 = _mm256_add_epi16(t0, t1); |
294 | __m256i sum1 = _mm256_add_epi16(t2, t3); |
295 | |
296 | __m256i s0 = _mm256_permute2x128_si256(sum0, sum1, (0) | (3 << 4)); // 0, 0, 3, 3 |
297 | __m256i s1 = _mm256_permute2x128_si256(sum0, sum1, (1) | (2 << 4)); // 1, 1, 2, 2 |
298 | |
299 | __m256i s2 = _mm256_permute4x64_epi64(s0, _MM_SHUFFLE(1, 3, 0, 2)); |
300 | __m256i s3 = _mm256_permute4x64_epi64(s0, _MM_SHUFFLE(0, 2, 1, 3)); |
301 | __m256i s4 = _mm256_permute4x64_epi64(s1, _MM_SHUFFLE(3, 1, 0, 2)); |
302 | __m256i s5 = _mm256_permute4x64_epi64(s1, _MM_SHUFFLE(2, 0, 1, 3)); |
303 | |
304 | __m256i sum5 = _mm256_add_epi16(s2, s3); // 3, 0, 3, 0 |
305 | __m256i sum6 = _mm256_add_epi16(s4, s5); // 2, 1, 1, 2 |
    return _mm256_add_epi16(sum5, sum6); // 3+2, 0+1, 3+1, 0+2
307 | } |
308 | |
309 | static etcpak_force_inline __m256i Average_AVX2( const __m256i data) noexcept |
310 | { |
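    // Each half-block sums eight pixels, so ( sum + 4 ) >> 3 is the rounded average.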
311 | __m256i a = _mm256_add_epi16(data, _mm256_set1_epi16(4)); |
312 | |
313 | return _mm256_srli_epi16(a, 3); |
314 | } |
315 | |
316 | static etcpak_force_inline __m128i CalcErrorBlock_AVX2( const __m256i data, const v4i a[8]) noexcept |
317 | { |
318 | // |
319 | __m256i a0 = _mm256_load_si256((__m256i*)a[0].data()); |
320 | __m256i a1 = _mm256_load_si256((__m256i*)a[4].data()); |
321 | |
322 | // err = 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) ); |
323 | __m256i a4 = _mm256_madd_epi16(a0, a0); |
324 | __m256i a5 = _mm256_madd_epi16(a1, a1); |
325 | |
326 | __m256i a6 = _mm256_hadd_epi32(a4, a5); |
327 | __m256i a7 = _mm256_slli_epi32(a6, 3); |
328 | |
329 | __m256i a8 = _mm256_add_epi32(a7, _mm256_set1_epi32(0x3FFFFFFF)); // Big value to prevent negative values, but small enough to prevent overflow |
330 | |
331 | // average is not swapped |
332 | // err -= block[0] * 2 * average[0]; |
333 | // err -= block[1] * 2 * average[1]; |
334 | // err -= block[2] * 2 * average[2]; |
335 | __m256i a2 = _mm256_slli_epi16(a0, 1); |
336 | __m256i a3 = _mm256_slli_epi16(a1, 1); |
337 | __m256i b0 = _mm256_madd_epi16(a2, data); |
338 | __m256i b1 = _mm256_madd_epi16(a3, data); |
339 | |
340 | __m256i b2 = _mm256_hadd_epi32(b0, b1); |
341 | __m256i b3 = _mm256_sub_epi32(a8, b2); |
342 | __m256i b4 = _mm256_hadd_epi32(b3, b3); |
343 | |
344 | __m256i b5 = _mm256_permutevar8x32_epi32(b4, _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0)); |
345 | |
346 | return _mm256_castsi256_si128(b5); |
347 | } |
348 | |
349 | static etcpak_force_inline void ProcessAverages_AVX2(const __m256i d, v4i a[8] ) noexcept |
350 | { |
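    // Differential-mode path: quantize the 8-bit averages to 5 bits. With t = x*31 + 128,
    // ( t + ( t >> 8 ) ) >> 8 equals round( x * 31 / 255 ); the per-channel delta is then
    // clamped to the 3-bit signed range [-4,3] required by differential encoding.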
351 | __m256i t = _mm256_add_epi16(_mm256_mullo_epi16(d, _mm256_set1_epi16(31)), _mm256_set1_epi16(128)); |
352 | |
353 | __m256i c = _mm256_srli_epi16(_mm256_add_epi16(t, _mm256_srli_epi16(t, 8)), 8); |
354 | |
355 | __m256i c1 = _mm256_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2)); |
356 | __m256i diff = _mm256_sub_epi16(c, c1); |
357 | diff = _mm256_max_epi16(diff, _mm256_set1_epi16(-4)); |
358 | diff = _mm256_min_epi16(diff, _mm256_set1_epi16(3)); |
359 | |
360 | __m256i co = _mm256_add_epi16(c1, diff); |
361 | |
362 | c = _mm256_blend_epi16(co, c, 0xF0); |
363 | |
364 | __m256i a0 = _mm256_or_si256(_mm256_slli_epi16(c, 3), _mm256_srli_epi16(c, 2)); |
365 | |
366 | _mm256_store_si256((__m256i*)a[4].data(), a0); |
367 | |
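    // Individual-mode path: quantize to 4 bits (RGB444) and expand back to 8 bits by nibble replication.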
368 | __m256i t0 = _mm256_add_epi16(_mm256_mullo_epi16(d, _mm256_set1_epi16(15)), _mm256_set1_epi16(128)); |
369 | __m256i t1 = _mm256_srli_epi16(_mm256_add_epi16(t0, _mm256_srli_epi16(t0, 8)), 8); |
370 | |
371 | __m256i t2 = _mm256_or_si256(t1, _mm256_slli_epi16(t1, 4)); |
372 | |
373 | _mm256_store_si256((__m256i*)a[0].data(), t2); |
374 | } |
375 | |
376 | static etcpak_force_inline uint64_t EncodeAverages_AVX2( const v4i a[8], size_t idx ) noexcept |
377 | { |
378 | uint64_t d = ( idx << 24 ); |
379 | size_t base = idx << 1; |
380 | |
381 | __m128i a0 = _mm_load_si128((const __m128i*)a[base].data()); |
382 | |
383 | __m128i r0, r1; |
384 | |
385 | if( ( idx & 0x2 ) == 0 ) |
386 | { |
387 | r0 = _mm_srli_epi16(a0, 4); |
388 | |
389 | __m128i a1 = _mm_unpackhi_epi64(r0, r0); |
390 | r1 = _mm_slli_epi16(a1, 4); |
391 | } |
392 | else |
393 | { |
394 | __m128i a1 = _mm_and_si128(a0, _mm_set1_epi16(-8)); |
395 | |
396 | r0 = _mm_unpackhi_epi64(a1, a1); |
397 | __m128i a2 = _mm_sub_epi16(a1, r0); |
398 | __m128i a3 = _mm_srai_epi16(a2, 3); |
399 | r1 = _mm_and_si128(a3, _mm_set1_epi16(0x07)); |
400 | } |
401 | |
402 | __m128i r2 = _mm_or_si128(r0, r1); |
403 | // do missing swap for average values |
404 | __m128i r3 = _mm_shufflelo_epi16(r2, _MM_SHUFFLE(3, 0, 1, 2)); |
405 | __m128i r4 = _mm_packus_epi16(r3, _mm_setzero_si128()); |
406 | d |= _mm_cvtsi128_si32(r4); |
407 | |
408 | return d; |
409 | } |
410 | |
411 | static etcpak_force_inline uint64_t CheckSolid_AVX2( const uint8_t* src ) noexcept |
412 | { |
413 | __m256i d0 = _mm256_loadu_si256(((__m256i*)src) + 0); |
414 | __m256i d1 = _mm256_loadu_si256(((__m256i*)src) + 1); |
415 | |
416 | __m256i c = _mm256_broadcastd_epi32(_mm256_castsi256_si128(d0)); |
417 | |
418 | __m256i c0 = _mm256_cmpeq_epi8(d0, c); |
419 | __m256i c1 = _mm256_cmpeq_epi8(d1, c); |
420 | |
421 | __m256i m = _mm256_and_si256(c0, c1); |
422 | |
423 | if (!_mm256_testc_si256(m, _mm256_set1_epi32(-1))) |
424 | { |
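        // At least one pixel differs from the first, so the block cannot be encoded as a single flat color.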
425 | return 0; |
426 | } |
427 | |
428 | return 0x02000000 | |
429 | ( (unsigned int)( src[0] & 0xF8 ) << 16 ) | |
430 | ( (unsigned int)( src[1] & 0xF8 ) << 8 ) | |
431 | ( (unsigned int)( src[2] & 0xF8 ) ); |
432 | } |
433 | |
434 | static etcpak_force_inline __m128i PrepareAverages_AVX2( v4i a[8], const uint8_t* src) noexcept |
435 | { |
436 | __m256i sum4 = Sum4_AVX2( src ); |
437 | |
438 | ProcessAverages_AVX2(Average_AVX2( sum4 ), a ); |
439 | |
440 | return CalcErrorBlock_AVX2( sum4, a); |
441 | } |
442 | |
443 | static etcpak_force_inline __m128i PrepareAverages_AVX2( v4i a[8], const __m256i sum4) noexcept |
444 | { |
445 | ProcessAverages_AVX2(Average_AVX2( sum4 ), a ); |
446 | |
447 | return CalcErrorBlock_AVX2( sum4, a); |
448 | } |
449 | |
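// terr[k][t] accumulates the squared error of half-block k for table candidate t; tsel[t] packs the
// corresponding selector bits, which EncodeSelectors later masks and merges per half-block.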
450 | static etcpak_force_inline void FindBestFit_4x2_AVX2( uint32_t terr[2][8], uint32_t tsel[8], v4i a[8], const uint32_t offset, const uint8_t* data) noexcept |
451 | { |
452 | __m256i sel0 = _mm256_setzero_si256(); |
453 | __m256i sel1 = _mm256_setzero_si256(); |
454 | |
455 | for (unsigned int j = 0; j < 2; ++j) |
456 | { |
457 | unsigned int bid = offset + 1 - j; |
458 | |
459 | __m256i squareErrorSum = _mm256_setzero_si256(); |
460 | |
461 | __m128i a0 = _mm_loadl_epi64((const __m128i*)a[bid].data()); |
462 | __m256i a1 = _mm256_broadcastq_epi64(a0); |
463 | |
464 | // Processing one full row each iteration |
465 | for (size_t i = 0; i < 8; i += 4) |
466 | { |
467 | __m128i rgb = _mm_loadu_si128((const __m128i*)(data + i * 4)); |
468 | |
469 | __m256i rgb16 = _mm256_cvtepu8_epi16(rgb); |
470 | __m256i d = _mm256_sub_epi16(a1, rgb16); |
471 | |
472 | // The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16 |
            // This produces slightly different results, but is significantly faster
474 | __m256i pixel0 = _mm256_madd_epi16(d, _mm256_set_epi16(0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14)); |
475 | __m256i pixel1 = _mm256_packs_epi32(pixel0, pixel0); |
476 | __m256i pixel2 = _mm256_hadd_epi16(pixel1, pixel1); |
477 | __m128i pixel3 = _mm256_castsi256_si128(pixel2); |
478 | |
479 | __m128i pix0 = _mm_broadcastw_epi16(pixel3); |
480 | __m128i pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16)); |
481 | __m256i pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); |
482 | |
483 | // Processing first two pixels of the row |
484 | { |
485 | __m256i pix = _mm256_abs_epi16(pixel); |
486 | |
487 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
488 | // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. |
489 | __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0]))); |
490 | __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1]))); |
491 | |
492 | __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1)); |
493 | __m256i minError = _mm256_min_epi16(error0, error1); |
494 | |
                // Exploiting the symmetry of the selector table and using the sign bit
                // This produces slightly different results, but is significantly faster
497 | __m256i minIndex1 = _mm256_srli_epi16(pixel, 15); |
498 | |
                // Interleaving values so the madd instruction can be used
500 | __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0)); |
501 | __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2)); |
502 | |
503 | __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi); |
504 | // Squaring the minimum error to produce correct values when adding |
505 | __m256i squareError = _mm256_madd_epi16(minError2, minError2); |
506 | |
507 | squareErrorSum = _mm256_add_epi32(squareErrorSum, squareError); |
508 | |
509 | // Packing selector bits |
510 | __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i + j * 8)); |
511 | __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i + j * 8)); |
512 | |
513 | sel0 = _mm256_or_si256(sel0, minIndexLo2); |
514 | sel1 = _mm256_or_si256(sel1, minIndexHi2); |
515 | } |
516 | |
517 | pixel3 = _mm256_extracti128_si256(pixel2, 1); |
518 | pix0 = _mm_broadcastw_epi16(pixel3); |
519 | pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16)); |
520 | pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); |
521 | |
522 | // Processing second two pixels of the row |
523 | { |
524 | __m256i pix = _mm256_abs_epi16(pixel); |
525 | |
526 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
527 | // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. |
528 | __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0]))); |
529 | __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1]))); |
530 | |
531 | __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1)); |
532 | __m256i minError = _mm256_min_epi16(error0, error1); |
533 | |
                // Exploiting the symmetry of the selector table and using the sign bit
535 | __m256i minIndex1 = _mm256_srli_epi16(pixel, 15); |
536 | |
                // Interleaving values so the madd instruction can be used
538 | __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0)); |
539 | __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2)); |
540 | |
541 | __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi); |
542 | // Squaring the minimum error to produce correct values when adding |
543 | __m256i squareError = _mm256_madd_epi16(minError2, minError2); |
544 | |
545 | squareErrorSum = _mm256_add_epi32(squareErrorSum, squareError); |
546 | |
547 | // Packing selector bits |
548 | __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i + j * 8)); |
549 | __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i + j * 8)); |
550 | __m256i minIndexLo3 = _mm256_slli_epi16(minIndexLo2, 2); |
551 | __m256i minIndexHi3 = _mm256_slli_epi16(minIndexHi2, 2); |
552 | |
553 | sel0 = _mm256_or_si256(sel0, minIndexLo3); |
554 | sel1 = _mm256_or_si256(sel1, minIndexHi3); |
555 | } |
556 | } |
557 | |
558 | data += 8 * 4; |
559 | |
560 | _mm256_store_si256((__m256i*)terr[1 - j], squareErrorSum); |
561 | } |
562 | |
563 | // Interleave selector bits |
564 | __m256i minIndexLo0 = _mm256_unpacklo_epi16(sel0, sel1); |
565 | __m256i minIndexHi0 = _mm256_unpackhi_epi16(sel0, sel1); |
566 | |
567 | __m256i minIndexLo1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (0) | (2 << 4)); |
568 | __m256i minIndexHi1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (1) | (3 << 4)); |
569 | |
570 | __m256i minIndexHi2 = _mm256_slli_epi32(minIndexHi1, 1); |
571 | |
572 | __m256i sel = _mm256_or_si256(minIndexLo1, minIndexHi2); |
573 | |
574 | _mm256_store_si256((__m256i*)tsel, sel); |
575 | } |
576 | |
577 | static etcpak_force_inline void FindBestFit_2x4_AVX2( uint32_t terr[2][8], uint32_t tsel[8], v4i a[8], const uint32_t offset, const uint8_t* data) noexcept |
578 | { |
579 | __m256i sel0 = _mm256_setzero_si256(); |
580 | __m256i sel1 = _mm256_setzero_si256(); |
581 | |
582 | __m256i squareErrorSum0 = _mm256_setzero_si256(); |
583 | __m256i squareErrorSum1 = _mm256_setzero_si256(); |
584 | |
585 | __m128i a0 = _mm_loadl_epi64((const __m128i*)a[offset + 1].data()); |
586 | __m128i a1 = _mm_loadl_epi64((const __m128i*)a[offset + 0].data()); |
587 | |
588 | __m128i a2 = _mm_broadcastq_epi64(a0); |
589 | __m128i a3 = _mm_broadcastq_epi64(a1); |
590 | __m256i a4 = _mm256_insertf128_si256(_mm256_castsi128_si256(a2), a3, 1); |
591 | |
592 | // Processing one full row each iteration |
593 | for (size_t i = 0; i < 16; i += 4) |
594 | { |
595 | __m128i rgb = _mm_loadu_si128((const __m128i*)(data + i * 4)); |
596 | |
597 | __m256i rgb16 = _mm256_cvtepu8_epi16(rgb); |
598 | __m256i d = _mm256_sub_epi16(a4, rgb16); |
599 | |
600 | // The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16 |
        // This produces slightly different results, but is significantly faster
602 | __m256i pixel0 = _mm256_madd_epi16(d, _mm256_set_epi16(0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14)); |
603 | __m256i pixel1 = _mm256_packs_epi32(pixel0, pixel0); |
604 | __m256i pixel2 = _mm256_hadd_epi16(pixel1, pixel1); |
605 | __m128i pixel3 = _mm256_castsi256_si128(pixel2); |
606 | |
607 | __m128i pix0 = _mm_broadcastw_epi16(pixel3); |
608 | __m128i pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16)); |
609 | __m256i pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); |
610 | |
611 | // Processing first two pixels of the row |
612 | { |
613 | __m256i pix = _mm256_abs_epi16(pixel); |
614 | |
615 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
616 | // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. |
617 | __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0]))); |
618 | __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1]))); |
619 | |
620 | __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1)); |
621 | __m256i minError = _mm256_min_epi16(error0, error1); |
622 | |
            // Exploiting the symmetry of the selector table and using the sign bit
624 | __m256i minIndex1 = _mm256_srli_epi16(pixel, 15); |
625 | |
            // Interleaving values so the madd instruction can be used
627 | __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0)); |
628 | __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2)); |
629 | |
630 | __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi); |
631 | // Squaring the minimum error to produce correct values when adding |
632 | __m256i squareError = _mm256_madd_epi16(minError2, minError2); |
633 | |
634 | squareErrorSum0 = _mm256_add_epi32(squareErrorSum0, squareError); |
635 | |
636 | // Packing selector bits |
637 | __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i)); |
638 | __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i)); |
639 | |
640 | sel0 = _mm256_or_si256(sel0, minIndexLo2); |
641 | sel1 = _mm256_or_si256(sel1, minIndexHi2); |
642 | } |
643 | |
644 | pixel3 = _mm256_extracti128_si256(pixel2, 1); |
645 | pix0 = _mm_broadcastw_epi16(pixel3); |
646 | pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16)); |
647 | pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); |
648 | |
649 | // Processing second two pixels of the row |
650 | { |
651 | __m256i pix = _mm256_abs_epi16(pixel); |
652 | |
653 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
654 | // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. |
655 | __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0]))); |
656 | __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1]))); |
657 | |
658 | __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1)); |
659 | __m256i minError = _mm256_min_epi16(error0, error1); |
660 | |
            // Exploiting the symmetry of the selector table and using the sign bit
662 | __m256i minIndex1 = _mm256_srli_epi16(pixel, 15); |
663 | |
            // Interleaving values so the madd instruction can be used
665 | __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0)); |
666 | __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2)); |
667 | |
668 | __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi); |
669 | // Squaring the minimum error to produce correct values when adding |
670 | __m256i squareError = _mm256_madd_epi16(minError2, minError2); |
671 | |
672 | squareErrorSum1 = _mm256_add_epi32(squareErrorSum1, squareError); |
673 | |
674 | // Packing selector bits |
675 | __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i)); |
676 | __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i)); |
677 | __m256i minIndexLo3 = _mm256_slli_epi16(minIndexLo2, 2); |
678 | __m256i minIndexHi3 = _mm256_slli_epi16(minIndexHi2, 2); |
679 | |
680 | sel0 = _mm256_or_si256(sel0, minIndexLo3); |
681 | sel1 = _mm256_or_si256(sel1, minIndexHi3); |
682 | } |
683 | } |
684 | |
685 | _mm256_store_si256((__m256i*)terr[1], squareErrorSum0); |
686 | _mm256_store_si256((__m256i*)terr[0], squareErrorSum1); |
687 | |
688 | // Interleave selector bits |
689 | __m256i minIndexLo0 = _mm256_unpacklo_epi16(sel0, sel1); |
690 | __m256i minIndexHi0 = _mm256_unpackhi_epi16(sel0, sel1); |
691 | |
692 | __m256i minIndexLo1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (0) | (2 << 4)); |
693 | __m256i minIndexHi1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (1) | (3 << 4)); |
694 | |
695 | __m256i minIndexHi2 = _mm256_slli_epi32(minIndexHi1, 1); |
696 | |
697 | __m256i sel = _mm256_or_si256(minIndexLo1, minIndexHi2); |
698 | |
699 | _mm256_store_si256((__m256i*)tsel, sel); |
700 | } |
701 | |
702 | static etcpak_force_inline uint64_t EncodeSelectors_AVX2( uint64_t d, const uint32_t terr[2][8], const uint32_t tsel[8], const bool rotate) noexcept |
703 | { |
704 | size_t tidx[2]; |
705 | |
706 | // Get index of minimum error (terr[0] and terr[1]) |
707 | __m256i err0 = _mm256_load_si256((const __m256i*)terr[0]); |
708 | __m256i err1 = _mm256_load_si256((const __m256i*)terr[1]); |
709 | |
710 | __m256i errLo = _mm256_permute2x128_si256(err0, err1, (0) | (2 << 4)); |
711 | __m256i errHi = _mm256_permute2x128_si256(err0, err1, (1) | (3 << 4)); |
712 | |
713 | __m256i errMin0 = _mm256_min_epu32(errLo, errHi); |
714 | |
715 | __m256i errMin1 = _mm256_shuffle_epi32(errMin0, _MM_SHUFFLE(2, 3, 0, 1)); |
716 | __m256i errMin2 = _mm256_min_epu32(errMin0, errMin1); |
717 | |
718 | __m256i errMin3 = _mm256_shuffle_epi32(errMin2, _MM_SHUFFLE(1, 0, 3, 2)); |
719 | __m256i errMin4 = _mm256_min_epu32(errMin3, errMin2); |
720 | |
721 | __m256i errMin5 = _mm256_permute2x128_si256(errMin4, errMin4, (0) | (0 << 4)); |
722 | __m256i errMin6 = _mm256_permute2x128_si256(errMin4, errMin4, (1) | (1 << 4)); |
723 | |
724 | __m256i errMask0 = _mm256_cmpeq_epi32(errMin5, err0); |
725 | __m256i errMask1 = _mm256_cmpeq_epi32(errMin6, err1); |
726 | |
727 | uint32_t mask0 = _mm256_movemask_epi8(errMask0); |
728 | uint32_t mask1 = _mm256_movemask_epi8(errMask1); |
729 | |
730 | tidx[0] = _bit_scan_forward(mask0) >> 2; |
731 | tidx[1] = _bit_scan_forward(mask1) >> 2; |
732 | |
733 | d |= tidx[0] << 26; |
734 | d |= tidx[1] << 29; |
735 | |
736 | unsigned int t0 = tsel[tidx[0]]; |
737 | unsigned int t1 = tsel[tidx[1]]; |
738 | |
739 | if (!rotate) |
740 | { |
741 | t0 &= 0xFF00FF00; |
742 | t1 &= 0x00FF00FF; |
743 | } |
744 | else |
745 | { |
746 | t0 &= 0xCCCCCCCC; |
747 | t1 &= 0x33333333; |
748 | } |
749 | |
750 | // Flip selectors from sign bit |
751 | unsigned int t2 = (t0 | t1) ^ 0xFFFF0000; |
752 | |
753 | return d | static_cast<uint64_t>(_bswap(t2)) << 32; |
754 | } |
755 | |
756 | static etcpak_force_inline __m128i r6g7b6_AVX2(__m128 cof, __m128 chf, __m128 cvf) noexcept |
757 | { |
758 | __m128i co = _mm_cvttps_epi32(cof); |
759 | __m128i ch = _mm_cvttps_epi32(chf); |
760 | __m128i cv = _mm_cvttps_epi32(cvf); |
761 | |
762 | __m128i coh = _mm_packus_epi32(co, ch); |
763 | __m128i cv0 = _mm_packus_epi32(cv, _mm_setzero_si128()); |
764 | |
765 | __m256i cohv0 = _mm256_inserti128_si256(_mm256_castsi128_si256(coh), cv0, 1); |
766 | __m256i cohv1 = _mm256_min_epu16(cohv0, _mm256_set1_epi16(1023)); |
767 | |
768 | __m256i cohv2 = _mm256_sub_epi16(cohv1, _mm256_set1_epi16(15)); |
769 | __m256i cohv3 = _mm256_srai_epi16(cohv2, 1); |
770 | |
771 | __m256i cohvrb0 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(11)); |
772 | __m256i cohvrb1 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(4)); |
773 | __m256i cohvg0 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(9)); |
774 | __m256i cohvg1 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(6)); |
775 | |
776 | __m256i cohvrb2 = _mm256_srai_epi16(cohvrb0, 7); |
777 | __m256i cohvrb3 = _mm256_srai_epi16(cohvrb1, 7); |
778 | __m256i cohvg2 = _mm256_srai_epi16(cohvg0, 8); |
779 | __m256i cohvg3 = _mm256_srai_epi16(cohvg1, 8); |
780 | |
781 | __m256i cohvrb4 = _mm256_sub_epi16(cohvrb0, cohvrb2); |
782 | __m256i cohvrb5 = _mm256_sub_epi16(cohvrb4, cohvrb3); |
783 | __m256i cohvg4 = _mm256_sub_epi16(cohvg0, cohvg2); |
784 | __m256i cohvg5 = _mm256_sub_epi16(cohvg4, cohvg3); |
785 | |
786 | __m256i cohvrb6 = _mm256_srai_epi16(cohvrb5, 3); |
787 | __m256i cohvg6 = _mm256_srai_epi16(cohvg5, 2); |
788 | |
789 | __m256i cohv4 = _mm256_blend_epi16(cohvg6, cohvrb6, 0x55); |
790 | |
791 | __m128i cohv5 = _mm_packus_epi16(_mm256_castsi256_si128(cohv4), _mm256_extracti128_si256(cohv4, 1)); |
792 | return _mm_shuffle_epi8(cohv5, _mm_setr_epi8(6, 5, 4, -1, 2, 1, 0, -1, 10, 9, 8, -1, -1, -1, -1, -1)); |
793 | } |
794 | |
795 | static etcpak_force_inline Plane Planar_AVX2( const Channels& ch, uint8_t& mode, bool useHeuristics ) |
796 | { |
797 | __m128i t0 = _mm_sad_epu8( ch.r8, _mm_setzero_si128() ); |
798 | __m128i t1 = _mm_sad_epu8( ch.g8, _mm_setzero_si128() ); |
799 | __m128i t2 = _mm_sad_epu8( ch.b8, _mm_setzero_si128() ); |
800 | |
801 | __m128i r8s = _mm_shuffle_epi8( ch.r8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) ); |
802 | __m128i g8s = _mm_shuffle_epi8( ch.g8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) ); |
803 | __m128i b8s = _mm_shuffle_epi8( ch.b8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) ); |
804 | |
805 | __m128i s0 = _mm_sad_epu8( r8s, _mm_setzero_si128() ); |
806 | __m128i s1 = _mm_sad_epu8( g8s, _mm_setzero_si128() ); |
807 | __m128i s2 = _mm_sad_epu8( b8s, _mm_setzero_si128() ); |
808 | |
809 | __m256i sr0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t0 ), s0, 1 ); |
810 | __m256i sg0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t1 ), s1, 1 ); |
811 | __m256i sb0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t2 ), s2, 1 ); |
812 | |
813 | __m256i sr1 = _mm256_slli_epi64( sr0, 32 ); |
814 | __m256i sg1 = _mm256_slli_epi64( sg0, 16 ); |
815 | |
816 | __m256i srb = _mm256_or_si256( sr1, sb0 ); |
817 | __m256i srgb = _mm256_or_si256( srb, sg1 ); |
818 | |
819 | if( mode != ModePlanar && useHeuristics ) |
820 | { |
821 | Plane plane; |
822 | plane.sum4 = _mm256_permute4x64_epi64( srgb, _MM_SHUFFLE( 2, 3, 0, 1 ) ); |
823 | return plane; |
824 | } |
825 | |
826 | __m128i t3 = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( t0 ), _mm_castsi128_ps( t1 ), _MM_SHUFFLE( 2, 0, 2, 0 ) ) ); |
827 | __m128i t4 = _mm_shuffle_epi32( t2, _MM_SHUFFLE( 3, 1, 2, 0 ) ); |
828 | __m128i t5 = _mm_hadd_epi32( t3, t4 ); |
829 | __m128i t6 = _mm_shuffle_epi32( t5, _MM_SHUFFLE( 1, 1, 1, 1 ) ); |
830 | __m128i t7 = _mm_shuffle_epi32( t5, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
831 | |
832 | __m256i sr = _mm256_broadcastw_epi16( t5 ); |
833 | __m256i sg = _mm256_broadcastw_epi16( t6 ); |
834 | __m256i sb = _mm256_broadcastw_epi16( t7 ); |
835 | |
836 | __m256i r08 = _mm256_cvtepu8_epi16( ch.r8 ); |
837 | __m256i g08 = _mm256_cvtepu8_epi16( ch.g8 ); |
838 | __m256i b08 = _mm256_cvtepu8_epi16( ch.b8 ); |
839 | |
840 | __m256i r16 = _mm256_slli_epi16( r08, 4 ); |
841 | __m256i g16 = _mm256_slli_epi16( g08, 4 ); |
842 | __m256i b16 = _mm256_slli_epi16( b08, 4 ); |
843 | |
844 | __m256i difR0 = _mm256_sub_epi16( r16, sr ); |
845 | __m256i difG0 = _mm256_sub_epi16( g16, sg ); |
846 | __m256i difB0 = _mm256_sub_epi16( b16, sb ); |
847 | |
848 | __m256i difRyz = _mm256_madd_epi16( difR0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) ); |
849 | __m256i difGyz = _mm256_madd_epi16( difG0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) ); |
850 | __m256i difByz = _mm256_madd_epi16( difB0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) ); |
851 | |
852 | __m256i difRxz = _mm256_madd_epi16( difR0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) ); |
853 | __m256i difGxz = _mm256_madd_epi16( difG0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) ); |
854 | __m256i difBxz = _mm256_madd_epi16( difB0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) ); |
855 | |
856 | __m256i difRGyz = _mm256_hadd_epi32( difRyz, difGyz ); |
857 | __m256i difByzxz = _mm256_hadd_epi32( difByz, difBxz ); |
858 | |
859 | __m256i difRGxz = _mm256_hadd_epi32( difRxz, difGxz ); |
860 | |
861 | __m128i sumRGyz = _mm_add_epi32( _mm256_castsi256_si128( difRGyz ), _mm256_extracti128_si256( difRGyz, 1 ) ); |
862 | __m128i sumByzxz = _mm_add_epi32( _mm256_castsi256_si128( difByzxz ), _mm256_extracti128_si256( difByzxz, 1 ) ); |
863 | __m128i sumRGxz = _mm_add_epi32( _mm256_castsi256_si128( difRGxz ), _mm256_extracti128_si256( difRGxz, 1 ) ); |
864 | |
865 | __m128i sumRGByz = _mm_hadd_epi32( sumRGyz, sumByzxz ); |
866 | __m128i sumRGByzxz = _mm_hadd_epi32( sumRGxz, sumByzxz ); |
867 | |
868 | __m128i sumRGBxz = _mm_shuffle_epi32( sumRGByzxz, _MM_SHUFFLE( 2, 3, 1, 0 ) ); |
869 | |
870 | __m128 sumRGByzf = _mm_cvtepi32_ps( sumRGByz ); |
871 | __m128 sumRGBxzf = _mm_cvtepi32_ps( sumRGBxz ); |
872 | |
873 | const float value = ( 255 * 255 * 8.0f + 85 * 85 * 8.0f ) * 16.0f; |
874 | |
875 | __m128 scale = _mm_set1_ps( -4.0f / value ); |
876 | |
877 | __m128 af = _mm_mul_ps( sumRGBxzf, scale ); |
878 | __m128 bf = _mm_mul_ps( sumRGByzf, scale ); |
879 | |
880 | __m128 df = _mm_mul_ps( _mm_cvtepi32_ps( t5 ), _mm_set1_ps( 4.0f / 16.0f ) ); |
881 | |
882 | // calculating the three colors RGBO, RGBH, and RGBV. RGB = df - af * x - bf * y; |
883 | __m128 cof0 = _mm_fnmadd_ps( af, _mm_set1_ps( -255.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( -255.0f ), df ) ); |
884 | __m128 chf0 = _mm_fnmadd_ps( af, _mm_set1_ps( 425.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( -255.0f ), df ) ); |
885 | __m128 cvf0 = _mm_fnmadd_ps( af, _mm_set1_ps( -255.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( 425.0f ), df ) ); |
886 | |
887 | // convert to r6g7b6 |
888 | __m128i cohv = r6g7b6_AVX2( cof0, chf0, cvf0 ); |
889 | |
890 | uint64_t rgbho = _mm_extract_epi64( cohv, 0 ); |
891 | uint32_t rgbv0 = _mm_extract_epi32( cohv, 2 ); |
892 | |
893 | // Error calculation |
894 | uint64_t error = 0; |
895 | if( !useHeuristics ) |
896 | { |
897 | auto ro0 = ( rgbho >> 48 ) & 0x3F; |
898 | auto go0 = ( rgbho >> 40 ) & 0x7F; |
899 | auto bo0 = ( rgbho >> 32 ) & 0x3F; |
900 | auto ro1 = ( ro0 >> 4 ) | ( ro0 << 2 ); |
901 | auto go1 = ( go0 >> 6 ) | ( go0 << 1 ); |
902 | auto bo1 = ( bo0 >> 4 ) | ( bo0 << 2 ); |
903 | auto ro2 = ( ro1 << 2 ) + 2; |
904 | auto go2 = ( go1 << 2 ) + 2; |
905 | auto bo2 = ( bo1 << 2 ) + 2; |
906 | |
907 | __m256i ro3 = _mm256_set1_epi16( ro2 ); |
908 | __m256i go3 = _mm256_set1_epi16( go2 ); |
909 | __m256i bo3 = _mm256_set1_epi16( bo2 ); |
910 | |
911 | auto rh0 = ( rgbho >> 16 ) & 0x3F; |
912 | auto gh0 = ( rgbho >> 8 ) & 0x7F; |
913 | auto bh0 = ( rgbho >> 0 ) & 0x3F; |
914 | auto rh1 = ( rh0 >> 4 ) | ( rh0 << 2 ); |
915 | auto gh1 = ( gh0 >> 6 ) | ( gh0 << 1 ); |
916 | auto bh1 = ( bh0 >> 4 ) | ( bh0 << 2 ); |
917 | |
918 | auto rh2 = rh1 - ro1; |
919 | auto gh2 = gh1 - go1; |
920 | auto bh2 = bh1 - bo1; |
921 | |
922 | __m256i rh3 = _mm256_set1_epi16( rh2 ); |
923 | __m256i gh3 = _mm256_set1_epi16( gh2 ); |
924 | __m256i bh3 = _mm256_set1_epi16( bh2 ); |
925 | |
926 | auto rv0 = ( rgbv0 >> 16 ) & 0x3F; |
927 | auto gv0 = ( rgbv0 >> 8 ) & 0x7F; |
928 | auto bv0 = ( rgbv0 >> 0 ) & 0x3F; |
929 | auto rv1 = ( rv0 >> 4 ) | ( rv0 << 2 ); |
930 | auto gv1 = ( gv0 >> 6 ) | ( gv0 << 1 ); |
931 | auto bv1 = ( bv0 >> 4 ) | ( bv0 << 2 ); |
932 | |
933 | auto rv2 = rv1 - ro1; |
934 | auto gv2 = gv1 - go1; |
935 | auto bv2 = bv1 - bo1; |
936 | |
937 | __m256i rv3 = _mm256_set1_epi16( rv2 ); |
938 | __m256i gv3 = _mm256_set1_epi16( gv2 ); |
939 | __m256i bv3 = _mm256_set1_epi16( bv2 ); |
940 | |
941 | __m256i x = _mm256_set_epi16( 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 ); |
942 | |
943 | __m256i rh4 = _mm256_mullo_epi16( rh3, x ); |
944 | __m256i gh4 = _mm256_mullo_epi16( gh3, x ); |
945 | __m256i bh4 = _mm256_mullo_epi16( bh3, x ); |
946 | |
947 | __m256i y = _mm256_set_epi16( 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0 ); |
948 | |
949 | __m256i rv4 = _mm256_mullo_epi16( rv3, y ); |
950 | __m256i gv4 = _mm256_mullo_epi16( gv3, y ); |
951 | __m256i bv4 = _mm256_mullo_epi16( bv3, y ); |
952 | |
953 | __m256i rxy = _mm256_add_epi16( rh4, rv4 ); |
954 | __m256i gxy = _mm256_add_epi16( gh4, gv4 ); |
955 | __m256i bxy = _mm256_add_epi16( bh4, bv4 ); |
956 | |
957 | __m256i rp0 = _mm256_add_epi16( rxy, ro3 ); |
958 | __m256i gp0 = _mm256_add_epi16( gxy, go3 ); |
959 | __m256i bp0 = _mm256_add_epi16( bxy, bo3 ); |
960 | |
961 | __m256i rp1 = _mm256_srai_epi16( rp0, 2 ); |
962 | __m256i gp1 = _mm256_srai_epi16( gp0, 2 ); |
963 | __m256i bp1 = _mm256_srai_epi16( bp0, 2 ); |
964 | |
965 | __m256i rp2 = _mm256_max_epi16( _mm256_min_epi16( rp1, _mm256_set1_epi16( 255 ) ), _mm256_setzero_si256() ); |
966 | __m256i gp2 = _mm256_max_epi16( _mm256_min_epi16( gp1, _mm256_set1_epi16( 255 ) ), _mm256_setzero_si256() ); |
967 | __m256i bp2 = _mm256_max_epi16( _mm256_min_epi16( bp1, _mm256_set1_epi16( 255 ) ), _mm256_setzero_si256() ); |
968 | |
969 | __m256i rdif = _mm256_sub_epi16( r08, rp2 ); |
970 | __m256i gdif = _mm256_sub_epi16( g08, gp2 ); |
971 | __m256i bdif = _mm256_sub_epi16( b08, bp2 ); |
972 | |
973 | __m256i rerr = _mm256_mullo_epi16( rdif, _mm256_set1_epi16( 38 ) ); |
974 | __m256i gerr = _mm256_mullo_epi16( gdif, _mm256_set1_epi16( 76 ) ); |
975 | __m256i berr = _mm256_mullo_epi16( bdif, _mm256_set1_epi16( 14 ) ); |
976 | |
977 | __m256i sum0 = _mm256_add_epi16( rerr, gerr ); |
978 | __m256i sum1 = _mm256_add_epi16( sum0, berr ); |
979 | |
980 | __m256i sum2 = _mm256_madd_epi16( sum1, sum1 ); |
981 | |
982 | __m128i sum3 = _mm_add_epi32( _mm256_castsi256_si128( sum2 ), _mm256_extracti128_si256( sum2, 1 ) ); |
983 | |
984 | uint32_t err0 = _mm_extract_epi32( sum3, 0 ); |
985 | uint32_t err1 = _mm_extract_epi32( sum3, 1 ); |
986 | uint32_t err2 = _mm_extract_epi32( sum3, 2 ); |
987 | uint32_t err3 = _mm_extract_epi32( sum3, 3 ); |
988 | |
989 | error = err0 + err1 + err2 + err3; |
990 | } |
992 | |
993 | uint32_t rgbv = ( rgbv0 & 0x3F ) | ( ( rgbv0 >> 2 ) & 0x1FC0 ) | ( ( rgbv0 >> 3 ) & 0x7E000 ); |
994 | uint64_t rgbho0_ = ( rgbho & 0x3F0000003F ) | ( ( rgbho >> 2 ) & 0x1FC000001FC0 ) | ( ( rgbho >> 3 ) & 0x7E0000007E000 ); |
995 | uint64_t rgbho0 = ( rgbho0_ & 0x7FFFF ) | ( ( rgbho0_ >> 13 ) & 0x3FFFF80000 ); |
996 | |
997 | uint32_t hi = rgbv | ((rgbho0 & 0x1FFF) << 19); |
998 | rgbho0 >>= 13; |
999 | uint32_t lo = ( rgbho0 & 0x1 ) | ( ( rgbho0 & 0x1FE ) << 1 ) | ( ( rgbho0 & 0x600 ) << 2 ) | ( ( rgbho0 & 0x3F800 ) << 5 ) | ( ( rgbho0 & 0x1FC0000 ) << 6 ); |
1000 | |
1001 | uint32_t idx = ( ( rgbho >> 33 ) & 0xF ) | ( ( rgbho >> 41 ) & 0x10 ) | ( ( rgbho >> 48 ) & 0x20 ); |
1002 | lo |= g_flags[idx]; |
1003 | uint64_t result = static_cast<uint32_t>(_bswap(lo)); |
1004 | result |= static_cast<uint64_t>(static_cast<uint32_t>(_bswap(hi))) << 32; |
1005 | |
1006 | Plane plane; |
1007 | |
1008 | plane.plane = result; |
1009 | if( useHeuristics ) |
1010 | { |
1011 | plane.error = 0; |
1012 | mode = ModePlanar; |
1013 | } |
1014 | else |
1015 | { |
1016 | plane.error = error; |
1017 | } |
1018 | plane.sum4 = _mm256_permute4x64_epi64(srgb, _MM_SHUFFLE(2, 3, 0, 1)); |
1019 | |
1020 | return plane; |
1021 | } |
1022 | |
1023 | static etcpak_force_inline uint64_t EncodeSelectors_AVX2( uint64_t d, const uint32_t terr[2][8], const uint32_t tsel[8], const bool rotate, const uint64_t value, const uint32_t error) noexcept |
1024 | { |
1025 | size_t tidx[2]; |
1026 | |
1027 | // Get index of minimum error (terr[0] and terr[1]) |
1028 | __m256i err0 = _mm256_load_si256((const __m256i*)terr[0]); |
1029 | __m256i err1 = _mm256_load_si256((const __m256i*)terr[1]); |
1030 | |
1031 | __m256i errLo = _mm256_permute2x128_si256(err0, err1, (0) | (2 << 4)); |
1032 | __m256i errHi = _mm256_permute2x128_si256(err0, err1, (1) | (3 << 4)); |
1033 | |
1034 | __m256i errMin0 = _mm256_min_epu32(errLo, errHi); |
1035 | |
1036 | __m256i errMin1 = _mm256_shuffle_epi32(errMin0, _MM_SHUFFLE(2, 3, 0, 1)); |
1037 | __m256i errMin2 = _mm256_min_epu32(errMin0, errMin1); |
1038 | |
1039 | __m256i errMin3 = _mm256_shuffle_epi32(errMin2, _MM_SHUFFLE(1, 0, 3, 2)); |
1040 | __m256i errMin4 = _mm256_min_epu32(errMin3, errMin2); |
1041 | |
1042 | __m256i errMin5 = _mm256_permute2x128_si256(errMin4, errMin4, (0) | (0 << 4)); |
1043 | __m256i errMin6 = _mm256_permute2x128_si256(errMin4, errMin4, (1) | (1 << 4)); |
1044 | |
1045 | __m256i errMask0 = _mm256_cmpeq_epi32(errMin5, err0); |
1046 | __m256i errMask1 = _mm256_cmpeq_epi32(errMin6, err1); |
1047 | |
1048 | uint32_t mask0 = _mm256_movemask_epi8(errMask0); |
1049 | uint32_t mask1 = _mm256_movemask_epi8(errMask1); |
1050 | |
1051 | tidx[0] = _bit_scan_forward(mask0) >> 2; |
1052 | tidx[1] = _bit_scan_forward(mask1) >> 2; |
1053 | |
1054 | if ((terr[0][tidx[0]] + terr[1][tidx[1]]) >= error) |
1055 | { |
1056 | return value; |
1057 | } |
1058 | |
1059 | d |= tidx[0] << 26; |
1060 | d |= tidx[1] << 29; |
1061 | |
1062 | unsigned int t0 = tsel[tidx[0]]; |
1063 | unsigned int t1 = tsel[tidx[1]]; |
1064 | |
1065 | if (!rotate) |
1066 | { |
1067 | t0 &= 0xFF00FF00; |
1068 | t1 &= 0x00FF00FF; |
1069 | } |
1070 | else |
1071 | { |
1072 | t0 &= 0xCCCCCCCC; |
1073 | t1 &= 0x33333333; |
1074 | } |
1075 | |
1076 | // Flip selectors from sign bit |
1077 | unsigned int t2 = (t0 | t1) ^ 0xFFFF0000; |
1078 | |
1079 | return d | static_cast<uint64_t>(_bswap(t2)) << 32; |
1080 | } |
1081 | |
1082 | #endif |
1083 | |
1084 | static etcpak_force_inline void Average( const uint8_t* data, v4i* a ) |
1085 | { |
1086 | #ifdef __SSE4_1__ |
1087 | __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0); |
1088 | __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1); |
1089 | __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2); |
1090 | __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3); |
1091 | |
1092 | __m128i d0l = _mm_unpacklo_epi8(d0, _mm_setzero_si128()); |
1093 | __m128i d0h = _mm_unpackhi_epi8(d0, _mm_setzero_si128()); |
1094 | __m128i d1l = _mm_unpacklo_epi8(d1, _mm_setzero_si128()); |
1095 | __m128i d1h = _mm_unpackhi_epi8(d1, _mm_setzero_si128()); |
1096 | __m128i d2l = _mm_unpacklo_epi8(d2, _mm_setzero_si128()); |
1097 | __m128i d2h = _mm_unpackhi_epi8(d2, _mm_setzero_si128()); |
1098 | __m128i d3l = _mm_unpacklo_epi8(d3, _mm_setzero_si128()); |
1099 | __m128i d3h = _mm_unpackhi_epi8(d3, _mm_setzero_si128()); |
1100 | |
1101 | __m128i sum0 = _mm_add_epi16(d0l, d1l); |
1102 | __m128i sum1 = _mm_add_epi16(d0h, d1h); |
1103 | __m128i sum2 = _mm_add_epi16(d2l, d3l); |
1104 | __m128i sum3 = _mm_add_epi16(d2h, d3h); |
1105 | |
1106 | __m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128()); |
1107 | __m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128()); |
1108 | __m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128()); |
1109 | __m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128()); |
1110 | __m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128()); |
1111 | __m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128()); |
1112 | __m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128()); |
1113 | __m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128()); |
1114 | |
1115 | __m128i b0 = _mm_add_epi32(sum0l, sum0h); |
1116 | __m128i b1 = _mm_add_epi32(sum1l, sum1h); |
1117 | __m128i b2 = _mm_add_epi32(sum2l, sum2h); |
1118 | __m128i b3 = _mm_add_epi32(sum3l, sum3h); |
1119 | |
1120 | __m128i a0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b2, b3), _mm_set1_epi32(4)), 3); |
1121 | __m128i a1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b1), _mm_set1_epi32(4)), 3); |
1122 | __m128i a2 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b1, b3), _mm_set1_epi32(4)), 3); |
1123 | __m128i a3 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b2), _mm_set1_epi32(4)), 3); |
1124 | |
1125 | _mm_storeu_si128((__m128i*)&a[0], _mm_packus_epi32(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a1, _MM_SHUFFLE(3, 0, 1, 2)))); |
1126 | _mm_storeu_si128((__m128i*)&a[2], _mm_packus_epi32(_mm_shuffle_epi32(a2, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a3, _MM_SHUFFLE(3, 0, 1, 2)))); |
1127 | #elif defined __ARM_NEON |
1128 | uint8x16x2_t t0 = vzipq_u8(vld1q_u8(data + 0), uint8x16_t()); |
1129 | uint8x16x2_t t1 = vzipq_u8(vld1q_u8(data + 16), uint8x16_t()); |
1130 | uint8x16x2_t t2 = vzipq_u8(vld1q_u8(data + 32), uint8x16_t()); |
1131 | uint8x16x2_t t3 = vzipq_u8(vld1q_u8(data + 48), uint8x16_t()); |
1132 | |
1133 | uint16x8x2_t d0 = { vreinterpretq_u16_u8(t0.val[0]), vreinterpretq_u16_u8(t0.val[1]) }; |
1134 | uint16x8x2_t d1 = { vreinterpretq_u16_u8(t1.val[0]), vreinterpretq_u16_u8(t1.val[1]) }; |
1135 | uint16x8x2_t d2 = { vreinterpretq_u16_u8(t2.val[0]), vreinterpretq_u16_u8(t2.val[1]) }; |
1136 | uint16x8x2_t d3 = { vreinterpretq_u16_u8(t3.val[0]), vreinterpretq_u16_u8(t3.val[1]) }; |
1137 | |
1138 | uint16x8x2_t s0 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[0] ), vreinterpretq_s16_u16( d1.val[0] ) ) ), uint16x8_t()); |
1139 | uint16x8x2_t s1 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[1] ), vreinterpretq_s16_u16( d1.val[1] ) ) ), uint16x8_t()); |
1140 | uint16x8x2_t s2 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[0] ), vreinterpretq_s16_u16( d3.val[0] ) ) ), uint16x8_t()); |
1141 | uint16x8x2_t s3 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[1] ), vreinterpretq_s16_u16( d3.val[1] ) ) ), uint16x8_t()); |
1142 | |
1143 | uint32x4x2_t sum0 = { vreinterpretq_u32_u16(s0.val[0]), vreinterpretq_u32_u16(s0.val[1]) }; |
1144 | uint32x4x2_t sum1 = { vreinterpretq_u32_u16(s1.val[0]), vreinterpretq_u32_u16(s1.val[1]) }; |
1145 | uint32x4x2_t sum2 = { vreinterpretq_u32_u16(s2.val[0]), vreinterpretq_u32_u16(s2.val[1]) }; |
1146 | uint32x4x2_t sum3 = { vreinterpretq_u32_u16(s3.val[0]), vreinterpretq_u32_u16(s3.val[1]) }; |
1147 | |
1148 | uint32x4_t b0 = vaddq_u32(sum0.val[0], sum0.val[1]); |
1149 | uint32x4_t b1 = vaddq_u32(sum1.val[0], sum1.val[1]); |
1150 | uint32x4_t b2 = vaddq_u32(sum2.val[0], sum2.val[1]); |
1151 | uint32x4_t b3 = vaddq_u32(sum3.val[0], sum3.val[1]); |
1152 | |
1153 | uint32x4_t a0 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b2, b3), vdupq_n_u32(4)), 3); |
1154 | uint32x4_t a1 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b0, b1), vdupq_n_u32(4)), 3); |
1155 | uint32x4_t a2 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b1, b3), vdupq_n_u32(4)), 3); |
1156 | uint32x4_t a3 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b0, b2), vdupq_n_u32(4)), 3); |
1157 | |
1158 | uint16x8_t o0 = vcombine_u16(vqmovun_s32(vreinterpretq_s32_u32( a0 )), vqmovun_s32(vreinterpretq_s32_u32( a1 ))); |
1159 | uint16x8_t o1 = vcombine_u16(vqmovun_s32(vreinterpretq_s32_u32( a2 )), vqmovun_s32(vreinterpretq_s32_u32( a3 ))); |
1160 | |
1161 | a[0] = v4i{o0[2], o0[1], o0[0], 0}; |
1162 | a[1] = v4i{o0[6], o0[5], o0[4], 0}; |
1163 | a[2] = v4i{o1[2], o1[1], o1[0], 0}; |
1164 | a[3] = v4i{o1[6], o1[5], o1[4], 0}; |
1165 | #else |
1166 | uint32_t r[4]; |
1167 | uint32_t g[4]; |
1168 | uint32_t b[4]; |
1169 | |
1170 | memset(r, 0, sizeof(r)); |
1171 | memset(g, 0, sizeof(g)); |
1172 | memset(b, 0, sizeof(b)); |
1173 | |
1174 | for( int j=0; j<4; j++ ) |
1175 | { |
1176 | for( int i=0; i<4; i++ ) |
1177 | { |
1178 | int index = (j & 2) + (i >> 1); |
1179 | b[index] += *data++; |
1180 | g[index] += *data++; |
1181 | r[index] += *data++; |
1182 | data++; |
1183 | } |
1184 | } |
1185 | |
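    // a[0..3] hold the rounded averages of the bottom, top, right and left half-blocks, stored as { r, g, b, 0 }.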
1186 | a[0] = v4i{ uint16_t( (r[2] + r[3] + 4) / 8 ), uint16_t( (g[2] + g[3] + 4) / 8 ), uint16_t( (b[2] + b[3] + 4) / 8 ), 0}; |
1187 | a[1] = v4i{ uint16_t( (r[0] + r[1] + 4) / 8 ), uint16_t( (g[0] + g[1] + 4) / 8 ), uint16_t( (b[0] + b[1] + 4) / 8 ), 0}; |
1188 | a[2] = v4i{ uint16_t( (r[1] + r[3] + 4) / 8 ), uint16_t( (g[1] + g[3] + 4) / 8 ), uint16_t( (b[1] + b[3] + 4) / 8 ), 0}; |
1189 | a[3] = v4i{ uint16_t( (r[0] + r[2] + 4) / 8 ), uint16_t( (g[0] + g[2] + 4) / 8 ), uint16_t( (b[0] + b[2] + 4) / 8 ), 0}; |
1190 | #endif |
1191 | } |
1192 | |
1193 | static etcpak_force_inline void CalcErrorBlock( const uint8_t* data, unsigned int err[4][4] ) |
1194 | { |
1195 | #ifdef __SSE4_1__ |
1196 | __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0); |
1197 | __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1); |
1198 | __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2); |
1199 | __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3); |
1200 | |
1201 | __m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF)); |
1202 | __m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF)); |
1203 | __m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF)); |
1204 | __m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF)); |
1205 | |
1206 | __m128i d0l = _mm_unpacklo_epi8(dm0, _mm_setzero_si128()); |
1207 | __m128i d0h = _mm_unpackhi_epi8(dm0, _mm_setzero_si128()); |
1208 | __m128i d1l = _mm_unpacklo_epi8(dm1, _mm_setzero_si128()); |
1209 | __m128i d1h = _mm_unpackhi_epi8(dm1, _mm_setzero_si128()); |
1210 | __m128i d2l = _mm_unpacklo_epi8(dm2, _mm_setzero_si128()); |
1211 | __m128i d2h = _mm_unpackhi_epi8(dm2, _mm_setzero_si128()); |
1212 | __m128i d3l = _mm_unpacklo_epi8(dm3, _mm_setzero_si128()); |
1213 | __m128i d3h = _mm_unpackhi_epi8(dm3, _mm_setzero_si128()); |
1214 | |
1215 | __m128i sum0 = _mm_add_epi16(d0l, d1l); |
1216 | __m128i sum1 = _mm_add_epi16(d0h, d1h); |
1217 | __m128i sum2 = _mm_add_epi16(d2l, d3l); |
1218 | __m128i sum3 = _mm_add_epi16(d2h, d3h); |
1219 | |
1220 | __m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128()); |
1221 | __m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128()); |
1222 | __m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128()); |
1223 | __m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128()); |
1224 | __m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128()); |
1225 | __m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128()); |
1226 | __m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128()); |
1227 | __m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128()); |
1228 | |
1229 | __m128i b0 = _mm_add_epi32(sum0l, sum0h); |
1230 | __m128i b1 = _mm_add_epi32(sum1l, sum1h); |
1231 | __m128i b2 = _mm_add_epi32(sum2l, sum2h); |
1232 | __m128i b3 = _mm_add_epi32(sum3l, sum3h); |
1233 | |
1234 | __m128i a0 = _mm_add_epi32(b2, b3); |
1235 | __m128i a1 = _mm_add_epi32(b0, b1); |
1236 | __m128i a2 = _mm_add_epi32(b1, b3); |
1237 | __m128i a3 = _mm_add_epi32(b0, b2); |
1238 | |
1239 | _mm_storeu_si128((__m128i*)&err[0], a0); |
1240 | _mm_storeu_si128((__m128i*)&err[1], a1); |
1241 | _mm_storeu_si128((__m128i*)&err[2], a2); |
1242 | _mm_storeu_si128((__m128i*)&err[3], a3); |
1243 | #elif defined __ARM_NEON |
1244 | uint8x16x2_t t0 = vzipq_u8(vld1q_u8(data + 0), uint8x16_t()); |
1245 | uint8x16x2_t t1 = vzipq_u8(vld1q_u8(data + 16), uint8x16_t()); |
1246 | uint8x16x2_t t2 = vzipq_u8(vld1q_u8(data + 32), uint8x16_t()); |
1247 | uint8x16x2_t t3 = vzipq_u8(vld1q_u8(data + 48), uint8x16_t()); |
1248 | |
1249 | uint16x8x2_t d0 = { vreinterpretq_u16_u8(t0.val[0]), vreinterpretq_u16_u8(t0.val[1]) }; |
1250 | uint16x8x2_t d1 = { vreinterpretq_u16_u8(t1.val[0]), vreinterpretq_u16_u8(t1.val[1]) }; |
1251 | uint16x8x2_t d2 = { vreinterpretq_u16_u8(t2.val[0]), vreinterpretq_u16_u8(t2.val[1]) }; |
1252 | uint16x8x2_t d3 = { vreinterpretq_u16_u8(t3.val[0]), vreinterpretq_u16_u8(t3.val[1]) }; |
1253 | |
1254 | uint16x8x2_t s0 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[0] ), vreinterpretq_s16_u16( d1.val[0] ))), uint16x8_t()); |
1255 | uint16x8x2_t s1 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[1] ), vreinterpretq_s16_u16( d1.val[1] ))), uint16x8_t()); |
1256 | uint16x8x2_t s2 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[0] ), vreinterpretq_s16_u16( d3.val[0] ))), uint16x8_t()); |
1257 | uint16x8x2_t s3 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[1] ), vreinterpretq_s16_u16( d3.val[1] ))), uint16x8_t()); |
1258 | |
1259 | uint32x4x2_t sum0 = { vreinterpretq_u32_u16(s0.val[0]), vreinterpretq_u32_u16(s0.val[1]) }; |
1260 | uint32x4x2_t sum1 = { vreinterpretq_u32_u16(s1.val[0]), vreinterpretq_u32_u16(s1.val[1]) }; |
1261 | uint32x4x2_t sum2 = { vreinterpretq_u32_u16(s2.val[0]), vreinterpretq_u32_u16(s2.val[1]) }; |
1262 | uint32x4x2_t sum3 = { vreinterpretq_u32_u16(s3.val[0]), vreinterpretq_u32_u16(s3.val[1]) }; |
1263 | |
1264 | uint32x4_t b0 = vaddq_u32(sum0.val[0], sum0.val[1]); |
1265 | uint32x4_t b1 = vaddq_u32(sum1.val[0], sum1.val[1]); |
1266 | uint32x4_t b2 = vaddq_u32(sum2.val[0], sum2.val[1]); |
1267 | uint32x4_t b3 = vaddq_u32(sum3.val[0], sum3.val[1]); |
1268 | |
1269 | uint32x4_t a0 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b2, b3) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) ); |
1270 | uint32x4_t a1 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b0, b1) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) ); |
1271 | uint32x4_t a2 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b1, b3) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) ); |
1272 | uint32x4_t a3 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b0, b2) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) ); |
1273 | |
1274 | vst1q_u32(err[0], a0); |
1275 | vst1q_u32(err[1], a1); |
1276 | vst1q_u32(err[2], a2); |
1277 | vst1q_u32(err[3], a3); |
1278 | #else |
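// Scalar fallback: accumulate per-channel (B, G, R) sums for each 2x2
// quadrant of the block, then combine the quadrants into channel sums for
// the bottom, top, right and left half-blocks (err[0..3]). The fourth
// component of each sum is unused and cleared below.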
1279 | unsigned int terr[4][4]; |
1280 | |
1281 | memset(terr, 0, 16 * sizeof(unsigned int)); |
1282 | |
1283 | for( int j=0; j<4; j++ ) |
1284 | { |
1285 | for( int i=0; i<4; i++ ) |
1286 | { |
1287 | int index = (j & 2) + (i >> 1); |
1288 | unsigned int d = *data++; |
1289 | terr[index][0] += d; |
1290 | d = *data++; |
1291 | terr[index][1] += d; |
1292 | d = *data++; |
1293 | terr[index][2] += d; |
1294 | data++; |
1295 | } |
1296 | } |
1297 | |
1298 | for( int i=0; i<3; i++ ) |
1299 | { |
1300 | err[0][i] = terr[2][i] + terr[3][i]; |
1301 | err[1][i] = terr[0][i] + terr[1][i]; |
1302 | err[2][i] = terr[1][i] + terr[3][i]; |
1303 | err[3][i] = terr[0][i] + terr[2][i]; |
1304 | } |
1305 | for( int i=0; i<4; i++ ) |
1306 | { |
1307 | err[i][3] = 0; |
1308 | } |
1309 | #endif |
1310 | } |
1311 | |
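// Comparison metric for one half-block against a candidate average colour.
// The sum of squared pixel values is identical for every flip/diff candidate
// once both halves of the block are added together, so it is replaced by a
// large constant that merely keeps the intermediate value positive.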
1312 | static etcpak_force_inline unsigned int CalcError( const unsigned int block[4], const v4i& average ) |
1313 | { |
1314 | unsigned int err = 0x3FFFFFFF; // Big value to prevent negative values, but small enough to prevent overflow |
1315 | err -= block[0] * 2 * average[2]; |
1316 | err -= block[1] * 2 * average[1]; |
1317 | err -= block[2] * 2 * average[0]; |
1318 | err += 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) ); |
1319 | return err; |
1320 | } |
1321 | |
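// Quantizes the half-block averages in place: a[4..7] receive the
// differential-mode colours (5-bit base plus a delta clamped to the signed
// 3-bit range, expanded back to 8 bits), while a[0..3] are replaced by their
// 4-bit individual-mode equivalents, also expanded back to 8 bits.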
1322 | static etcpak_force_inline void ProcessAverages( v4i* a ) |
1323 | { |
1324 | #ifdef __SSE4_1__ |
1325 | for( int i=0; i<2; i++ ) |
1326 | { |
1327 | __m128i d = _mm_loadu_si128((__m128i*)a[i*2].data()); |
1328 | |
1329 | __m128i t = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(31)), _mm_set1_epi16(128)); |
1330 | |
1331 | __m128i c = _mm_srli_epi16(_mm_add_epi16(t, _mm_srli_epi16(t, 8)), 8); |
1332 | |
1333 | __m128i c1 = _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2)); |
1334 | __m128i diff = _mm_sub_epi16(c, c1); |
1335 | diff = _mm_max_epi16(diff, _mm_set1_epi16(-4)); |
1336 | diff = _mm_min_epi16(diff, _mm_set1_epi16(3)); |
1337 | |
1338 | __m128i co = _mm_add_epi16(c1, diff); |
1339 | |
1340 | c = _mm_blend_epi16(co, c, 0xF0); |
1341 | |
1342 | __m128i a0 = _mm_or_si128(_mm_slli_epi16(c, 3), _mm_srli_epi16(c, 2)); |
1343 | |
1344 | _mm_storeu_si128((__m128i*)a[4+i*2].data(), a0); |
1345 | } |
1346 | |
1347 | for( int i=0; i<2; i++ ) |
1348 | { |
1349 | __m128i d = _mm_loadu_si128((__m128i*)a[i*2].data()); |
1350 | |
1351 | __m128i t0 = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(15)), _mm_set1_epi16(128)); |
1352 | __m128i t1 = _mm_srli_epi16(_mm_add_epi16(t0, _mm_srli_epi16(t0, 8)), 8); |
1353 | |
1354 | __m128i t2 = _mm_or_si128(t1, _mm_slli_epi16(t1, 4)); |
1355 | |
1356 | _mm_storeu_si128((__m128i*)a[i*2].data(), t2); |
1357 | } |
1358 | #elif defined __ARM_NEON |
1359 | for( int i=0; i<2; i++ ) |
1360 | { |
1361 | int16x8_t d = vld1q_s16((int16_t*)&a[i*2]); |
1362 | int16x8_t t = vaddq_s16(vmulq_s16(d, vdupq_n_s16(31)), vdupq_n_s16(128)); |
1363 | int16x8_t c = vshrq_n_s16(vaddq_s16(t, vshrq_n_s16(t, 8)), 8); |
1364 | |
1365 | int16x8_t c1 = vcombine_s16(vget_high_s16(c), vget_high_s16(c)); |
1366 | int16x8_t diff = vsubq_s16(c, c1); |
1367 | diff = vmaxq_s16(diff, vdupq_n_s16(-4)); |
1368 | diff = vminq_s16(diff, vdupq_n_s16(3)); |
1369 | |
1370 | int16x8_t co = vaddq_s16(c1, diff); |
1371 | |
1372 | c = vcombine_s16(vget_low_s16(co), vget_high_s16(c)); |
1373 | |
1374 | int16x8_t a0 = vorrq_s16(vshlq_n_s16(c, 3), vshrq_n_s16(c, 2)); |
1375 | |
1376 | vst1q_s16((int16_t*)&a[4+i*2], a0); |
1377 | } |
1378 | |
1379 | for( int i=0; i<2; i++ ) |
1380 | { |
1381 | int16x8_t d = vld1q_s16((int16_t*)&a[i*2]); |
1382 | |
1383 | int16x8_t t0 = vaddq_s16(vmulq_s16(d, vdupq_n_s16(15)), vdupq_n_s16(128)); |
1384 | int16x8_t t1 = vshrq_n_s16(vaddq_s16(t0, vshrq_n_s16(t0, 8)), 8); |
1385 | |
1386 | int16x8_t t2 = vorrq_s16(t1, vshlq_n_s16(t1, 4)); |
1387 | |
1388 | vst1q_s16((int16_t*)&a[i*2], t2); |
1389 | } |
1390 | #else |
1391 | for( int i=0; i<2; i++ ) |
1392 | { |
1393 | for( int j=0; j<3; j++ ) |
1394 | { |
1395 | int32_t c1 = mul8bit( a[i*2+1][j], 31 ); |
1396 | int32_t c2 = mul8bit( a[i*2][j], 31 ); |
1397 | |
1398 | int32_t diff = c2 - c1; |
1399 | if( diff > 3 ) diff = 3; |
1400 | else if( diff < -4 ) diff = -4; |
1401 | |
1402 | int32_t co = c1 + diff; |
1403 | |
1404 | a[5+i*2][j] = ( c1 << 3 ) | ( c1 >> 2 ); |
1405 | a[4+i*2][j] = ( co << 3 ) | ( co >> 2 ); |
1406 | } |
1407 | } |
1408 | |
1409 | for( int i=0; i<4; i++ ) |
1410 | { |
1411 | a[i][0] = g_avg2[mul8bit( a[i][0], 15 )]; |
1412 | a[i][1] = g_avg2[mul8bit( a[i][1], 15 )]; |
1413 | a[i][2] = g_avg2[mul8bit( a[i][2], 15 )]; |
1414 | } |
1415 | #endif |
1416 | } |
1417 | |
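// Packs the selected pair of base colours into the block. Bits 24-25 record
// the chosen flip/diff configuration; depending on the diff bit the colours
// are stored either as two 4-bit-per-channel values (individual mode) or as
// a 5-bit base plus a 3-bit signed delta (differential mode).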
1418 | static etcpak_force_inline void EncodeAverages( uint64_t& _d, const v4i* a, size_t idx ) |
1419 | { |
1420 | auto d = _d; |
1421 | d |= ( idx << 24 ); |
1422 | size_t base = idx << 1; |
1423 | |
1424 | if( ( idx & 0x2 ) == 0 ) |
1425 | { |
1426 | for( int i=0; i<3; i++ ) |
1427 | { |
1428 | d |= uint64_t( a[base+0][i] >> 4 ) << ( i*8 ); |
1429 | d |= uint64_t( a[base+1][i] >> 4 ) << ( i*8 + 4 ); |
1430 | } |
1431 | } |
1432 | else |
1433 | { |
1434 | for( int i=0; i<3; i++ ) |
1435 | { |
1436 | d |= uint64_t( a[base+1][i] & 0xF8 ) << ( i*8 ); |
1437 | int32_t c = ( ( a[base+0][i] & 0xF8 ) - ( a[base+1][i] & 0xF8 ) ) >> 3; |
1438 | c &= ~0xFFFFFFF8; |
1439 | d |= ((uint64_t)c) << ( i*8 ); |
1440 | } |
1441 | } |
1442 | _d = d; |
1443 | } |
1444 | |
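// Fast path for uniform blocks: if all 16 pixels are identical, return a
// differential-mode block holding the colour at 5-bit precision with zero
// deltas and zero selector bits; otherwise return 0 so the full search runs.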
1445 | static etcpak_force_inline uint64_t CheckSolid( const uint8_t* src ) |
1446 | { |
1447 | #ifdef __SSE4_1__ |
1448 | __m128i d0 = _mm_loadu_si128(((__m128i*)src) + 0); |
1449 | __m128i d1 = _mm_loadu_si128(((__m128i*)src) + 1); |
1450 | __m128i d2 = _mm_loadu_si128(((__m128i*)src) + 2); |
1451 | __m128i d3 = _mm_loadu_si128(((__m128i*)src) + 3); |
1452 | |
1453 | __m128i c = _mm_shuffle_epi32(d0, _MM_SHUFFLE(0, 0, 0, 0)); |
1454 | |
1455 | __m128i c0 = _mm_cmpeq_epi8(d0, c); |
1456 | __m128i c1 = _mm_cmpeq_epi8(d1, c); |
1457 | __m128i c2 = _mm_cmpeq_epi8(d2, c); |
1458 | __m128i c3 = _mm_cmpeq_epi8(d3, c); |
1459 | |
1460 | __m128i m0 = _mm_and_si128(c0, c1); |
1461 | __m128i m1 = _mm_and_si128(c2, c3); |
1462 | __m128i m = _mm_and_si128(m0, m1); |
1463 | |
1464 | if (!_mm_testc_si128(m, _mm_set1_epi32(-1))) |
1465 | { |
1466 | return 0; |
1467 | } |
1468 | #elif defined __ARM_NEON |
1469 | int32x4_t d0 = vld1q_s32((int32_t*)src + 0); |
1470 | int32x4_t d1 = vld1q_s32((int32_t*)src + 4); |
1471 | int32x4_t d2 = vld1q_s32((int32_t*)src + 8); |
1472 | int32x4_t d3 = vld1q_s32((int32_t*)src + 12); |
1473 | |
1474 | int32x4_t c = vdupq_n_s32(d0[0]); |
1475 | |
1476 | int32x4_t c0 = vreinterpretq_s32_u32(vceqq_s32(d0, c)); |
1477 | int32x4_t c1 = vreinterpretq_s32_u32(vceqq_s32(d1, c)); |
1478 | int32x4_t c2 = vreinterpretq_s32_u32(vceqq_s32(d2, c)); |
1479 | int32x4_t c3 = vreinterpretq_s32_u32(vceqq_s32(d3, c)); |
1480 | |
1481 | int32x4_t m0 = vandq_s32(c0, c1); |
1482 | int32x4_t m1 = vandq_s32(c2, c3); |
1483 | int64x2_t m = vreinterpretq_s64_s32(vandq_s32(m0, m1)); |
1484 | |
1485 | if (m[0] != -1 || m[1] != -1) |
1486 | { |
1487 | return 0; |
1488 | } |
1489 | #else |
1490 | const uint8_t* ptr = src + 4; |
1491 | for( int i=1; i<16; i++ ) |
1492 | { |
1493 | if( memcmp( src, ptr, 4 ) != 0 ) |
1494 | { |
1495 | return 0; |
1496 | } |
1497 | ptr += 4; |
1498 | } |
1499 | #endif |
1500 | return 0x02000000 | |
1501 | ( (unsigned int)( src[0] & 0xF8 ) << 16 ) | |
1502 | ( (unsigned int)( src[1] & 0xF8 ) << 8 ) | |
1503 | ( (unsigned int)( src[2] & 0xF8 ) ); |
1504 | } |
1505 | |
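// Computes the average colour of each half-block for both split orientations,
// quantizes the averages for the individual and differential encodings, and
// accumulates the comparison metric for the four resulting flip/diff
// candidates into err[0..3].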
1506 | static etcpak_force_inline void PrepareAverages( v4i a[8], const uint8_t* src, unsigned int err[4] ) |
1507 | { |
1508 | Average( src, a ); |
1509 | ProcessAverages( a ); |
1510 | |
1511 | unsigned int errblock[4][4]; |
1512 | CalcErrorBlock( src, errblock ); |
1513 | |
1514 | for( int i=0; i<4; i++ ) |
1515 | { |
1516 | err[i/2] += CalcError( errblock[i], a[i] ); |
1517 | err[2+i/2] += CalcError( errblock[i], a[i+4] ); |
1518 | } |
1519 | } |
1520 | |
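// Reference selector search: for every pixel, compute the luma-weighted
// distance (weights 77/151/28) to its half-block average, then test all four
// entries of each of the eight ETC1 modifier tables, storing the best 2-bit
// selector per table in tsel and accumulating the squared error per table in
// terr so the cheapest table can be chosen afterwards.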
1521 | static etcpak_force_inline void FindBestFit( uint64_t terr[2][8], uint16_t tsel[16][8], v4i a[8], const uint32_t* id, const uint8_t* data ) |
1522 | { |
1523 | for( size_t i=0; i<16; i++ ) |
1524 | { |
1525 | uint16_t* sel = tsel[i]; |
1526 | unsigned int bid = id[i]; |
1527 | uint64_t* ter = terr[bid%2]; |
1528 | |
1529 | uint8_t b = *data++; |
1530 | uint8_t g = *data++; |
1531 | uint8_t r = *data++; |
1532 | data++; |
1533 | |
1534 | int dr = a[bid][0] - r; |
1535 | int dg = a[bid][1] - g; |
1536 | int db = a[bid][2] - b; |
1537 | |
1538 | #ifdef __SSE4_1__ |
1539 | // Reference implementation |
1540 | |
1541 | __m128i pix = _mm_set1_epi32(dr * 77 + dg * 151 + db * 28); |
1542 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
1543 | __m128i error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[0])); |
1544 | __m128i error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[1])); |
1545 | __m128i error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[0])); |
1546 | __m128i error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[1])); |
1547 | |
1548 | __m128i index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1)); |
1549 | __m128i minError0 = _mm_min_epi32(error0, error1); |
1550 | |
1551 | __m128i index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2)); |
1552 | __m128i minError1 = _mm_min_epi32(error2, error3); |
1553 | |
1554 | __m128i minIndex0 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0)); |
1555 | __m128i minError = _mm_min_epi32(minError0, minError1); |
1556 | |
1557 | // Squaring the minimum error to produce correct values when adding |
1558 | __m128i minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0)); |
1559 | __m128i squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow); |
1560 | squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0)); |
1561 | _mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow); |
1562 | __m128i minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2)); |
1563 | __m128i squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh); |
1564 | squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1)); |
1565 | _mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh); |
1566 | |
1567 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
1568 | error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[2])); |
1569 | error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[3])); |
1570 | error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[2])); |
1571 | error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[3])); |
1572 | |
1573 | index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1)); |
1574 | minError0 = _mm_min_epi32(error0, error1); |
1575 | |
1576 | index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2)); |
1577 | minError1 = _mm_min_epi32(error2, error3); |
1578 | |
1579 | __m128i minIndex1 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0)); |
1580 | minError = _mm_min_epi32(minError0, minError1); |
1581 | |
1582 | // Squaring the minimum error to produce correct values when adding |
1583 | minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0)); |
1584 | squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow); |
1585 | squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 2)); |
1586 | _mm_storeu_si128(((__m128i*)ter) + 2, squareErrorLow); |
1587 | minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2)); |
1588 | squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh); |
1589 | squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 3)); |
1590 | _mm_storeu_si128(((__m128i*)ter) + 3, squareErrorHigh); |
1591 | __m128i minIndex = _mm_packs_epi32(minIndex0, minIndex1); |
1592 | _mm_storeu_si128((__m128i*)sel, minIndex); |
1593 | #elif defined __ARM_NEON |
1594 | int32x4_t pix = vdupq_n_s32(dr * 77 + dg * 151 + db * 28); |
1595 | |
1596 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
1597 | uint32x4_t error0 = vreinterpretq_u32_s32(vabsq_s32(vaddq_s32(pix, g_table256_NEON[0]))); |
1598 | uint32x4_t error1 = vreinterpretq_u32_s32(vabsq_s32(vaddq_s32(pix, g_table256_NEON[1]))); |
1599 | uint32x4_t error2 = vreinterpretq_u32_s32(vabsq_s32(vsubq_s32(pix, g_table256_NEON[0]))); |
1600 | uint32x4_t error3 = vreinterpretq_u32_s32(vabsq_s32(vsubq_s32(pix, g_table256_NEON[1]))); |
1601 | |
1602 | uint32x4_t index0 = vandq_u32(vcltq_u32(error1, error0), vdupq_n_u32(1)); |
1603 | uint32x4_t minError0 = vminq_u32(error0, error1); |
1604 | |
1605 | uint32x4_t index1 = vreinterpretq_u32_s32(vsubq_s32(vdupq_n_s32(2), vreinterpretq_s32_u32(vcltq_u32(error3, error2)))); |
1606 | uint32x4_t minError1 = vminq_u32(error2, error3); |
1607 | |
1608 | uint32x4_t blendMask = vcltq_u32(minError1, minError0); |
1609 | uint32x4_t minIndex0 = vorrq_u32(vbicq_u32(index0, blendMask), vandq_u32(index1, blendMask)); |
1610 | uint32x4_t minError = vminq_u32(minError0, minError1); |
1611 | |
1612 | // Squaring the minimum error to produce correct values when adding |
1613 | uint32x4_t squareErrorLow = vmulq_u32(minError, minError); |
1614 | uint32x4_t squareErrorHigh = vshrq_n_u32(vreinterpretq_u32_s32(vqdmulhq_s32(vreinterpretq_s32_u32(minError), vreinterpretq_s32_u32(minError))), 1); |
1615 | uint32x4x2_t squareErrorZip = vzipq_u32(squareErrorLow, squareErrorHigh); |
1616 | uint64x2x2_t squareError = { vreinterpretq_u64_u32(squareErrorZip.val[0]), vreinterpretq_u64_u32(squareErrorZip.val[1]) }; |
1617 | squareError.val[0] = vaddq_u64(squareError.val[0], vld1q_u64(ter + 0)); |
1618 | squareError.val[1] = vaddq_u64(squareError.val[1], vld1q_u64(ter + 2)); |
1619 | vst1q_u64(ter + 0, squareError.val[0]); |
1620 | vst1q_u64(ter + 2, squareError.val[1]); |
1621 | |
1622 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
1623 | error0 = vreinterpretq_u32_s32( vabsq_s32(vaddq_s32(pix, g_table256_NEON[2]))); |
1624 | error1 = vreinterpretq_u32_s32( vabsq_s32(vaddq_s32(pix, g_table256_NEON[3]))); |
1625 | error2 = vreinterpretq_u32_s32( vabsq_s32(vsubq_s32(pix, g_table256_NEON[2]))); |
1626 | error3 = vreinterpretq_u32_s32( vabsq_s32(vsubq_s32(pix, g_table256_NEON[3]))); |
1627 | |
1628 | index0 = vandq_u32(vcltq_u32(error1, error0), vdupq_n_u32(1)); |
1629 | minError0 = vminq_u32(error0, error1); |
1630 | |
1631 | index1 = vreinterpretq_u32_s32( vsubq_s32(vdupq_n_s32(2), vreinterpretq_s32_u32(vcltq_u32(error3, error2))) ); |
1632 | minError1 = vminq_u32(error2, error3); |
1633 | |
1634 | blendMask = vcltq_u32(minError1, minError0); |
1635 | uint32x4_t minIndex1 = vorrq_u32(vbicq_u32(index0, blendMask), vandq_u32(index1, blendMask)); |
1636 | minError = vminq_u32(minError0, minError1); |
1637 | |
1638 | // Squaring the minimum error to produce correct values when adding |
1639 | squareErrorLow = vmulq_u32(minError, minError); |
1640 | squareErrorHigh = vshrq_n_u32(vreinterpretq_u32_s32( vqdmulhq_s32(vreinterpretq_s32_u32(minError), vreinterpretq_s32_u32(minError)) ), 1 ); |
1641 | squareErrorZip = vzipq_u32(squareErrorLow, squareErrorHigh); |
1642 | squareError.val[0] = vaddq_u64(vreinterpretq_u64_u32( squareErrorZip.val[0] ), vld1q_u64(ter + 4)); |
1643 | squareError.val[1] = vaddq_u64(vreinterpretq_u64_u32( squareErrorZip.val[1] ), vld1q_u64(ter + 6)); |
1644 | vst1q_u64(ter + 4, squareError.val[0]); |
1645 | vst1q_u64(ter + 6, squareError.val[1]); |
1646 | |
1647 | uint16x8_t minIndex = vcombine_u16(vqmovn_u32(minIndex0), vqmovn_u32(minIndex1)); |
1648 | vst1q_u16(sel, minIndex); |
1649 | #else |
1650 | int pix = dr * 77 + dg * 151 + db * 28; |
1651 | |
1652 | for( int t=0; t<8; t++ ) |
1653 | { |
1654 | const int64_t* tab = g_table256[t]; |
1655 | unsigned int idx = 0; |
1656 | uint64_t err = sq( tab[0] + pix ); |
1657 | for( int j=1; j<4; j++ ) |
1658 | { |
1659 | uint64_t local = sq( tab[j] + pix ); |
1660 | if( local < err ) |
1661 | { |
1662 | err = local; |
1663 | idx = j; |
1664 | } |
1665 | } |
1666 | *sel++ = idx; |
1667 | *ter++ += err; |
1668 | } |
1669 | #endif |
1670 | } |
1671 | } |
1672 | |
1673 | #if defined __SSE4_1__ || defined __ARM_NEON |
1674 | // Non-reference implementation, but faster. Produces same results as the AVX2 version |
1675 | static etcpak_force_inline void FindBestFit( uint32_t terr[2][8], uint16_t tsel[16][8], v4i a[8], const uint32_t* id, const uint8_t* data ) |
1676 | { |
1677 | for( size_t i=0; i<16; i++ ) |
1678 | { |
1679 | uint16_t* sel = tsel[i]; |
1680 | unsigned int bid = id[i]; |
1681 | uint32_t* ter = terr[bid%2]; |
1682 | |
1683 | uint8_t b = *data++; |
1684 | uint8_t g = *data++; |
1685 | uint8_t r = *data++; |
1686 | data++; |
1687 | |
1688 | int dr = a[bid][0] - r; |
1689 | int dg = a[bid][1] - g; |
1690 | int db = a[bid][2] - b; |
1691 | |
1692 | #ifdef __SSE4_1__ |
// The scaling values are halved and rounded so that the differences stay within the range of a signed int16.
// This produces slightly different results, but is significantly faster.
1695 | __m128i pixel = _mm_set1_epi16(dr * 38 + dg * 76 + db * 14); |
1696 | __m128i pix = _mm_abs_epi16(pixel); |
1697 | |
1698 | // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. |
1699 | // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. |
1700 | __m128i error0 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[0])); |
1701 | __m128i error1 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[1])); |
1702 | |
1703 | __m128i index = _mm_and_si128(_mm_cmplt_epi16(error1, error0), _mm_set1_epi16(1)); |
1704 | __m128i minError = _mm_min_epi16(error0, error1); |
1705 | |
// Exploit the symmetry of the selector table and use the sign bit.
// This produces slightly different results, but is needed to match the AVX2 implementation.
1708 | __m128i indexBit = _mm_andnot_si128(_mm_srli_epi16(pixel, 15), _mm_set1_epi8(-1)); |
1709 | __m128i minIndex = _mm_or_si128(index, _mm_add_epi16(indexBit, indexBit)); |
1710 | |
1711 | // Squaring the minimum error to produce correct values when adding |
1712 | __m128i squareErrorLo = _mm_mullo_epi16(minError, minError); |
1713 | __m128i squareErrorHi = _mm_mulhi_epi16(minError, minError); |
1714 | |
1715 | __m128i squareErrorLow = _mm_unpacklo_epi16(squareErrorLo, squareErrorHi); |
1716 | __m128i squareErrorHigh = _mm_unpackhi_epi16(squareErrorLo, squareErrorHi); |
1717 | |
1718 | squareErrorLow = _mm_add_epi32(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0)); |
1719 | _mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow); |
1720 | squareErrorHigh = _mm_add_epi32(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1)); |
1721 | _mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh); |
1722 | |
1723 | _mm_storeu_si128((__m128i*)sel, minIndex); |
1724 | #elif defined __ARM_NEON |
1725 | int16x8_t pixel = vdupq_n_s16( dr * 38 + dg * 76 + db * 14 ); |
1726 | int16x8_t pix = vabsq_s16( pixel ); |
1727 | |
1728 | int16x8_t error0 = vabsq_s16( vsubq_s16( pix, g_table128_NEON[0] ) ); |
1729 | int16x8_t error1 = vabsq_s16( vsubq_s16( pix, g_table128_NEON[1] ) ); |
1730 | |
1731 | int16x8_t index = vandq_s16( vreinterpretq_s16_u16( vcltq_s16( error1, error0 ) ), vdupq_n_s16( 1 ) ); |
1732 | int16x8_t minError = vminq_s16( error0, error1 ); |
1733 | |
1734 | int16x8_t indexBit = vandq_s16( vmvnq_s16( vshrq_n_s16( pixel, 15 ) ), vdupq_n_s16( -1 ) ); |
1735 | int16x8_t minIndex = vorrq_s16( index, vaddq_s16( indexBit, indexBit ) ); |
1736 | |
1737 | int16x4_t minErrorLow = vget_low_s16( minError ); |
1738 | int16x4_t minErrorHigh = vget_high_s16( minError ); |
1739 | |
1740 | int32x4_t squareErrorLow = vmull_s16( minErrorLow, minErrorLow ); |
1741 | int32x4_t squareErrorHigh = vmull_s16( minErrorHigh, minErrorHigh ); |
1742 | |
1743 | int32x4_t squareErrorSumLow = vaddq_s32( squareErrorLow, vld1q_s32( (int32_t*)ter ) ); |
1744 | int32x4_t squareErrorSumHigh = vaddq_s32( squareErrorHigh, vld1q_s32( (int32_t*)ter + 4 ) ); |
1745 | |
1746 | vst1q_s32( (int32_t*)ter, squareErrorSumLow ); |
1747 | vst1q_s32( (int32_t*)ter + 4, squareErrorSumHigh ); |
1748 | |
1749 | vst1q_s16( (int16_t*)sel, minIndex ); |
1750 | #endif |
1751 | } |
1752 | } |
1753 | #endif |
1754 | |
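// convert6/convert7 quantize a planar-mode colour component (computed at
// roughly four times the 8-bit scale) down to 6 or 7 bits: clamp to
// [0, 1023], halve with a bias of -15, then reduce the width using
// correction terms that adjust the rounding.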
1755 | static etcpak_force_inline uint8_t convert6(float f) |
1756 | { |
1757 | int i = (std::min(std::max(static_cast<int>(f), 0), 1023) - 15) >> 1; |
1758 | return (i + 11 - ((i + 11) >> 7) - ((i + 4) >> 7)) >> 3; |
1759 | } |
1760 | |
1761 | static etcpak_force_inline uint8_t convert7(float f) |
1762 | { |
1763 | int i = (std::min(std::max(static_cast<int>(f), 0), 1023) - 15) >> 1; |
1764 | return (i + 9 - ((i + 9) >> 8) - ((i + 6) >> 8)) >> 2; |
1765 | } |
1766 | |
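// Planar-mode encoder: fits a linear colour gradient RGB(x, y) = O + H*x + V*y
// to the 4x4 block (effectively a per-channel least-squares fit), quantizes
// the three anchor colours (RGBO, RGBH, RGBV) to R6G7B6, optionally estimates
// the luma-weighted reconstruction error (skipped when heuristics already
// selected planar mode or are disabled), and packs the 64-bit block payload.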
1767 | static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar( const uint8_t* src, const uint8_t mode, bool useHeuristics ) |
1768 | { |
1769 | int32_t r = 0; |
1770 | int32_t g = 0; |
1771 | int32_t b = 0; |
1772 | |
1773 | for( int i = 0; i < 16; ++i ) |
1774 | { |
1775 | b += src[i * 4 + 0]; |
1776 | g += src[i * 4 + 1]; |
1777 | r += src[i * 4 + 2]; |
1778 | } |
1779 | |
1780 | int32_t difRyz = 0; |
1781 | int32_t difGyz = 0; |
1782 | int32_t difByz = 0; |
1783 | int32_t difRxz = 0; |
1784 | int32_t difGxz = 0; |
1785 | int32_t difBxz = 0; |
1786 | |
1787 | const int32_t scaling[] = { -255, -85, 85, 255 }; |
1788 | |
1789 | for (int i = 0; i < 16; ++i) |
1790 | { |
1791 | int32_t difB = (static_cast<int>(src[i * 4 + 0]) << 4) - b; |
1792 | int32_t difG = (static_cast<int>(src[i * 4 + 1]) << 4) - g; |
1793 | int32_t difR = (static_cast<int>(src[i * 4 + 2]) << 4) - r; |
1794 | |
1795 | difRyz += difR * scaling[i % 4]; |
1796 | difGyz += difG * scaling[i % 4]; |
1797 | difByz += difB * scaling[i % 4]; |
1798 | |
1799 | difRxz += difR * scaling[i / 4]; |
1800 | difGxz += difG * scaling[i / 4]; |
1801 | difBxz += difB * scaling[i / 4]; |
1802 | } |
1803 | |
1804 | const float scale = -4.0f / ((255 * 255 * 8.0f + 85 * 85 * 8.0f) * 16.0f); |
1805 | |
1806 | float aR = difRxz * scale; |
1807 | float aG = difGxz * scale; |
1808 | float aB = difBxz * scale; |
1809 | |
1810 | float bR = difRyz * scale; |
1811 | float bG = difGyz * scale; |
1812 | float bB = difByz * scale; |
1813 | |
1814 | float dR = r * (4.0f / 16.0f); |
1815 | float dG = g * (4.0f / 16.0f); |
1816 | float dB = b * (4.0f / 16.0f); |
1817 | |
1818 | // calculating the three colors RGBO, RGBH, and RGBV. RGB = df - af * x - bf * y; |
1819 | float cofR = std::fma(aR, 255.0f, std::fma(bR, 255.0f, dR)); |
1820 | float cofG = std::fma(aG, 255.0f, std::fma(bG, 255.0f, dG)); |
1821 | float cofB = std::fma(aB, 255.0f, std::fma(bB, 255.0f, dB)); |
1822 | float chfR = std::fma(aR, -425.0f, std::fma(bR, 255.0f, dR)); |
1823 | float chfG = std::fma(aG, -425.0f, std::fma(bG, 255.0f, dG)); |
1824 | float chfB = std::fma(aB, -425.0f, std::fma(bB, 255.0f, dB)); |
1825 | float cvfR = std::fma(aR, 255.0f, std::fma(bR, -425.0f, dR)); |
1826 | float cvfG = std::fma(aG, 255.0f, std::fma(bG, -425.0f, dG)); |
1827 | float cvfB = std::fma(aB, 255.0f, std::fma(bB, -425.0f, dB)); |
1828 | |
1829 | // convert to r6g7b6 |
1830 | int32_t coR = convert6(cofR); |
1831 | int32_t coG = convert7(cofG); |
1832 | int32_t coB = convert6(cofB); |
1833 | int32_t chR = convert6(chfR); |
1834 | int32_t chG = convert7(chfG); |
1835 | int32_t chB = convert6(chfB); |
1836 | int32_t cvR = convert6(cvfR); |
1837 | int32_t cvG = convert7(cvfG); |
1838 | int32_t cvB = convert6(cvfB); |
1839 | |
1840 | // Error calculation |
1841 | uint64_t error = 0; |
1842 | if( ModePlanar != mode && useHeuristics ) |
1843 | { |
1844 | auto ro0 = coR; |
1845 | auto go0 = coG; |
1846 | auto bo0 = coB; |
1847 | auto ro1 = ( ro0 >> 4 ) | ( ro0 << 2 ); |
1848 | auto go1 = ( go0 >> 6 ) | ( go0 << 1 ); |
1849 | auto bo1 = ( bo0 >> 4 ) | ( bo0 << 2 ); |
1850 | auto ro2 = ( ro1 << 2 ) + 2; |
1851 | auto go2 = ( go1 << 2 ) + 2; |
1852 | auto bo2 = ( bo1 << 2 ) + 2; |
1853 | |
1854 | auto rh0 = chR; |
1855 | auto gh0 = chG; |
1856 | auto bh0 = chB; |
1857 | auto rh1 = ( rh0 >> 4 ) | ( rh0 << 2 ); |
1858 | auto gh1 = ( gh0 >> 6 ) | ( gh0 << 1 ); |
1859 | auto bh1 = ( bh0 >> 4 ) | ( bh0 << 2 ); |
1860 | |
1861 | auto rh2 = rh1 - ro1; |
1862 | auto gh2 = gh1 - go1; |
1863 | auto bh2 = bh1 - bo1; |
1864 | |
1865 | auto rv0 = cvR; |
1866 | auto gv0 = cvG; |
1867 | auto bv0 = cvB; |
1868 | auto rv1 = ( rv0 >> 4 ) | ( rv0 << 2 ); |
1869 | auto gv1 = ( gv0 >> 6 ) | ( gv0 << 1 ); |
1870 | auto bv1 = ( bv0 >> 4 ) | ( bv0 << 2 ); |
1871 | |
1872 | auto rv2 = rv1 - ro1; |
1873 | auto gv2 = gv1 - go1; |
1874 | auto bv2 = bv1 - bo1; |
1875 | for( int i = 0; i < 16; ++i ) |
1876 | { |
1877 | int32_t cR = clampu8( ( rh2 * ( i / 4 ) + rv2 * ( i % 4 ) + ro2 ) >> 2 ); |
1878 | int32_t cG = clampu8( ( gh2 * ( i / 4 ) + gv2 * ( i % 4 ) + go2 ) >> 2 ); |
1879 | int32_t cB = clampu8( ( bh2 * ( i / 4 ) + bv2 * ( i % 4 ) + bo2 ) >> 2 ); |
1880 | |
1881 | int32_t difB = static_cast<int>( src[i * 4 + 0] ) - cB; |
1882 | int32_t difG = static_cast<int>( src[i * 4 + 1] ) - cG; |
1883 | int32_t difR = static_cast<int>( src[i * 4 + 2] ) - cR; |
1884 | |
1885 | int32_t dif = difR * 38 + difG * 76 + difB * 14; |
1886 | |
1887 | error += dif * dif; |
1888 | } |
1889 | } |
1890 | |
1891 | /**/ |
1892 | uint32_t rgbv = cvB | ( cvG << 6 ) | ( cvR << 13 ); |
1893 | uint32_t rgbh = chB | ( chG << 6 ) | ( chR << 13 ); |
1894 | uint32_t hi = rgbv | ( ( rgbh & 0x1FFF ) << 19 ); |
1895 | uint32_t lo = ( chR & 0x1 ) | 0x2 | ( ( chR << 1 ) & 0x7C ); |
1896 | lo |= ( ( coB & 0x07 ) << 7 ) | ( ( coB & 0x18 ) << 8 ) | ( ( coB & 0x20 ) << 11 ); |
1897 | lo |= ( ( coG & 0x3F ) << 17 ) | ( ( coG & 0x40 ) << 18 ); |
1898 | lo |= coR << 25; |
1899 | |
1900 | const auto idx = ( coR & 0x20 ) | ( ( coG & 0x20 ) >> 1 ) | ( ( coB & 0x1E ) >> 1 ); |
1901 | |
1902 | lo |= g_flags[idx]; |
1903 | |
1904 | uint64_t result = static_cast<uint32_t>( _bswap( lo ) ); |
1905 | result |= static_cast<uint64_t>( static_cast<uint32_t>( _bswap( hi ) ) ) << 32; |
1906 | |
1907 | return std::make_pair( result, error ); |
1908 | } |
1909 | |
1910 | #ifdef __ARM_NEON |
1911 | |
1912 | static etcpak_force_inline int32x2_t Planar_NEON_DifXZ( int16x8_t dif_lo, int16x8_t dif_hi ) |
1913 | { |
1914 | int32x4_t dif0 = vmull_n_s16( vget_low_s16( dif_lo ), -255 ); |
1915 | int32x4_t dif1 = vmull_n_s16( vget_high_s16( dif_lo ), -85 ); |
1916 | int32x4_t dif2 = vmull_n_s16( vget_low_s16( dif_hi ), 85 ); |
1917 | int32x4_t dif3 = vmull_n_s16( vget_high_s16( dif_hi ), 255 ); |
1918 | int32x4_t dif4 = vaddq_s32( vaddq_s32( dif0, dif1 ), vaddq_s32( dif2, dif3 ) ); |
1919 | |
1920 | #ifndef __aarch64__ |
1921 | int32x2_t dif5 = vpadd_s32( vget_low_s32( dif4 ), vget_high_s32( dif4 ) ); |
1922 | return vpadd_s32( dif5, dif5 ); |
1923 | #else |
1924 | return vdup_n_s32( vaddvq_s32( dif4 ) ); |
1925 | #endif |
1926 | } |
1927 | |
1928 | static etcpak_force_inline int32x2_t Planar_NEON_DifYZ( int16x8_t dif_lo, int16x8_t dif_hi ) |
1929 | { |
1930 | int16x4_t scaling = { -255, -85, 85, 255 }; |
1931 | int32x4_t dif0 = vmull_s16( vget_low_s16( dif_lo ), scaling ); |
1932 | int32x4_t dif1 = vmull_s16( vget_high_s16( dif_lo ), scaling ); |
1933 | int32x4_t dif2 = vmull_s16( vget_low_s16( dif_hi ), scaling ); |
1934 | int32x4_t dif3 = vmull_s16( vget_high_s16( dif_hi ), scaling ); |
1935 | int32x4_t dif4 = vaddq_s32( vaddq_s32( dif0, dif1 ), vaddq_s32( dif2, dif3 ) ); |
1936 | |
1937 | #ifndef __aarch64__ |
1938 | int32x2_t dif5 = vpadd_s32( vget_low_s32( dif4 ), vget_high_s32( dif4 ) ); |
1939 | return vpadd_s32( dif5, dif5 ); |
1940 | #else |
1941 | return vdup_n_s32( vaddvq_s32( dif4 ) ); |
1942 | #endif |
1943 | } |
1944 | |
1945 | static etcpak_force_inline int16x8_t Planar_NEON_SumWide( uint8x16_t src ) |
1946 | { |
1947 | uint16x8_t accu8 = vpaddlq_u8( src ); |
1948 | #ifndef __aarch64__ |
1949 | uint16x4_t accu4 = vpadd_u16( vget_low_u16( accu8 ), vget_high_u16( accu8 ) ); |
1950 | uint16x4_t accu2 = vpadd_u16( accu4, accu4 ); |
1951 | uint16x4_t accu1 = vpadd_u16( accu2, accu2 ); |
1952 | return vreinterpretq_s16_u16( vcombine_u16( accu1, accu1 ) ); |
1953 | #else |
1954 | return vdupq_n_s16( vaddvq_u16( accu8 ) ); |
1955 | #endif |
1956 | } |
1957 | |
1958 | static etcpak_force_inline int16x8_t convert6_NEON( int32x4_t lo, int32x4_t hi ) |
1959 | { |
1960 | uint16x8_t x = vcombine_u16( vqmovun_s32( lo ), vqmovun_s32( hi ) ); |
1961 | int16x8_t i = vreinterpretq_s16_u16( vshrq_n_u16( vqshlq_n_u16( x, 6 ), 6) ); // clamp 0-1023 |
1962 | i = vhsubq_s16( i, vdupq_n_s16( 15 ) ); |
1963 | |
1964 | int16x8_t ip11 = vaddq_s16( i, vdupq_n_s16( 11 ) ); |
1965 | int16x8_t ip4 = vaddq_s16( i, vdupq_n_s16( 4 ) ); |
1966 | |
1967 | return vshrq_n_s16( vsubq_s16( vsubq_s16( ip11, vshrq_n_s16( ip11, 7 ) ), vshrq_n_s16( ip4, 7) ), 3 ); |
1968 | } |
1969 | |
1970 | static etcpak_force_inline int16x4_t convert7_NEON( int32x4_t x ) |
1971 | { |
1972 | int16x4_t i = vreinterpret_s16_u16( vshr_n_u16( vqshl_n_u16( vqmovun_s32( x ), 6 ), 6 ) ); // clamp 0-1023 |
1973 | i = vhsub_s16( i, vdup_n_s16( 15 ) ); |
1974 | |
1975 | int16x4_t p9 = vadd_s16( i, vdup_n_s16( 9 ) ); |
1976 | int16x4_t p6 = vadd_s16( i, vdup_n_s16( 6 ) ); |
1977 | return vshr_n_s16( vsub_s16( vsub_s16( p9, vshr_n_s16( p9, 8 ) ), vshr_n_s16( p6, 8 ) ), 2 ); |
1978 | } |
1979 | |
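// NEON port of Planar(): the same gradient fit, quantization and optional
// error estimate, carried out in vector registers.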
1980 | static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar_NEON( const uint8_t* src, const uint8_t mode, bool useHeuristics ) |
1981 | { |
1982 | uint8x16x4_t srcBlock = vld4q_u8( src ); |
1983 | |
1984 | int16x8_t bSumWide = Planar_NEON_SumWide( srcBlock.val[0] ); |
1985 | int16x8_t gSumWide = Planar_NEON_SumWide( srcBlock.val[1] ); |
1986 | int16x8_t rSumWide = Planar_NEON_SumWide( srcBlock.val[2] ); |
1987 | |
1988 | int16x8_t dif_R_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[2] ), 4) ), rSumWide ); |
1989 | int16x8_t dif_R_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[2] ), 4) ), rSumWide ); |
1990 | |
1991 | int16x8_t dif_G_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[1] ), 4 ) ), gSumWide ); |
1992 | int16x8_t dif_G_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[1] ), 4 ) ), gSumWide ); |
1993 | |
1994 | int16x8_t dif_B_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[0] ), 4) ), bSumWide ); |
1995 | int16x8_t dif_B_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[0] ), 4) ), bSumWide ); |
1996 | |
1997 | int32x2x2_t dif_xz_z = vzip_s32( vzip_s32( Planar_NEON_DifXZ( dif_B_lo, dif_B_hi ), Planar_NEON_DifXZ( dif_R_lo, dif_R_hi ) ).val[0], Planar_NEON_DifXZ( dif_G_lo, dif_G_hi ) ); |
1998 | int32x4_t dif_xz = vcombine_s32( dif_xz_z.val[0], dif_xz_z.val[1] ); |
1999 | int32x2x2_t dif_yz_z = vzip_s32( vzip_s32( Planar_NEON_DifYZ( dif_B_lo, dif_B_hi ), Planar_NEON_DifYZ( dif_R_lo, dif_R_hi ) ).val[0], Planar_NEON_DifYZ( dif_G_lo, dif_G_hi ) ); |
2000 | int32x4_t dif_yz = vcombine_s32( dif_yz_z.val[0], dif_yz_z.val[1] ); |
2001 | |
2002 | const float fscale = -4.0f / ( (255 * 255 * 8.0f + 85 * 85 * 8.0f ) * 16.0f ); |
2003 | float32x4_t fa = vmulq_n_f32( vcvtq_f32_s32( dif_xz ), fscale ); |
2004 | float32x4_t fb = vmulq_n_f32( vcvtq_f32_s32( dif_yz ), fscale ); |
2005 | int16x4_t bgrgSum = vzip_s16( vzip_s16( vget_low_s16( bSumWide ), vget_low_s16( rSumWide ) ).val[0], vget_low_s16( gSumWide ) ).val[0]; |
2006 | float32x4_t fd = vmulq_n_f32( vcvtq_f32_s32( vmovl_s16( bgrgSum ) ), 4.0f / 16.0f); |
2007 | |
2008 | float32x4_t cof = vmlaq_n_f32( vmlaq_n_f32( fd, fb, 255.0f ), fa, 255.0f ); |
2009 | float32x4_t chf = vmlaq_n_f32( vmlaq_n_f32( fd, fb, 255.0f ), fa, -425.0f ); |
2010 | float32x4_t cvf = vmlaq_n_f32( vmlaq_n_f32( fd, fb, -425.0f ), fa, 255.0f ); |
2011 | |
2012 | int32x4_t coi = vcvtq_s32_f32( cof ); |
2013 | int32x4_t chi = vcvtq_s32_f32( chf ); |
2014 | int32x4_t cvi = vcvtq_s32_f32( cvf ); |
2015 | |
2016 | int32x4x2_t tr_hv = vtrnq_s32( chi, cvi ); |
2017 | int32x4x2_t tr_o = vtrnq_s32( coi, coi ); |
2018 | |
2019 | int16x8_t c_hvoo_br_6 = convert6_NEON( tr_hv.val[0], tr_o.val[0] ); |
2020 | int16x4_t c_hvox_g_7 = convert7_NEON( vcombine_s32( vget_low_s32( tr_hv.val[1] ), vget_low_s32( tr_o.val[1] ) ) ); |
2021 | int16x8_t c_hvoo_br_8 = vorrq_s16( vshrq_n_s16( c_hvoo_br_6, 4 ), vshlq_n_s16( c_hvoo_br_6, 2 ) ); |
2022 | int16x4_t c_hvox_g_8 = vorr_s16( vshr_n_s16( c_hvox_g_7, 6 ), vshl_n_s16( c_hvox_g_7, 1 ) ); |
2023 | |
2024 | uint64_t error = 0; |
2025 | if( mode != ModePlanar && useHeuristics ) |
2026 | { |
2027 | int16x4_t rec_gxbr_o = vext_s16( c_hvox_g_8, vget_high_s16( c_hvoo_br_8 ), 3 ); |
2028 | |
2029 | rec_gxbr_o = vadd_s16( vshl_n_s16( rec_gxbr_o, 2 ), vdup_n_s16( 2 ) ); |
2030 | int16x8_t rec_ro_wide = vdupq_lane_s16( rec_gxbr_o, 3 ); |
2031 | int16x8_t rec_go_wide = vdupq_lane_s16( rec_gxbr_o, 0 ); |
2032 | int16x8_t rec_bo_wide = vdupq_lane_s16( rec_gxbr_o, 1 ); |
2033 | |
2034 | int16x4_t br_hv2 = vsub_s16( vget_low_s16( c_hvoo_br_8 ), vget_high_s16( c_hvoo_br_8 ) ); |
2035 | int16x4_t gg_hv2 = vsub_s16( c_hvox_g_8, vdup_lane_s16( c_hvox_g_8, 2 ) ); |
2036 | |
2037 | int16x8_t scaleh_lo = { 0, 0, 0, 0, 1, 1, 1, 1 }; |
2038 | int16x8_t scaleh_hi = { 2, 2, 2, 2, 3, 3, 3, 3 }; |
2039 | int16x8_t scalev = { 0, 1, 2, 3, 0, 1, 2, 3 }; |
2040 | |
2041 | int16x8_t rec_r_1 = vmlaq_lane_s16( rec_ro_wide, scalev, br_hv2, 3 ); |
2042 | int16x8_t rec_r_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_r_1, scaleh_lo, br_hv2, 2 ), 2 ) ) ); |
2043 | int16x8_t rec_r_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_r_1, scaleh_hi, br_hv2, 2 ), 2 ) ) ); |
2044 | |
2045 | int16x8_t rec_b_1 = vmlaq_lane_s16( rec_bo_wide, scalev, br_hv2, 1 ); |
2046 | int16x8_t rec_b_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_b_1, scaleh_lo, br_hv2, 0 ), 2 ) ) ); |
2047 | int16x8_t rec_b_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_b_1, scaleh_hi, br_hv2, 0 ), 2 ) ) ); |
2048 | |
2049 | int16x8_t rec_g_1 = vmlaq_lane_s16( rec_go_wide, scalev, gg_hv2, 1 ); |
2050 | int16x8_t rec_g_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_g_1, scaleh_lo, gg_hv2, 0 ), 2 ) ) ); |
2051 | int16x8_t rec_g_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_g_1, scaleh_hi, gg_hv2, 0 ), 2 ) ) ); |
2052 | |
2053 | int16x8_t dif_r_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[2] ) ) ), rec_r_lo ); |
2054 | int16x8_t dif_r_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[2] ) ) ), rec_r_hi ); |
2055 | |
2056 | int16x8_t dif_g_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[1] ) ) ), rec_g_lo ); |
2057 | int16x8_t dif_g_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[1] ) ) ), rec_g_hi ); |
2058 | |
2059 | int16x8_t dif_b_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[0] ) ) ), rec_b_lo ); |
2060 | int16x8_t dif_b_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[0] ) ) ), rec_b_hi ); |
2061 | |
2062 | int16x8_t dif_lo = vmlaq_n_s16( vmlaq_n_s16( vmulq_n_s16( dif_r_lo, 38 ), dif_g_lo, 76 ), dif_b_lo, 14 ); |
2063 | int16x8_t dif_hi = vmlaq_n_s16( vmlaq_n_s16( vmulq_n_s16( dif_r_hi, 38 ), dif_g_hi, 76 ), dif_b_hi, 14 ); |
2064 | |
2065 | int16x4_t tmpDif = vget_low_s16( dif_lo ); |
2066 | int32x4_t difsq_0 = vmull_s16( tmpDif, tmpDif ); |
2067 | tmpDif = vget_high_s16( dif_lo ); |
2068 | int32x4_t difsq_1 = vmull_s16( tmpDif, tmpDif ); |
2069 | tmpDif = vget_low_s16( dif_hi ); |
2070 | int32x4_t difsq_2 = vmull_s16( tmpDif, tmpDif ); |
2071 | tmpDif = vget_high_s16( dif_hi ); |
2072 | int32x4_t difsq_3 = vmull_s16( tmpDif, tmpDif ); |
2073 | |
2074 | uint32x4_t difsq_5 = vaddq_u32( vreinterpretq_u32_s32( difsq_0 ), vreinterpretq_u32_s32( difsq_1 ) ); |
2075 | uint32x4_t difsq_6 = vaddq_u32( vreinterpretq_u32_s32( difsq_2 ), vreinterpretq_u32_s32( difsq_3 ) ); |
2076 | |
2077 | uint64x2_t difsq_7 = vaddl_u32( vget_low_u32( difsq_5 ), vget_high_u32( difsq_5 ) ); |
2078 | uint64x2_t difsq_8 = vaddl_u32( vget_low_u32( difsq_6 ), vget_high_u32( difsq_6 ) ); |
2079 | |
2080 | uint64x2_t difsq_9 = vaddq_u64( difsq_7, difsq_8 ); |
2081 | |
2082 | #ifdef __aarch64__ |
2083 | error = vaddvq_u64( difsq_9 ); |
2084 | #else |
2085 | error = vgetq_lane_u64( difsq_9, 0 ) + vgetq_lane_u64( difsq_9, 1 ); |
2086 | #endif |
2087 | } |
2088 | |
2089 | int32_t coR = c_hvoo_br_6[6]; |
2090 | int32_t coG = c_hvox_g_7[2]; |
2091 | int32_t coB = c_hvoo_br_6[4]; |
2092 | |
2093 | int32_t chR = c_hvoo_br_6[2]; |
2094 | int32_t chG = c_hvox_g_7[0]; |
2095 | int32_t chB = c_hvoo_br_6[0]; |
2096 | |
2097 | int32_t cvR = c_hvoo_br_6[3]; |
2098 | int32_t cvG = c_hvox_g_7[1]; |
2099 | int32_t cvB = c_hvoo_br_6[1]; |
2100 | |
2101 | uint32_t rgbv = cvB | ( cvG << 6 ) | ( cvR << 13 ); |
2102 | uint32_t rgbh = chB | ( chG << 6 ) | ( chR << 13 ); |
2103 | uint32_t hi = rgbv | ( ( rgbh & 0x1FFF ) << 19 ); |
2104 | uint32_t lo = ( chR & 0x1 ) | 0x2 | ( ( chR << 1 ) & 0x7C ); |
2105 | lo |= ( ( coB & 0x07 ) << 7 ) | ( ( coB & 0x18 ) << 8 ) | ( ( coB & 0x20 ) << 11 ); |
lo |= ( ( coG & 0x3F ) << 17 ) | ( ( coG & 0x40 ) << 18 );
2107 | lo |= coR << 25; |
2108 | |
2109 | const auto idx = ( coR & 0x20 ) | ( ( coG & 0x20 ) >> 1 ) | ( ( coB & 0x1E ) >> 1 ); |
2110 | |
2111 | lo |= g_flags[idx]; |
2112 | |
2113 | uint64_t result = static_cast<uint32_t>( _bswap(lo) ); |
2114 | result |= static_cast<uint64_t>( static_cast<uint32_t>( _bswap( hi ) ) ) << 32; |
2115 | |
2116 | return std::make_pair( result, error ); |
2117 | } |
2118 | |
2119 | #endif |
2120 | |
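// Evaluates T-/H-mode palettes for a pair of RGB444 base colours: starting at
// startDist, each distance-table entry is used to build the four paint
// colours, every pixel is matched to its closest paint colour using the
// luma-weighted (38/76/14) metric, and the squared errors are summed.
// Returns the best block error and writes the winning distance and the
// packed 2-bit pixel indices.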
2121 | #ifdef __AVX2__ |
2122 | uint32_t calculateErrorTH( bool tMode, uint8_t( colorsRGB444 )[2][3], uint8_t& dist, uint32_t& pixIndices, uint8_t startDist, __m128i r8, __m128i g8, __m128i b8 ) |
2123 | #else |
2124 | uint32_t calculateErrorTH( bool tMode, uint8_t* src, uint8_t( colorsRGB444 )[2][3], uint8_t& dist, uint32_t& pixIndices, uint8_t startDist ) |
2125 | #endif |
2126 | { |
2127 | uint32_t blockErr = 0, bestBlockErr = MaxError; |
2128 | |
2129 | uint32_t pixColors; |
2130 | uint8_t possibleColors[4][3]; |
2131 | uint8_t colors[2][3]; |
2132 | |
2133 | decompressColor( colorsRGB444, colors ); |
2134 | |
2135 | #ifdef __AVX2__ |
2136 | __m128i reverseMask = _mm_set_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 ); |
2137 | #endif |
2138 | |
2139 | // test distances |
2140 | for( uint8_t d = startDist; d < 8; ++d ) |
2141 | { |
2142 | if( d >= 2 && dist == d - 2 ) break; |
2143 | |
2144 | blockErr = 0; |
2145 | pixColors = 0; |
2146 | |
2147 | if( tMode ) |
2148 | { |
2149 | calculatePaintColors59T( d, colors, possibleColors ); |
2150 | } |
2151 | else |
2152 | { |
2153 | calculatePaintColors58H( d, colors, possibleColors ); |
2154 | } |
2155 | |
2156 | #ifdef __AVX2__ |
2157 | // RGB ordering |
2158 | __m128i b8Rev = _mm_shuffle_epi8( b8, reverseMask ); |
2159 | __m128i g8Rev = _mm_shuffle_epi8( g8, reverseMask ); |
2160 | __m128i r8Rev = _mm_shuffle_epi8( r8, reverseMask ); |
2161 | |
// extends the 3x128-bit RGB values into 3x256-bit RGB for error comparisons
2163 | static const __m128i zero = _mm_setzero_si128(); |
2164 | __m128i b8Lo = _mm_unpacklo_epi8( b8Rev, zero ); |
2165 | __m128i g8Lo = _mm_unpacklo_epi8( g8Rev, zero ); |
2166 | __m128i r8Lo = _mm_unpacklo_epi8( r8Rev, zero ); |
2167 | __m128i b8Hi = _mm_unpackhi_epi8( b8Rev, zero ); |
2168 | __m128i g8Hi = _mm_unpackhi_epi8( g8Rev, zero ); |
2169 | __m128i r8Hi = _mm_unpackhi_epi8( r8Rev, zero ); |
2170 | |
2171 | __m256i b8 = _mm256_set_m128i( b8Hi, b8Lo ); |
2172 | __m256i g8 = _mm256_set_m128i( g8Hi, g8Lo ); |
2173 | __m256i r8 = _mm256_set_m128i( r8Hi, r8Lo ); |
2174 | |
// calculates the differences between the pixel colors and the first palette color
2176 | __m256i diffb = _mm256_abs_epi16( _mm256_sub_epi16( b8, _mm256_set1_epi16( possibleColors[0][B] ) ) ); |
2177 | __m256i diffg = _mm256_abs_epi16( _mm256_sub_epi16( g8, _mm256_set1_epi16( possibleColors[0][G] ) ) ); |
2178 | __m256i diffr = _mm256_abs_epi16( _mm256_sub_epi16( r8, _mm256_set1_epi16( possibleColors[0][R] ) ) ); |
2179 | |
2180 | // luma-based error calculations |
2181 | static const __m256i bWeight = _mm256_set1_epi16( 14 ); |
2182 | static const __m256i gWeight = _mm256_set1_epi16( 76 ); |
2183 | static const __m256i rWeight = _mm256_set1_epi16( 38 ); |
2184 | |
2185 | diffb = _mm256_mullo_epi16( diffb, bWeight ); |
2186 | diffg = _mm256_mullo_epi16( diffg, gWeight ); |
2187 | diffr = _mm256_mullo_epi16( diffr, rWeight ); |
2188 | |
2189 | // obtains the error with the current palette color |
2190 | __m256i lowestPixErr = _mm256_add_epi16( _mm256_add_epi16( diffb, diffg ), diffr ); |
2191 | |
// error calculations with the remaining three palette colors
2193 | static const uint32_t masks[4] = { 0, 0x55555555, 0xAAAAAAAA, 0xFFFFFFFF }; |
2194 | for( uint8_t c = 1; c < 4; c++ ) |
2195 | { |
2196 | __m256i diffb = _mm256_abs_epi16( _mm256_sub_epi16( b8, _mm256_set1_epi16( possibleColors[c][B] ) ) ); |
2197 | __m256i diffg = _mm256_abs_epi16( _mm256_sub_epi16( g8, _mm256_set1_epi16( possibleColors[c][G] ) ) ); |
2198 | __m256i diffr = _mm256_abs_epi16( _mm256_sub_epi16( r8, _mm256_set1_epi16( possibleColors[c][R] ) ) ); |
2199 | |
2200 | diffb = _mm256_mullo_epi16( diffb, bWeight ); |
2201 | diffg = _mm256_mullo_epi16( diffg, gWeight ); |
2202 | diffr = _mm256_mullo_epi16( diffr, rWeight ); |
2203 | |
2204 | // error comparison with the previous best color |
2205 | __m256i pixErrors = _mm256_add_epi16( _mm256_add_epi16( diffb, diffg ), diffr ); |
2206 | __m256i minErr = _mm256_min_epu16( lowestPixErr, pixErrors ); |
2207 | __m256i cmpRes = _mm256_cmpeq_epi16( pixErrors, minErr ); |
2208 | lowestPixErr = minErr; |
2209 | |
2210 | // update pixel colors |
2211 | uint32_t updPixColors = _mm256_movemask_epi8( cmpRes ); |
2212 | uint32_t prevPixColors = pixColors & ~updPixColors; |
2213 | uint32_t mskPixColors = masks[c] & updPixColors; |
2214 | pixColors = prevPixColors | mskPixColors; |
2215 | } |
2216 | |
2217 | // accumulate the block error |
2218 | alignas( 32 ) uint16_t pixErr16[16] = { 0, }; |
2219 | _mm256_storeu_si256( (__m256i*)pixErr16, lowestPixErr ); |
2220 | for( uint8_t p = 0; p < 16; p++ ) |
2221 | { |
2222 | blockErr += (int)( pixErr16[p] ) * pixErr16[p]; |
2223 | } |
2224 | #else |
2225 | for( size_t y = 0; y < 4; ++y ) |
2226 | { |
2227 | for( size_t x = 0; x < 4; ++x ) |
2228 | { |
2229 | uint32_t bestPixErr = MaxError; |
2230 | pixColors <<= 2; // Make room for next value |
2231 | |
2232 | // Loop possible block colors |
2233 | for( uint8_t c = 0; c < 4; ++c ) |
2234 | { |
2235 | int diff[3]; |
2236 | diff[R] = src[4 * ( x * 4 + y ) + R] - possibleColors[c][R]; |
2237 | diff[G] = src[4 * ( x * 4 + y ) + G] - possibleColors[c][G]; |
2238 | diff[B] = src[4 * ( x * 4 + y ) + B] - possibleColors[c][B]; |
2239 | |
2240 | const uint32_t err = 38 * abs( diff[R] ) + 76 * abs( diff[G] ) + 14 * abs( diff[B] ); |
2241 | uint32_t pixErr = err * err; |
2242 | |
2243 | // Choose best error |
2244 | if( pixErr < bestPixErr ) |
2245 | { |
2246 | bestPixErr = pixErr; |
pixColors ^= ( pixColors & 3 ); // Reset the first two bits
2248 | pixColors |= c; |
2249 | } |
2250 | } |
2251 | blockErr += bestPixErr; |
2252 | } |
2253 | } |
2254 | #endif |
2255 | |
2256 | if( blockErr < bestBlockErr ) |
2257 | { |
2258 | bestBlockErr = blockErr; |
2259 | dist = d; |
2260 | pixIndices = pixColors; |
2261 | } |
2262 | } |
2263 | |
2264 | return bestBlockErr; |
2265 | } |
2266 | |
2267 | |
2268 | // main T-/H-mode compression function |
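// src points at the 4x4 BGRA block and l at its precomputed per-pixel luma;
// on return compressed1/compressed2 hold the 59T/58H payload, tMode reports
// which of the two modes was selected, and the return value is the
// luma-weighted block error.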
2269 | #ifdef __AVX2__ |
2270 | uint32_t compressBlockTH( uint8_t* src, Luma& l, uint32_t& compressed1, uint32_t& compressed2, bool& tMode, __m128i r8, __m128i g8, __m128i b8 ) |
2271 | #else |
2272 | uint32_t compressBlockTH( uint8_t *src, Luma& l, uint32_t& compressed1, uint32_t& compressed2, bool &tMode ) |
2273 | #endif |
2274 | { |
2275 | #ifdef __AVX2__ |
2276 | alignas( 8 ) uint8_t luma[16] = { 0, }; |
_mm_storeu_si128( (__m128i*)luma, l.luma8 );
2278 | #elif defined __ARM_NEON && defined __aarch64__ |
2279 | alignas( 8 ) uint8_t luma[16] = { 0 }; |
2280 | vst1q_u8( luma, l.luma8 ); |
2281 | #else |
2282 | uint8_t* luma = l.val; |
2283 | #endif |
2284 | |
2285 | uint8_t pixIdx[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; |
2286 | |
2287 | // 1) sorts the pairs of (luma, pix_idx) |
2288 | insertionSort( luma, pixIdx ); |
2289 | |
2290 | // 2) finds the min (left+right) |
2291 | uint8_t minSumRangeIdx = 0; |
2292 | uint16_t minSumRangeValue; |
2293 | uint16_t sum; |
2294 | static const uint8_t diffBonus[15] = {8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 8}; |
2295 | const int16_t temp = luma[15] - luma[0]; |
2296 | |
2297 | minSumRangeValue = luma[15] - luma[1] + diffBonus[0]; |
2298 | for( uint8_t i = 1; i < 14; i++ ) |
2299 | { |
2300 | sum = temp - luma[i+1] + luma[i] + diffBonus[i]; |
2301 | if( minSumRangeValue > sum ) |
2302 | { |
2303 | minSumRangeValue = sum; |
2304 | minSumRangeIdx = i; |
2305 | } |
2306 | } |
2307 | |
2308 | sum = luma[14] - luma[0] + diffBonus[14]; |
2309 | if( minSumRangeValue > sum ) |
2310 | { |
2311 | minSumRangeValue = sum; |
2312 | minSumRangeIdx = 14; |
2313 | } |
2314 | uint8_t lRange, rRange; |
2315 | |
2316 | lRange = luma[minSumRangeIdx] - luma[0]; |
2317 | rRange = luma[15] - luma[minSumRangeIdx + 1]; |
2318 | |
2319 | // 3) sets a proper mode |
2320 | bool swap = false; |
2321 | if( lRange >= rRange ) |
2322 | { |
2323 | if( lRange >= rRange * 2 ) |
2324 | { |
2325 | swap = true; |
2326 | tMode = true; |
2327 | } |
2328 | } |
2329 | else |
2330 | { |
2331 | if( lRange * 2 <= rRange ) tMode = true; |
2332 | } |
2333 | // 4) calculates the two base colors |
2334 | uint8_t rangeIdx[4] = { pixIdx[0], pixIdx[minSumRangeIdx], pixIdx[minSumRangeIdx + 1], pixIdx[15] }; |
2335 | |
2336 | uint16_t r[4], g[4], b[4]; |
2337 | for( uint8_t i = 0; i < 4; ++i ) |
2338 | { |
2339 | uint8_t idx = rangeIdx[i] * 4; |
2340 | b[i] = src[idx]; |
2341 | g[i] = src[idx + 1]; |
2342 | r[i] = src[idx + 2]; |
2343 | } |
2344 | |
2345 | uint8_t mid_rgb[2][3]; |
2346 | if( swap ) |
2347 | { |
2348 | mid_rgb[1][B] = ( b[0] + b[1] ) / 2; |
2349 | mid_rgb[1][G] = ( g[0] + g[1] ) / 2; |
2350 | mid_rgb[1][R] = ( r[0] + r[1] ) / 2; |
2351 | |
2352 | uint16_t sum_rgb[3] = { 0, 0, 0 }; |
2353 | for( uint8_t i = minSumRangeIdx + 1; i < 16; i++ ) |
2354 | { |
2355 | uint8_t idx = pixIdx[i] * 4; |
2356 | sum_rgb[B] += src[idx]; |
2357 | sum_rgb[G] += src[idx + 1]; |
2358 | sum_rgb[R] += src[idx + 2]; |
2359 | } |
2360 | const uint8_t temp = 15 - minSumRangeIdx; |
2361 | mid_rgb[0][B] = sum_rgb[B] / temp; |
2362 | mid_rgb[0][G] = sum_rgb[G] / temp; |
2363 | mid_rgb[0][R] = sum_rgb[R] / temp; |
2364 | } |
2365 | else |
2366 | { |
2367 | mid_rgb[0][B] = (b[0] + b[1]) / 2; |
2368 | mid_rgb[0][G] = (g[0] + g[1]) / 2; |
2369 | mid_rgb[0][R] = (r[0] + r[1]) / 2; |
2370 | if( tMode ) |
2371 | { |
2372 | uint16_t sum_rgb[3] = { 0, 0, 0 }; |
2373 | for( uint8_t i = minSumRangeIdx + 1; i < 16; i++ ) |
2374 | { |
2375 | uint8_t idx = pixIdx[i] * 4; |
2376 | sum_rgb[B] += src[idx]; |
2377 | sum_rgb[G] += src[idx + 1]; |
2378 | sum_rgb[R] += src[idx + 2]; |
2379 | } |
2380 | const uint8_t temp = 15 - minSumRangeIdx; |
2381 | mid_rgb[1][B] = sum_rgb[B] / temp; |
2382 | mid_rgb[1][G] = sum_rgb[G] / temp; |
2383 | mid_rgb[1][R] = sum_rgb[R] / temp; |
2384 | } |
2385 | else |
2386 | { |
2387 | mid_rgb[1][B] = (b[2] + b[3]) / 2; |
2388 | mid_rgb[1][G] = (g[2] + g[3]) / 2; |
2389 | mid_rgb[1][R] = (r[2] + r[3]) / 2; |
2390 | } |
2391 | } |
2392 | |
2393 | // 5) sets the start distance index |
2394 | uint32_t startDistCandidate; |
2395 | uint32_t avgDist; |
2396 | if( tMode ) |
2397 | { |
2398 | if( swap ) |
2399 | { |
2400 | avgDist = ( b[1] - b[0] + g[1] - g[0] + r[1] - r[0] ) / 6; |
2401 | } |
2402 | else |
2403 | { |
2404 | avgDist = ( b[3] - b[2] + g[3] - g[2] + r[3] - r[2] ) / 6; |
2405 | } |
2406 | } |
2407 | else |
2408 | { |
2409 | avgDist = ( b[1] - b[0] + g[1] - g[0] + r[1] - r[0] + b[3] - b[2] + g[3] - g[2] + r[3] - r[2] ) / 12; |
2410 | } |
2411 | |
2412 | if( avgDist <= 16) |
2413 | { |
2414 | startDistCandidate = 0; |
2415 | } |
2416 | else if( avgDist <= 23 ) |
2417 | { |
2418 | startDistCandidate = 1; |
2419 | } |
2420 | else if( avgDist <= 32 ) |
2421 | { |
2422 | startDistCandidate = 2; |
2423 | } |
2424 | else if( avgDist <= 41 ) |
2425 | { |
2426 | startDistCandidate = 3; |
2427 | } |
2428 | else |
2429 | { |
2430 | startDistCandidate = 4; |
2431 | } |
2432 | |
2433 | uint32_t bestErr = MaxError; |
2434 | uint32_t bestPixIndices; |
2435 | uint8_t bestDist = 10; |
2436 | uint8_t colorsRGB444[2][3]; |
2437 | compressColor( mid_rgb, colorsRGB444, tMode ); |
2438 | compressed1 = 0; |
2439 | |
2440 | // 6) finds the best candidate with the lowest error |
2441 | #ifdef __AVX2__ |
2442 | // Vectorized ver |
2443 | bestErr = calculateErrorTH( tMode, colorsRGB444, bestDist, bestPixIndices, startDistCandidate, r8, g8, b8 ); |
2444 | #else |
2445 | // Scalar ver |
2446 | bestErr = calculateErrorTH( tMode, src, colorsRGB444, bestDist, bestPixIndices, startDistCandidate ); |
2447 | #endif |
2448 | |
2449 | // 7) outputs the final T or H block |
2450 | if( tMode ) |
2451 | { |
2452 | // Put the compress params into the compression block |
2453 | compressed1 |= ( colorsRGB444[0][R] & 0xf ) << 23; |
2454 | compressed1 |= ( colorsRGB444[0][G] & 0xf ) << 19; |
2455 | compressed1 |= ( colorsRGB444[0][B] ) << 15; |
2456 | compressed1 |= ( colorsRGB444[1][R] ) << 11; |
2457 | compressed1 |= ( colorsRGB444[1][G] ) << 7; |
2458 | compressed1 |= ( colorsRGB444[1][B] ) << 3; |
2459 | compressed1 |= bestDist & 0x7; |
2460 | } |
2461 | else |
2462 | { |
2463 | int bestRGB444ColPacked[2]; |
2464 | bestRGB444ColPacked[0] = (colorsRGB444[0][R] << 8) + (colorsRGB444[0][G] << 4) + colorsRGB444[0][B]; |
2465 | bestRGB444ColPacked[1] = (colorsRGB444[1][R] << 8) + (colorsRGB444[1][G] << 4) + colorsRGB444[1][B]; |
2466 | if( ( bestRGB444ColPacked[0] >= bestRGB444ColPacked[1] ) ^ ( ( bestDist & 1 ) == 1 ) ) |
2467 | { |
2468 | swapColors( colorsRGB444 ); |
// Reshuffle pixel indices to exchange C1 with C3, and C2 with C4
2470 | bestPixIndices = ( 0x55555555 & bestPixIndices ) | ( 0xaaaaaaaa & ( ~bestPixIndices ) ); |
2471 | } |
2472 | |
2473 | // Put the compress params into the compression block |
2474 | compressed1 |= ( colorsRGB444[0][R] & 0xf ) << 22; |
2475 | compressed1 |= ( colorsRGB444[0][G] & 0xf ) << 18; |
2476 | compressed1 |= ( colorsRGB444[0][B] & 0xf ) << 14; |
2477 | compressed1 |= ( colorsRGB444[1][R] & 0xf ) << 10; |
2478 | compressed1 |= ( colorsRGB444[1][G] & 0xf ) << 6; |
2479 | compressed1 |= ( colorsRGB444[1][B] & 0xf ) << 2; |
2480 | compressed1 |= ( bestDist >> 1 ) & 0x3; |
2481 | } |
2482 | |
2483 | bestPixIndices = indexConversion( bestPixIndices ); |
2484 | compressed2 = 0; |
2485 | compressed2 = ( compressed2 & ~( ( 0x2 << 31 ) - 1 ) ) | ( bestPixIndices & ( ( 2 << 31 ) - 1 ) ); |
2486 | |
2487 | return bestErr; |
2488 | } |
2490 | |
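// Chooses the cheapest modifier table for each half-block. If the combined
// selector error is not lower than the error of the already-encoded
// alternative (planar or T/H), the alternative encoding in `value` is kept;
// otherwise the table indices and per-pixel selector bits are written into d
// and the block is returned in file byte order.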
2491 | template<class T, class S> |
2492 | static etcpak_force_inline uint64_t EncodeSelectors( uint64_t d, const T terr[2][8], const S tsel[16][8], const uint32_t* id, const uint64_t value, const uint64_t error) |
2493 | { |
2494 | size_t tidx[2]; |
2495 | tidx[0] = GetLeastError( terr[0], 8 ); |
2496 | tidx[1] = GetLeastError( terr[1], 8 ); |
2497 | |
2498 | if ((terr[0][tidx[0]] + terr[1][tidx[1]]) >= error) |
2499 | { |
2500 | return value; |
2501 | } |
2502 | |
2503 | d |= tidx[0] << 26; |
2504 | d |= tidx[1] << 29; |
2505 | for( int i=0; i<16; i++ ) |
2506 | { |
2507 | uint64_t t = tsel[i][tidx[id[i]%2]]; |
2508 | d |= ( t & 0x1 ) << ( i + 32 ); |
2509 | d |= ( t & 0x2 ) << ( i + 47 ); |
2510 | } |
2511 | |
2512 | return FixByteOrder(d); |
2513 | } |
2514 | |
2515 | } |
2516 | |
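// Encodes a single 4x4 BGRA block as ETC1: solid-colour shortcut first, then
// selection of the best flip/diff averages, then the per-pixel selector
// search. Minimal usage sketch (hypothetical caller):
//   uint64_t block = ProcessRGB( pixels ); // pixels: 16 BGRA pixels, 64 bytes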
2517 | static etcpak_force_inline uint64_t ProcessRGB( const uint8_t* src ) |
2518 | { |
2519 | #ifdef __AVX2__ |
2520 | uint64_t d = CheckSolid_AVX2( src ); |
2521 | if( d != 0 ) return d; |
2522 | |
2523 | alignas(32) v4i a[8]; |
2524 | |
2525 | __m128i err0 = PrepareAverages_AVX2( a, src ); |
2526 | |
2527 | // Get index of minimum error (err0) |
2528 | __m128i err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(2, 3, 0, 1)); |
2529 | __m128i errMin0 = _mm_min_epu32(err0, err1); |
2530 | |
2531 | __m128i errMin1 = _mm_shuffle_epi32(errMin0, _MM_SHUFFLE(1, 0, 3, 2)); |
2532 | __m128i errMin2 = _mm_min_epu32(errMin1, errMin0); |
2533 | |
2534 | __m128i errMask = _mm_cmpeq_epi32(errMin2, err0); |
2535 | |
2536 | uint32_t mask = _mm_movemask_epi8(errMask); |
2537 | |
2538 | uint32_t idx = _bit_scan_forward(mask) >> 2; |
2539 | |
2540 | d |= EncodeAverages_AVX2( a, idx ); |
2541 | |
2542 | alignas(32) uint32_t terr[2][8] = {}; |
2543 | alignas(32) uint32_t tsel[8]; |
2544 | |
2545 | if ((idx == 0) || (idx == 2)) |
2546 | { |
2547 | FindBestFit_4x2_AVX2( terr, tsel, a, idx * 2, src ); |
2548 | } |
2549 | else |
2550 | { |
2551 | FindBestFit_2x4_AVX2( terr, tsel, a, idx * 2, src ); |
2552 | } |
2553 | |
2554 | return EncodeSelectors_AVX2( d, terr, tsel, (idx % 2) == 1 ); |
2555 | #else |
2556 | uint64_t d = CheckSolid( src ); |
2557 | if( d != 0 ) return d; |
2558 | |
2559 | v4i a[8]; |
2560 | unsigned int err[4] = {}; |
2561 | PrepareAverages( a, src, err ); |
2562 | size_t idx = GetLeastError( err, 4 ); |
2563 | EncodeAverages( d, a, idx ); |
2564 | |
2565 | #if ( defined __SSE4_1__ || defined __ARM_NEON ) && !defined REFERENCE_IMPLEMENTATION |
2566 | uint32_t terr[2][8] = {}; |
2567 | #else |
2568 | uint64_t terr[2][8] = {}; |
2569 | #endif |
2570 | uint16_t tsel[16][8]; |
2571 | auto id = g_id[idx]; |
2572 | FindBestFit( terr, tsel, a, id, src ); |
2573 | |
2574 | return FixByteOrder( EncodeSelectors( d, terr, tsel, id ) ); |
2575 | #endif |
2576 | } |
2577 | |
2578 | #ifdef __AVX2__ |
2579 | // horizontal min/max functions. https://stackoverflow.com/questions/22256525/horizontal-minimum-and-maximum-using-sse |
// if this fails to build with GCC, change the -march value in CFLAGS to a specific CPU (e.g., skylake).
2581 | static inline int16_t hMax( __m128i buffer, uint8_t& idx ) |
2582 | { |
2583 | __m128i tmp1 = _mm_sub_epi8( _mm_set1_epi8( (char)( 255 ) ), buffer ); |
2584 | __m128i tmp2 = _mm_min_epu8( tmp1, _mm_srli_epi16( tmp1, 8 ) ); |
2585 | __m128i tmp3 = _mm_minpos_epu16( tmp2 ); |
2586 | uint8_t result = 255 - (uint8_t)_mm_cvtsi128_si32( tmp3 ); |
2587 | __m128i mask = _mm_cmpeq_epi8( buffer, _mm_set1_epi8( result ) ); |
2588 | idx = _tzcnt_u32( _mm_movemask_epi8( mask ) ); |
2589 | |
2590 | return result; |
2591 | } |
2592 | #elif defined __ARM_NEON && defined __aarch64__ |
2593 | static inline int16_t hMax( uint8x16_t buffer, uint8_t& idx ) |
2594 | { |
2595 | const uint8_t max = vmaxvq_u8( buffer ); |
2596 | const uint16x8_t vmax = vdupq_n_u16( max ); |
2597 | uint8x16x2_t buff_wide = vzipq_u8( buffer, uint8x16_t() ); |
2598 | uint16x8_t lowbuf16 = vreinterpretq_u16_u8( buff_wide.val[0] ); |
2599 | uint16x8_t hibuf16 = vreinterpretq_u16_u8( buff_wide.val[1] ); |
2600 | uint16x8_t low_eqmask = vceqq_u16( lowbuf16, vmax ); |
2601 | uint16x8_t hi_eqmask = vceqq_u16( hibuf16, vmax ); |
2602 | |
2603 | static const uint16_t mask_lsb[] = { |
2604 | 0x1, 0x2, 0x4, 0x8, |
2605 | 0x10, 0x20, 0x40, 0x80 }; |
2606 | |
2607 | static const uint16_t mask_msb[] = { |
2608 | 0x100, 0x200, 0x400, 0x800, |
2609 | 0x1000, 0x2000, 0x4000, 0x8000 }; |
2610 | |
2611 | uint16x8_t vmask_lsb = vld1q_u16( mask_lsb ); |
2612 | uint16x8_t vmask_msb = vld1q_u16( mask_msb ); |
2613 | uint16x8_t pos_lsb = vandq_u16( vmask_lsb, low_eqmask ); |
2614 | uint16x8_t pos_msb = vandq_u16( vmask_msb, hi_eqmask ); |
2615 | pos_lsb = vpaddq_u16( pos_lsb, pos_lsb ); |
2616 | pos_lsb = vpaddq_u16( pos_lsb, pos_lsb ); |
2617 | pos_lsb = vpaddq_u16( pos_lsb, pos_lsb ); |
2618 | uint64_t idx_lane1 = vgetq_lane_u64( vreinterpretq_u64_u16( pos_lsb ), 0 ); |
2619 | pos_msb = vpaddq_u16( pos_msb, pos_msb ); |
2620 | pos_msb = vpaddq_u16( pos_msb, pos_msb ); |
2621 | pos_msb = vpaddq_u16( pos_msb, pos_msb ); |
2622 | uint32_t idx_lane2 = vgetq_lane_u32( vreinterpretq_u32_u16( pos_msb ), 0 ); |
2623 | idx = idx_lane1 != 0 ? __builtin_ctz( idx_lane1 ) : __builtin_ctz( idx_lane2 ); |
2624 | |
2625 | return max; |
2626 | } |
2627 | #endif |
2628 | |
2629 | #ifdef __AVX2__ |
2630 | static inline int16_t hMin( __m128i buffer, uint8_t& idx ) |
2631 | { |
2632 | __m128i tmp2 = _mm_min_epu8( buffer, _mm_srli_epi16( buffer, 8 ) ); |
2633 | __m128i tmp3 = _mm_minpos_epu16( tmp2 ); |
2634 | uint8_t result = (uint8_t)_mm_cvtsi128_si32( tmp3 ); |
2635 | __m128i mask = _mm_cmpeq_epi8( buffer, _mm_set1_epi8( result ) ); |
2636 | idx = _tzcnt_u32( _mm_movemask_epi8( mask ) ); |
2637 | return result; |
2638 | } |
2639 | #elif defined __ARM_NEON && defined __aarch64__ |
2640 | static inline int16_t hMin( uint8x16_t buffer, uint8_t& idx ) |
2641 | { |
2642 | const uint8_t min = vminvq_u8( buffer ); |
2643 | const uint16x8_t vmin = vdupq_n_u16( min ); |
2644 | uint8x16x2_t buff_wide = vzipq_u8( buffer, uint8x16_t() ); |
2645 | uint16x8_t lowbuf16 = vreinterpretq_u16_u8( buff_wide.val[0] ); |
2646 | uint16x8_t hibuf16 = vreinterpretq_u16_u8( buff_wide.val[1] ); |
2647 | uint16x8_t low_eqmask = vceqq_u16( lowbuf16, vmin ); |
2648 | uint16x8_t hi_eqmask = vceqq_u16( hibuf16, vmin ); |
2649 | |
2650 | static const uint16_t mask_lsb[] = { |
2651 | 0x1, 0x2, 0x4, 0x8, |
2652 | 0x10, 0x20, 0x40, 0x80 }; |
2653 | |
2654 | static const uint16_t mask_msb[] = { |
2655 | 0x100, 0x200, 0x400, 0x800, |
2656 | 0x1000, 0x2000, 0x4000, 0x8000 }; |
2657 | |
2658 | uint16x8_t vmask_lsb = vld1q_u16( mask_lsb ); |
2659 | uint16x8_t vmask_msb = vld1q_u16( mask_msb ); |
2660 | uint16x8_t pos_lsb = vandq_u16( vmask_lsb, low_eqmask ); |
2661 | uint16x8_t pos_msb = vandq_u16( vmask_msb, hi_eqmask ); |
2662 | pos_lsb = vpaddq_u16( pos_lsb, pos_lsb ); |
2663 | pos_lsb = vpaddq_u16( pos_lsb, pos_lsb ); |
2664 | pos_lsb = vpaddq_u16( pos_lsb, pos_lsb ); |
2665 | uint64_t idx_lane1 = vgetq_lane_u64( vreinterpretq_u64_u16( pos_lsb ), 0 ); |
2666 | pos_msb = vpaddq_u16( pos_msb, pos_msb ); |
2667 | pos_msb = vpaddq_u16( pos_msb, pos_msb ); |
2668 | pos_msb = vpaddq_u16( pos_msb, pos_msb ); |
2669 | uint32_t idx_lane2 = vgetq_lane_u32( vreinterpretq_u32_u16( pos_msb ), 0 ); |
2670 | idx = idx_lane1 != 0 ? __builtin_ctz( idx_lane1 ) : __builtin_ctz( idx_lane2 ); |
2671 | |
2672 | return min; |
2673 | } |
2674 | #endif |
2675 | |
2676 | // During search it is not convenient to store the bits the way they are stored in the |
2677 | // file format. Hence, after search, it is converted to this format. |
2678 | // NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved. |
2679 | static inline void stuff59bits( unsigned int thumbT59W1, unsigned int thumbT59W2, unsigned int& thumbTW1, unsigned int& thumbTW2 ) |
2680 | { |
2681 | // Put bits in twotimer configuration for 59 (red overflows) |
2682 | // |
2683 | // Go from this bit layout: |
2684 | // |
2685 | // |63 62 61 60 59|58 57 56 55|54 53 52 51|50 49 48 47|46 45 44 43|42 41 40 39|38 37 36 35|34 33 32| |
2686 | // |----empty-----|---red 0---|--green 0--|--blue 0---|---red 1---|--green 1--|--blue 1---|--dist--| |
2687 | // |
2688 | // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| |
2689 | // |----------------------------------------index bits---------------------------------------------| |
2690 | // |
2691 | // |
2692 | // To this: |
2693 | // |
2694 | // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 |
2695 | // ----------------------------------------------------------------------------------------------- |
2696 | // |// // //|R0a |//|R0b |G0 |B0 |R1 |G1 |B1 |da |df|db| |
2697 | // ----------------------------------------------------------------------------------------------- |
2698 | // |
2699 | // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| |
2700 | // |----------------------------------------index bits---------------------------------------------| |
2701 | // |
2702 | // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 |
2703 | // ----------------------------------------------------------------------------------------------- |
2704 | // | base col1 | dcol 2 | base col1 | dcol 2 | base col 1 | dcol 2 | table | table |df|fp| |
2705 | // | R1' (5 bits) | dR2 | G1' (5 bits) | dG2 | B1' (5 bits) | dB2 | cw 1 | cw 2 |bt|bt| |
2706 | // ------------------------------------------------------------------------------------------------ |
2707 | |
2708 | uint8_t R0a; |
2709 | uint8_t bit, a, b, c, d, bits; |
2710 | |
2711 | R0a = ( thumbT59W1 >> 25 ) & 0x3; |
2712 | |
2713 | // Fix middle part |
2714 | thumbTW1 = thumbT59W1 << 1; |
2715 | // Fix R0a (top two bits of R0) |
2716 | thumbTW1 = ( thumbTW1 & ~( 0x3 << 27 ) ) | ( ( R0a & 0x3 ) << 27 ); |
2717 | // Fix db (lowest bit of d) |
2718 | thumbTW1 = ( thumbTW1 & ~0x1 ) | ( thumbT59W1 & 0x1 ); |
2719 | |
2720 | // Make sure that red overflows: |
2721 | a = ( thumbTW1 >> 28 ) & 0x1; |
2722 | b = ( thumbTW1 >> 27 ) & 0x1; |
2723 | c = ( thumbTW1 >> 25 ) & 0x1; |
2724 | d = ( thumbTW1 >> 24 ) & 0x1; |
2725 | |
    // The following abcd bit sequences should be padded with ones: 0111, 1010, 1011, 1101, 1110, 1111
2727 | // The following logical expression checks for the presence of any of those: |
2728 | bit = ( a & c ) | ( !a & b & c & d ) | ( a & b & !c & d ); |
2729 | bits = 0xf * bit; |
2730 | thumbTW1 = ( thumbTW1 & ~( 0x7 << 29 ) ) | ( bits & 0x7 ) << 29; |
2731 | thumbTW1 = ( thumbTW1 & ~( 0x1 << 26 ) ) | ( !bit & 0x1 ) << 26; |
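    // the padding value is chosen so that the 5-bit red base plus its 3-bit delta always falls
    // outside [0,31]; in differential mode the decoder interprets that overflow as T-mode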
2732 | |
2733 | // Set diffbit |
2734 | thumbTW1 = ( thumbTW1 & ~0x2 ) | 0x2; |
2735 | thumbTW2 = thumbT59W2; |
2736 | } |
2737 | |
2738 | // During search it is not convenient to store the bits the way they are stored in the |
2739 | // file format. Hence, after search, it is converted to this format. |
2740 | // NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved. |
2741 | static inline void stuff58bits( unsigned int thumbH58W1, unsigned int thumbH58W2, unsigned int& thumbHW1, unsigned int& thumbHW2 ) |
2742 | { |
2743 | // Put bits in twotimer configuration for 58 (red doesn't overflow, green does) |
2744 | // |
2745 | // Go from this bit layout: |
2746 | // |
2747 | // |
2748 | // |63 62 61 60 59 58|57 56 55 54|53 52 51 50|49 48 47 46|45 44 43 42|41 40 39 38|37 36 35 34|33 32| |
2749 | // |-------empty-----|---red 0---|--green 0--|--blue 0---|---red 1---|--green 1--|--blue 1---|d2 d1| |
2750 | // |
2751 | // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| |
2752 | // |---------------------------------------index bits----------------------------------------------| |
2753 | // |
2754 | // To this: |
2755 | // |
2756 | // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 |
2757 | // ----------------------------------------------------------------------------------------------- |
    // |//|R0 |G0 |// // //|G0|B0|//|B0b |R1 |G1 |B1 |d2|df|d1|
2759 | // ----------------------------------------------------------------------------------------------- |
2760 | // |
2761 | // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| |
2762 | // |---------------------------------------index bits----------------------------------------------| |
2763 | // |
2764 | // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 |
2765 | // ----------------------------------------------------------------------------------------------- |
2766 | // | base col1 | dcol 2 | base col1 | dcol 2 | base col 1 | dcol 2 | table | table |df|fp| |
2767 | // | R1' (5 bits) | dR2 | G1' (5 bits) | dG2 | B1' (5 bits) | dB2 | cw 1 | cw 2 |bt|bt| |
2768 | // ----------------------------------------------------------------------------------------------- |
2769 | // |
2770 | // |
2771 | // Thus, what we are really doing is going from this bit layout: |
2772 | // |
2773 | // |
2774 | // |63 62 61 60 59 58|57 56 55 54 53 52 51|50 49|48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33|32 | |
2775 | // |-------empty-----|part0---------------|part1|part2------------------------------------------|part3| |
2776 | // |
2777 | // To this: |
2778 | // |
2779 | // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 |
2780 | // --------------------------------------------------------------------------------------------------| |
2781 | // |//|part0 |// // //|part1|//|part2 |df|part3| |
2782 | // --------------------------------------------------------------------------------------------------| |
2783 | |
2784 | unsigned int part0, part1, part2, part3; |
2785 | uint8_t bit, a, b, c, d, bits; |
2786 | |
2787 | // move parts |
2788 | part0 = ( thumbH58W1 >> 19 ) & 0x7f; |
2789 | part1 = ( thumbH58W1 >> 17 ) & 0x3; |
2790 | part2 = ( thumbH58W1 >> 1 ) & 0xffff; |
2791 | part3 = thumbH58W1 & 0x1; |
2792 | thumbHW1 = 0; |
2793 | thumbHW1 = ( thumbHW1 & ~( 0x7f << 24 ) ) | ( ( part0 & 0x7f ) << 24 ); |
2794 | thumbHW1 = ( thumbHW1 & ~( 0x3 << 19 ) ) | ( ( part1 & 0x3 ) << 19 ); |
2795 | thumbHW1 = ( thumbHW1 & ~( 0xffff << 2 ) ) | ( ( part2 & 0xffff ) << 2 ); |
2796 | thumbHW1 = ( thumbHW1 & ~0x1 ) | ( part3 & 0x1 ); |
2797 | |
2798 | // Make sure that red does not overflow: |
2799 | bit = ( thumbHW1 >> 30 ) & 0x1; |
2800 | thumbHW1 = ( thumbHW1 & ~( 0x1 << 31 ) ) | ( ( !bit & 0x1 ) << 31 ); |
2801 | |
2802 | // Make sure that green overflows: |
2803 | a = ( thumbHW1 >> 20 ) & 0x1; |
2804 | b = ( thumbHW1 >> 19 ) & 0x1; |
2805 | c = ( thumbHW1 >> 17 ) & 0x1; |
2806 | d = ( thumbHW1 >> 16 ) & 0x1; |
    // The following abcd bit sequences should be padded with ones: 0111, 1010, 1011, 1101, 1110, 1111
2808 | // The following logical expression checks for the presence of any of those: |
2809 | bit = ( a & c ) | ( !a & b & c & d ) | ( a & b & !c & d ); |
2810 | bits = 0xf * bit; |
2811 | thumbHW1 = ( thumbHW1 & ~( 0x7 << 21 ) ) | ( ( bits & 0x7 ) << 21 ); |
2812 | thumbHW1 = ( thumbHW1 & ~( 0x1 << 18 ) ) | ( ( !bit & 0x1 ) << 18 ); |
2813 | |
2814 | // Set diffbit |
2815 | thumbHW1 = ( thumbHW1 & ~0x2 ) | 0x2; |
2816 | thumbHW2 = thumbH58W2; |
2817 | } |
2818 | |
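// Deinterleave a 4x4 block of RGBA pixels (64 bytes) from array-of-structures into one SIMD
// register (or register pair) per color channel, so the luma weighting below can work on whole
// registers at a time.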
2819 | #if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__) |
2820 | static etcpak_force_inline Channels GetChannels( const uint8_t* src ) |
2821 | { |
2822 | Channels ch; |
2823 | #ifdef __AVX2__ |
2824 | __m128i d0 = _mm_loadu_si128( ( (__m128i*)src ) + 0 ); |
2825 | __m128i d1 = _mm_loadu_si128( ( (__m128i*)src ) + 1 ); |
2826 | __m128i d2 = _mm_loadu_si128( ( (__m128i*)src ) + 2 ); |
2827 | __m128i d3 = _mm_loadu_si128( ( (__m128i*)src ) + 3 ); |
2828 | |
2829 | __m128i rgb0 = _mm_shuffle_epi8( d0, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) ); |
2830 | __m128i rgb1 = _mm_shuffle_epi8( d1, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) ); |
2831 | __m128i rgb2 = _mm_shuffle_epi8( d2, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) ); |
2832 | __m128i rgb3 = _mm_shuffle_epi8( d3, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) ); |
2833 | |
2834 | __m128i rg0 = _mm_unpacklo_epi32( rgb0, rgb1 ); |
2835 | __m128i rg1 = _mm_unpacklo_epi32( rgb2, rgb3 ); |
2836 | __m128i b0 = _mm_unpackhi_epi32( rgb0, rgb1 ); |
2837 | __m128i b1 = _mm_unpackhi_epi32( rgb2, rgb3 ); |
2838 | |
2839 | // swap channels |
2840 | ch.b8 = _mm_unpacklo_epi64( rg0, rg1 ); |
2841 | ch.g8 = _mm_unpackhi_epi64( rg0, rg1 ); |
2842 | ch.r8 = _mm_unpacklo_epi64( b0, b1 ); |
2843 | #elif defined __ARM_NEON && defined __aarch64__ |
2844 | //load pixel data into 4 rows |
2845 | uint8x16_t px0 = vld1q_u8( src + 0 ); |
2846 | uint8x16_t px1 = vld1q_u8( src + 16 ); |
2847 | uint8x16_t px2 = vld1q_u8( src + 32 ); |
2848 | uint8x16_t px3 = vld1q_u8( src + 48 ); |
2849 | |
2850 | uint8x16x2_t px0z1 = vzipq_u8( px0, px1 ); |
2851 | uint8x16x2_t px2z3 = vzipq_u8( px2, px3 ); |
2852 | uint8x16x2_t px01 = vzipq_u8( px0z1.val[0], px0z1.val[1] ); |
2853 | uint8x16x2_t rgb01 = vzipq_u8( px01.val[0], px01.val[1] ); |
2854 | uint8x16x2_t px23 = vzipq_u8( px2z3.val[0], px2z3.val[1] ); |
2855 | uint8x16x2_t rgb23 = vzipq_u8( px23.val[0], px23.val[1] ); |
2856 | |
2857 | uint8x16_t rr = vreinterpretq_u8_u64( vzip1q_u64( vreinterpretq_u64_u8( rgb01.val[0] ), vreinterpretq_u64_u8( rgb23.val[0] ) ) ); |
2858 | uint8x16_t gg = vreinterpretq_u8_u64( vzip2q_u64( vreinterpretq_u64_u8( rgb01.val[0] ), vreinterpretq_u64_u8( rgb23.val[0] ) ) ); |
2859 | uint8x16_t bb = vreinterpretq_u8_u64( vzip1q_u64( vreinterpretq_u64_u8( rgb01.val[1] ), vreinterpretq_u64_u8( rgb23.val[1] ) ) ); |
2860 | |
2861 | uint8x16x2_t red = vzipq_u8( rr, uint8x16_t() ); |
2862 | uint8x16x2_t grn = vzipq_u8( gg, uint8x16_t() ); |
2863 | uint8x16x2_t blu = vzipq_u8( bb, uint8x16_t() ); |
2864 | ch.r = red; |
2865 | ch.b = blu; |
2866 | ch.g = grn; |
2867 | #endif |
2868 | return ch; |
2869 | } |
2870 | #endif |
2871 | |
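// Per-pixel luma for the block. The SIMD paths use integer weights (14, 76, 38) that sum to 128,
// so the >>7 keeps the result in 0..255; the ratios roughly follow the usual Rec.601 luma
// coefficients. The block's min/max luma and the indices of the pixels holding them are recorded
// for the mode-selection heuristic.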
2872 | #if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__) |
2873 | static etcpak_force_inline void CalculateLuma( Channels& ch, Luma& luma ) |
2874 | #else |
2875 | static etcpak_force_inline void CalculateLuma( const uint8_t* src, Luma& luma ) |
2876 | #endif |
2877 | { |
2878 | #ifdef __AVX2__ |
2879 | __m256i b16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.b8 ), _mm256_set1_epi16( 14 ) ); |
2880 | __m256i g16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.g8 ), _mm256_set1_epi16( 76 ) ); |
2881 | __m256i r16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.r8 ), _mm256_set1_epi16( 38 ) ); |
2882 | |
2883 | __m256i luma_16bit = _mm256_add_epi16( _mm256_add_epi16( g16_luma, r16_luma ), b16_luma ); |
2884 | __m256i luma_8bit_m256i = _mm256_srli_epi16( luma_16bit, 7 ); |
2885 | __m128i luma_8bit_lo = _mm256_extractf128_si256( luma_8bit_m256i, 0 ); |
2886 | __m128i luma_8bit_hi = _mm256_extractf128_si256( luma_8bit_m256i, 1 ); |
2887 | |
2888 | static const __m128i interleaving_mask_lo = _mm_set_epi8( 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0 ); |
2889 | static const __m128i interleaving_mask_hi = _mm_set_epi8( 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1 ); |
2890 | __m128i luma_8bit_lo_moved = _mm_shuffle_epi8( luma_8bit_lo, interleaving_mask_lo ); |
2891 | __m128i luma_8bit_hi_moved = _mm_shuffle_epi8( luma_8bit_hi, interleaving_mask_hi ); |
2892 | __m128i luma_8bit = _mm_or_si128( luma_8bit_hi_moved, luma_8bit_lo_moved ); |
2893 | luma.luma8 = luma_8bit; |
2894 | |
2895 | // min/max calculation |
2896 | luma.min = hMin( luma_8bit, luma.minIdx ) * 0.00392156f; |
2897 | luma.max = hMax( luma_8bit, luma.maxIdx ) * 0.00392156f; |
2898 | #elif defined __ARM_NEON && defined __aarch64__ |
    //multiply each channel of rows 0,1 and 2,3 by its luma weight
2900 | uint16x8_t red0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.r.val[0] ), 14 ); |
2901 | uint16x8_t red1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.r.val[1] ), 14 ); |
2902 | uint16x8_t grn0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.g.val[0] ), 76 ); |
2903 | uint16x8_t grn1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.g.val[1] ), 76 ); |
2904 | uint16x8_t blu0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.b.val[0] ), 38 ); |
2905 | uint16x8_t blu1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.b.val[1] ), 38 ); |
2906 | |
2907 | //calculate luma for rows 0,1 and 2,3 |
2908 | uint16x8_t lum_r01 = vaddq_u16( vaddq_u16( red0, grn0 ), blu0 ); |
2909 | uint16x8_t lum_r23 = vaddq_u16( vaddq_u16( red1, grn1 ), blu1 ); |
2910 | |
    //divide the luma values by 128 (right shift) and narrow the results to 8 bits
    uint8x8_t lum_r01_d = vshrn_n_u16( lum_r01, 7 );
    uint8x8_t lum_r23_d = vshrn_n_u16( lum_r23, 7 );

    luma.luma8 = vcombine_u8( lum_r01_d, lum_r23_d );
    //find the min and max luma values and their indices; scale the values to [0,1]
2917 | luma.min = hMin( luma.luma8, luma.minIdx ) * 0.00392156f; |
2918 | luma.max = hMax( luma.luma8, luma.maxIdx ) * 0.00392156f; |
2919 | #else |
2920 | for( int i = 0; i < 16; ++i ) |
2921 | { |
        luma.val[i] = ( src[i * 4 + 2] * 76 + src[i * 4 + 1] * 150 + src[i * 4] * 28 ) / 254; // weighted luma; the weights sum to 254, so the result stays within 0..255
2923 | if( luma.min > luma.val[i] ) |
2924 | { |
2925 | luma.min = luma.val[i]; |
2926 | luma.minIdx = i; |
2927 | } |
2928 | if( luma.max < luma.val[i] ) |
2929 | { |
2930 | luma.max = luma.val[i]; |
2931 | luma.maxIdx = i; |
2932 | } |
2933 | } |
2934 | #endif |
2935 | } |
2936 | |
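// Early compression-mode decision from the block's luma statistics: a very low contrast block
// selects planar mode outright; a low-contrast block selects planar only when the min/max luma
// sit on opposite corners of the block; a high-contrast block is routed to the ETC2 T/H search;
// everything else is left undecided.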
2937 | static etcpak_force_inline uint8_t SelectModeETC2( const Luma& luma ) |
2938 | { |
#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__) // must match the paths where Luma min/max are already normalized to [0,1]
2940 | const float lumaRange = ( luma.max - luma.min ); |
2941 | #else |
2942 | const float lumaRange = ( luma.max - luma.min ) * ( 1.f / 255.f ); |
2943 | #endif |
2944 | // filters a very-low-contrast block |
2945 | if( lumaRange <= ecmd_threshold[0] ) |
2946 | { |
2947 | return ModePlanar; |
2948 | } |
2949 | // checks whether a pair of the corner pixels in a block has the min/max luma values; |
2950 | // if so, the ETC2 planar mode is enabled, and otherwise, the ETC1 mode is enabled |
2951 | else if( lumaRange <= ecmd_threshold[1] ) |
2952 | { |
2953 | #ifdef __AVX2__ |
2954 | static const __m128i corner_pair = _mm_set_epi8( 1, 1, 1, 1, 1, 1, 1, 1, 0, 15, 3, 12, 12, 3, 15, 0 ); |
2955 | __m128i current_max_min = _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, luma.minIdx, luma.maxIdx, luma.minIdx, luma.maxIdx, luma.minIdx, luma.maxIdx, luma.minIdx, luma.maxIdx ); |
2956 | |
2957 | __m128i max_min_result = _mm_cmpeq_epi16( corner_pair, current_max_min ); |
2958 | |
2959 | int mask = _mm_movemask_epi8( max_min_result ); |
2960 | if( mask ) |
2961 | { |
2962 | return ModePlanar; |
2963 | } |
2964 | #else |
2965 | // check whether a pair of the corner pixels in a block has the min/max luma values; |
2966 | // if so, the ETC2 planar mode is enabled. |
2967 | if( ( luma.minIdx == 0 && luma.maxIdx == 15 ) || |
2968 | ( luma.minIdx == 15 && luma.maxIdx == 0 ) || |
2969 | ( luma.minIdx == 3 && luma.maxIdx == 12 ) || |
2970 | ( luma.minIdx == 12 && luma.maxIdx == 3 ) ) |
2971 | { |
2972 | return ModePlanar; |
2973 | } |
2974 | #endif |
2975 | } |
2976 | // filters a high-contrast block for checking both ETC1 mode and the ETC2 T/H mode |
2977 | else if( lumaRange >= ecmd_threshold[2] ) |
2978 | { |
2979 | return ModeTH; |
2980 | } |
2981 | return ModeUndecided; |
2982 | } |
2983 | |
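// ETC2 encoding of a single 4x4 block: after the solid-color early out, the optional heuristic
// picks a mode from the block's luma; planar blocks can return immediately, otherwise the regular
// ETC1 search runs and, for T/H candidates, compressBlockTH provides a competing encoding whose
// error is weighed against the ETC1 result when the selectors are encoded.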
2984 | static etcpak_force_inline uint64_t ProcessRGB_ETC2( const uint8_t* src, bool useHeuristics ) |
2985 | { |
2986 | #ifdef __AVX2__ |
2987 | uint64_t d = CheckSolid_AVX2( src ); |
2988 | if( d != 0 ) return d; |
2989 | #else |
2990 | uint64_t d = CheckSolid( src ); |
2991 | if (d != 0) return d; |
2992 | #endif |
2993 | |
2994 | uint8_t mode = ModeUndecided; |
2995 | Luma luma; |
2996 | #ifdef __AVX2__ |
2997 | Channels ch = GetChannels( src ); |
2998 | if( useHeuristics ) |
2999 | { |
3000 | CalculateLuma( ch, luma ); |
3001 | mode = SelectModeETC2( luma ); |
3002 | } |
3003 | |
3004 | auto plane = Planar_AVX2( ch, mode, useHeuristics ); |
3005 | if( useHeuristics && mode == ModePlanar ) return plane.plane; |
3006 | |
3007 | alignas( 32 ) v4i a[8]; |
3008 | __m128i err0 = PrepareAverages_AVX2( a, plane.sum4 ); |
3009 | |
3010 | // Get index of minimum error (err0) |
3011 | __m128i err1 = _mm_shuffle_epi32( err0, _MM_SHUFFLE( 2, 3, 0, 1 ) ); |
3012 | __m128i errMin0 = _mm_min_epu32(err0, err1); |
3013 | |
3014 | __m128i errMin1 = _mm_shuffle_epi32( errMin0, _MM_SHUFFLE( 1, 0, 3, 2 ) ); |
3015 | __m128i errMin2 = _mm_min_epu32( errMin1, errMin0 ); |
3016 | |
3017 | __m128i errMask = _mm_cmpeq_epi32( errMin2, err0 ); |
3018 | |
3019 | uint32_t mask = _mm_movemask_epi8( errMask ); |
3020 | |
3021 | size_t idx = _bit_scan_forward( mask ) >> 2; |
3022 | |
3023 | d = EncodeAverages_AVX2( a, idx ); |
3024 | |
3025 | alignas(32) uint32_t terr[2][8] = {}; |
3026 | alignas(32) uint32_t tsel[8]; |
3027 | |
3028 | if ((idx == 0) || (idx == 2)) |
3029 | { |
3030 | FindBestFit_4x2_AVX2( terr, tsel, a, idx * 2, src ); |
3031 | } |
3032 | else |
3033 | { |
3034 | FindBestFit_2x4_AVX2( terr, tsel, a, idx * 2, src ); |
3035 | } |
3036 | |
3037 | if( useHeuristics ) |
3038 | { |
3039 | if( mode == ModeTH ) |
3040 | { |
3041 | uint64_t result = 0; |
3042 | uint64_t error = 0; |
3043 | uint32_t compressed[4] = { 0, 0, 0, 0 }; |
3044 | bool tMode = false; |
3045 | |
3046 | error = compressBlockTH( (uint8_t*)src, luma, compressed[0], compressed[1], tMode, ch.r8, ch.g8, ch.b8 ); |
3047 | if( tMode ) |
3048 | { |
3049 | stuff59bits( compressed[0], compressed[1], compressed[2], compressed[3] ); |
3050 | } |
3051 | else |
3052 | { |
3053 | stuff58bits( compressed[0], compressed[1], compressed[2], compressed[3] ); |
3054 | } |
3055 | |
3056 | result = (uint32_t)_bswap( compressed[2] ); |
3057 | result |= static_cast<uint64_t>( _bswap( compressed[3] ) ) << 32; |
3058 | |
3059 | plane.plane = result; |
3060 | plane.error = error; |
3061 | } |
3062 | else |
3063 | { |
3064 | plane.plane = 0; |
3065 | plane.error = MaxError; |
3066 | } |
3067 | } |
3068 | |
3069 | return EncodeSelectors_AVX2( d, terr, tsel, ( idx % 2 ) == 1, plane.plane, plane.error ); |
3070 | #else |
3071 | if( useHeuristics ) |
3072 | { |
3073 | #if defined __ARM_NEON && defined __aarch64__ |
3074 | Channels ch = GetChannels( src ); |
3075 | CalculateLuma( ch, luma ); |
3076 | #else |
3077 | CalculateLuma( src, luma ); |
3078 | #endif |
3079 | mode = SelectModeETC2( luma ); |
3080 | } |
3081 | #ifdef __ARM_NEON |
3082 | auto result = Planar_NEON( src, mode, useHeuristics ); |
3083 | #else |
3084 | auto result = Planar( src, mode, useHeuristics ); |
3085 | #endif |
3086 | if( result.second == 0 ) return result.first; |
3087 | |
3088 | v4i a[8]; |
3089 | unsigned int err[4] = {}; |
3090 | PrepareAverages( a, src, err ); |
3091 | size_t idx = GetLeastError( err, 4 ); |
3092 | EncodeAverages( d, a, idx ); |
3093 | |
3094 | #if ( defined __SSE4_1__ || defined __ARM_NEON ) && !defined REFERENCE_IMPLEMENTATION |
3095 | uint32_t terr[2][8] = {}; |
3096 | #else |
3097 | uint64_t terr[2][8] = {}; |
3098 | #endif |
3099 | uint16_t tsel[16][8]; |
3100 | auto id = g_id[idx]; |
3101 | FindBestFit( terr, tsel, a, id, src ); |
3102 | |
3103 | if( useHeuristics ) |
3104 | { |
3105 | if( mode == ModeTH ) |
3106 | { |
3107 | uint32_t compressed[4] = { 0, 0, 0, 0 }; |
3108 | bool tMode = false; |
3109 | |
3110 | result.second = compressBlockTH( (uint8_t*)src, luma, compressed[0], compressed[1], tMode ); |
3111 | if( tMode ) |
3112 | { |
3113 | stuff59bits( compressed[0], compressed[1], compressed[2], compressed[3] ); |
3114 | } |
3115 | else |
3116 | { |
3117 | stuff58bits( compressed[0], compressed[1], compressed[2], compressed[3] ); |
3118 | } |
3119 | |
3120 | result.first = (uint32_t)_bswap( compressed[2] ); |
3121 | result.first |= static_cast<uint64_t>( _bswap( compressed[3] ) ) << 32; |
3122 | } |
3123 | else |
3124 | { |
3125 | result.first = 0; |
3126 | result.second = MaxError; |
3127 | } |
3128 | } |
3129 | |
3130 | return EncodeSelectors( d, terr, tsel, id, result.first, result.second ); |
3131 | #endif |
3132 | } |
3133 | |
3134 | #ifdef __SSE4_1__ |
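// Widen<K> broadcasts the K-th 16-bit lane of src across all eight lanes.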
3135 | template<int K> |
3136 | static etcpak_force_inline __m128i Widen( const __m128i src ) |
3137 | { |
3138 | static_assert( K >= 0 && K <= 7, "Index out of range" ); |
3139 | |
3140 | __m128i tmp; |
3141 | switch( K ) |
3142 | { |
3143 | case 0: |
3144 | tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 0, 0, 0, 0 ) ); |
3145 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) ); |
3146 | case 1: |
3147 | tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 1, 1, 1, 1 ) ); |
3148 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) ); |
3149 | case 2: |
3150 | tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
3151 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) ); |
3152 | case 3: |
3153 | tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 3, 3, 3, 3 ) ); |
3154 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) ); |
3155 | case 4: |
3156 | tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 0, 0, 0, 0 ) ); |
3157 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
3158 | case 5: |
3159 | tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 1, 1, 1, 1 ) ); |
3160 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
3161 | case 6: |
3162 | tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
3163 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
3164 | case 7: |
3165 | tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 3, 3, 3, 3 ) ); |
3166 | return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) ); |
3167 | } |
3168 | } |
3169 | |
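// Maps an EAC modifier-table index (0..15) to the lane of the multiplier vector used for it;
// the mapping is many-to-one, presumably because several tables share the same entry in
// g_alphaRange (only lanes 0..5 are referenced).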
3170 | static etcpak_force_inline int GetMulSel( int sel ) |
3171 | { |
3172 | switch( sel ) |
3173 | { |
3174 | case 0: |
3175 | return 0; |
3176 | case 1: |
3177 | case 2: |
3178 | case 3: |
3179 | return 1; |
3180 | case 4: |
3181 | return 2; |
3182 | case 5: |
3183 | case 6: |
3184 | case 7: |
3185 | return 3; |
3186 | case 8: |
3187 | case 9: |
3188 | case 10: |
3189 | case 11: |
3190 | case 12: |
3191 | case 13: |
3192 | return 4; |
3193 | case 14: |
3194 | case 15: |
3195 | return 5; |
3196 | } |
3197 | } |
3198 | |
3199 | #endif |
3200 | |
3201 | #ifdef __ARM_NEON |
3202 | |
3203 | static constexpr etcpak_force_inline int GetMulSel(int sel) |
3204 | { |
3205 | return ( sel < 1 ) ? 0 : ( sel < 4 ) ? 1 : ( sel < 5 ) ? 2 : ( sel < 8 ) ? 3 : ( sel < 14 ) ? 4 : 5; |
3206 | } |
3207 | |
3208 | static constexpr int ClampConstant( int x, int min, int max ) |
3209 | { |
3210 | return x < min ? min : x > max ? max : x; |
3211 | } |
3212 | |
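// Helpers for the NEON EAC alpha path: ErrorProbe_EAC_NEON computes the squared differences
// between one source alpha value and the eight reconstructed values of a candidate table,
// MinError_EAC_NEON reduces that to the per-pixel minimum, MinErrorIndex_EAC_NEON additionally
// extracts the winning 3-bit selector and shifts it into its slot (bits 45 - Index*3), and
// WidenMultiplier_EAC_NEON broadcasts the multiplier lane that GetMulSel assigns to the table.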
3213 | template <int Index> |
3214 | etcpak_force_inline static uint16x8_t ErrorProbe_EAC_NEON( uint8x8_t recVal, uint8x16_t alphaBlock ) |
3215 | { |
3216 | uint8x8_t srcValWide; |
3217 | #ifndef __aarch64__ |
3218 | if( Index < 8 ) |
3219 | srcValWide = vdup_lane_u8( vget_low_u8( alphaBlock ), ClampConstant( Index, 0, 7 ) ); |
3220 | else |
3221 | srcValWide = vdup_lane_u8( vget_high_u8( alphaBlock ), ClampConstant( Index - 8, 0, 7 ) ); |
3222 | #else |
3223 | srcValWide = vdup_laneq_u8( alphaBlock, Index ); |
3224 | #endif |
3225 | |
3226 | uint8x8_t deltaVal = vabd_u8( srcValWide, recVal ); |
3227 | return vmull_u8( deltaVal, deltaVal ); |
3228 | } |
3229 | |
3230 | etcpak_force_inline static uint16_t MinError_EAC_NEON( uint16x8_t errProbe ) |
3231 | { |
3232 | #ifndef __aarch64__ |
3233 | uint16x4_t tmpErr = vpmin_u16( vget_low_u16( errProbe ), vget_high_u16( errProbe ) ); |
3234 | tmpErr = vpmin_u16( tmpErr, tmpErr ); |
3235 | return vpmin_u16( tmpErr, tmpErr )[0]; |
3236 | #else |
3237 | return vminvq_u16( errProbe ); |
3238 | #endif |
3239 | } |
3240 | |
3241 | template <int Index> |
3242 | etcpak_force_inline static uint64_t MinErrorIndex_EAC_NEON( uint8x8_t recVal, uint8x16_t alphaBlock ) |
3243 | { |
3244 | uint16x8_t errProbe = ErrorProbe_EAC_NEON<Index>( recVal, alphaBlock ); |
3245 | uint16x8_t minErrMask = vceqq_u16( errProbe, vdupq_n_u16( MinError_EAC_NEON( errProbe ) ) ); |
3246 | uint64_t idx = __builtin_ctzll( vget_lane_u64( vreinterpret_u64_u8( vqmovn_u16( minErrMask ) ), 0 ) ); |
3247 | idx >>= 3; |
3248 | idx <<= 45 - Index * 3; |
3249 | |
3250 | return idx; |
3251 | } |
3252 | |
3253 | template <int Index> |
3254 | etcpak_force_inline static int16x8_t WidenMultiplier_EAC_NEON( int16x8_t multipliers ) |
3255 | { |
3256 | constexpr int Lane = GetMulSel( Index ); |
3257 | #ifndef __aarch64__ |
3258 | if( Lane < 4 ) |
3259 | return vdupq_lane_s16( vget_low_s16( multipliers ), ClampConstant( Lane, 0, 3 ) ); |
3260 | else |
3261 | return vdupq_lane_s16( vget_high_s16( multipliers ), ClampConstant( Lane - 4, 0, 3 ) ); |
3262 | #else |
3263 | return vdupq_laneq_s16( multipliers, Lane ); |
3264 | #endif |
3265 | } |
3266 | |
3267 | #endif |
3268 | |
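// EAC alpha block encoding. The base codeword is the midpoint of the block's alpha range, a
// per-table multiplier is derived from that range, and all 16 modifier tables are scored by the
// sum of per-pixel squared errors; the winner is packed as base<<56 | multiplier<<52 | table<<48
// followed by sixteen 3-bit pixel indices (pixel 0 in the top bits), then byte-swapped into the
// file's bit order.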
3269 | static etcpak_force_inline uint64_t ProcessAlpha_ETC2( const uint8_t* src ) |
3270 | { |
3271 | #if defined __SSE4_1__ |
3272 | // Check solid |
3273 | __m128i s = _mm_loadu_si128( (__m128i*)src ); |
3274 | __m128i solidCmp = _mm_set1_epi8( src[0] ); |
3275 | __m128i cmpRes = _mm_cmpeq_epi8( s, solidCmp ); |
3276 | if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) ) |
3277 | { |
3278 | return src[0]; |
3279 | } |
3280 | |
3281 | // Calculate min, max |
3282 | __m128i s1 = _mm_shuffle_epi32( s, _MM_SHUFFLE( 2, 3, 0, 1 ) ); |
3283 | __m128i max1 = _mm_max_epu8( s, s1 ); |
3284 | __m128i min1 = _mm_min_epu8( s, s1 ); |
3285 | __m128i smax2 = _mm_shuffle_epi32( max1, _MM_SHUFFLE( 0, 0, 2, 2 ) ); |
3286 | __m128i smin2 = _mm_shuffle_epi32( min1, _MM_SHUFFLE( 0, 0, 2, 2 ) ); |
3287 | __m128i max2 = _mm_max_epu8( max1, smax2 ); |
3288 | __m128i min2 = _mm_min_epu8( min1, smin2 ); |
3289 | __m128i smax3 = _mm_alignr_epi8( max2, max2, 2 ); |
3290 | __m128i smin3 = _mm_alignr_epi8( min2, min2, 2 ); |
3291 | __m128i max3 = _mm_max_epu8( max2, smax3 ); |
3292 | __m128i min3 = _mm_min_epu8( min2, smin3 ); |
3293 | __m128i smax4 = _mm_alignr_epi8( max3, max3, 1 ); |
3294 | __m128i smin4 = _mm_alignr_epi8( min3, min3, 1 ); |
3295 | __m128i max = _mm_max_epu8( max3, smax4 ); |
3296 | __m128i min = _mm_min_epu8( min3, smin4 ); |
3297 | __m128i max16 = _mm_unpacklo_epi8( max, _mm_setzero_si128() ); |
3298 | __m128i min16 = _mm_unpacklo_epi8( min, _mm_setzero_si128() ); |
3299 | |
3300 | // src range, mid |
3301 | __m128i srcRange = _mm_sub_epi16( max16, min16 ); |
3302 | __m128i srcRangeHalf = _mm_srli_epi16( srcRange, 1 ); |
3303 | __m128i srcMid = _mm_add_epi16( min16, srcRangeHalf ); |
3304 | |
3305 | // multiplier |
3306 | __m128i mul1 = _mm_mulhi_epi16( srcRange, g_alphaRange_SIMD ); |
3307 | __m128i mul = _mm_add_epi16( mul1, _mm_set1_epi16( 1 ) ); |
3308 | |
3309 | // wide source |
3310 | __m128i s16_1 = _mm_shuffle_epi32( s, _MM_SHUFFLE( 3, 2, 3, 2 ) ); |
3311 | __m128i s16[2] = { _mm_unpacklo_epi8( s, _mm_setzero_si128() ), _mm_unpacklo_epi8( s16_1, _mm_setzero_si128() ) }; |
3312 | |
3313 | __m128i sr[16] = { |
3314 | Widen<0>( s16[0] ), |
3315 | Widen<1>( s16[0] ), |
3316 | Widen<2>( s16[0] ), |
3317 | Widen<3>( s16[0] ), |
3318 | Widen<4>( s16[0] ), |
3319 | Widen<5>( s16[0] ), |
3320 | Widen<6>( s16[0] ), |
3321 | Widen<7>( s16[0] ), |
3322 | Widen<0>( s16[1] ), |
3323 | Widen<1>( s16[1] ), |
3324 | Widen<2>( s16[1] ), |
3325 | Widen<3>( s16[1] ), |
3326 | Widen<4>( s16[1] ), |
3327 | Widen<5>( s16[1] ), |
3328 | Widen<6>( s16[1] ), |
3329 | Widen<7>( s16[1] ) |
3330 | }; |
3331 | |
3332 | #ifdef __AVX2__ |
3333 | __m256i srcRangeWide = _mm256_broadcastsi128_si256( srcRange ); |
3334 | __m256i srcMidWide = _mm256_broadcastsi128_si256( srcMid ); |
3335 | |
3336 | __m256i mulWide1 = _mm256_mulhi_epi16( srcRangeWide, g_alphaRange_AVX ); |
3337 | __m256i mulWide = _mm256_add_epi16( mulWide1, _mm256_set1_epi16( 1 ) ); |
3338 | |
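    // modMul[j] holds the reconstructed alpha value produced by the j-th modifier of each of the
    // 16 candidate tables, one table per 16-bit lane (this assumes g_alpha_AVX[j] stores the j-th
    // modifier of all 16 tables, which matches how the selector is extracted below)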
3339 | __m256i modMul[8] = { |
3340 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[0] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[0] ) ) ), _mm256_setzero_si256() ), |
3341 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[1] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[1] ) ) ), _mm256_setzero_si256() ), |
3342 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[2] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[2] ) ) ), _mm256_setzero_si256() ), |
3343 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[3] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[3] ) ) ), _mm256_setzero_si256() ), |
3344 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[4] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[4] ) ) ), _mm256_setzero_si256() ), |
3345 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[5] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[5] ) ) ), _mm256_setzero_si256() ), |
3346 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[6] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[6] ) ) ), _mm256_setzero_si256() ), |
3347 | _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[7] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[7] ) ) ), _mm256_setzero_si256() ), |
3348 | }; |
3349 | |
3350 | // find selector |
3351 | __m256i mulErr = _mm256_setzero_si256(); |
3352 | for( int j=0; j<16; j++ ) |
3353 | { |
3354 | __m256i s16Wide = _mm256_broadcastsi128_si256( sr[j] ); |
3355 | __m256i err1, err2; |
3356 | |
3357 | err1 = _mm256_sub_epi16( s16Wide, modMul[0] ); |
3358 | __m256i localErr = _mm256_mullo_epi16( err1, err1 ); |
3359 | |
3360 | err1 = _mm256_sub_epi16( s16Wide, modMul[1] ); |
3361 | err2 = _mm256_mullo_epi16( err1, err1 ); |
3362 | localErr = _mm256_min_epu16( localErr, err2 ); |
3363 | |
3364 | err1 = _mm256_sub_epi16( s16Wide, modMul[2] ); |
3365 | err2 = _mm256_mullo_epi16( err1, err1 ); |
3366 | localErr = _mm256_min_epu16( localErr, err2 ); |
3367 | |
3368 | err1 = _mm256_sub_epi16( s16Wide, modMul[3] ); |
3369 | err2 = _mm256_mullo_epi16( err1, err1 ); |
3370 | localErr = _mm256_min_epu16( localErr, err2 ); |
3371 | |
3372 | err1 = _mm256_sub_epi16( s16Wide, modMul[4] ); |
3373 | err2 = _mm256_mullo_epi16( err1, err1 ); |
3374 | localErr = _mm256_min_epu16( localErr, err2 ); |
3375 | |
3376 | err1 = _mm256_sub_epi16( s16Wide, modMul[5] ); |
3377 | err2 = _mm256_mullo_epi16( err1, err1 ); |
3378 | localErr = _mm256_min_epu16( localErr, err2 ); |
3379 | |
3380 | err1 = _mm256_sub_epi16( s16Wide, modMul[6] ); |
3381 | err2 = _mm256_mullo_epi16( err1, err1 ); |
3382 | localErr = _mm256_min_epu16( localErr, err2 ); |
3383 | |
3384 | err1 = _mm256_sub_epi16( s16Wide, modMul[7] ); |
3385 | err2 = _mm256_mullo_epi16( err1, err1 ); |
3386 | localErr = _mm256_min_epu16( localErr, err2 ); |
3387 | |
        // saturating add: the per-table error sums clamp at 0xFFFF instead of wrapping, which is fine since we only look for the smallest one
3389 | mulErr = _mm256_adds_epu16( mulErr, localErr ); |
3390 | } |
3391 | uint64_t minPos1 = _mm_cvtsi128_si64( _mm_minpos_epu16( _mm256_castsi256_si128( mulErr ) ) ); |
3392 | uint64_t minPos2 = _mm_cvtsi128_si64( _mm_minpos_epu16( _mm256_extracti128_si256( mulErr, 1 ) ) ); |
3393 | int sel = ( ( minPos1 & 0xFFFF ) < ( minPos2 & 0xFFFF ) ) ? ( minPos1 >> 16 ) : ( 8 + ( minPos2 >> 16 ) ); |
3394 | |
3395 | __m128i recVal16; |
3396 | switch( sel ) |
3397 | { |
3398 | case 0: |
3399 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ) ), _mm_setzero_si128() ); |
3400 | break; |
3401 | case 1: |
3402 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ) ), _mm_setzero_si128() ); |
3403 | break; |
3404 | case 2: |
3405 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ) ), _mm_setzero_si128() ); |
3406 | break; |
3407 | case 3: |
3408 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ) ), _mm_setzero_si128() ); |
3409 | break; |
3410 | case 4: |
3411 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ) ), _mm_setzero_si128() ); |
3412 | break; |
3413 | case 5: |
3414 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ) ), _mm_setzero_si128() ); |
3415 | break; |
3416 | case 6: |
3417 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ) ), _mm_setzero_si128() ); |
3418 | break; |
3419 | case 7: |
3420 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ) ), _mm_setzero_si128() ); |
3421 | break; |
3422 | case 8: |
3423 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ) ), _mm_setzero_si128() ); |
3424 | break; |
3425 | case 9: |
3426 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ) ), _mm_setzero_si128() ); |
3427 | break; |
3428 | case 10: |
3429 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ) ), _mm_setzero_si128() ); |
3430 | break; |
3431 | case 11: |
3432 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ) ), _mm_setzero_si128() ); |
3433 | break; |
3434 | case 12: |
3435 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ) ), _mm_setzero_si128() ); |
3436 | break; |
3437 | case 13: |
3438 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ) ), _mm_setzero_si128() ); |
3439 | break; |
3440 | case 14: |
3441 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ) ), _mm_setzero_si128() ); |
3442 | break; |
3443 | case 15: |
3444 | recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ) ), _mm_setzero_si128() ); |
3445 | break; |
3446 | default: |
3447 | assert( false ); |
3448 | break; |
3449 | } |
3450 | #else |
    // reconstructed values (clamped to 0..255 and re-widened to 16 bits) for all 16 modifier tables
3452 | __m128i rangeMul[16] = { |
3453 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ) ), _mm_setzero_si128() ), |
3454 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ) ), _mm_setzero_si128() ), |
3455 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ) ), _mm_setzero_si128() ), |
3456 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ) ), _mm_setzero_si128() ), |
3457 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ) ), _mm_setzero_si128() ), |
3458 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ) ), _mm_setzero_si128() ), |
3459 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ) ), _mm_setzero_si128() ), |
3460 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ) ), _mm_setzero_si128() ), |
3461 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ) ), _mm_setzero_si128() ), |
3462 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ) ), _mm_setzero_si128() ), |
3463 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ) ), _mm_setzero_si128() ), |
3464 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ) ), _mm_setzero_si128() ), |
3465 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ) ), _mm_setzero_si128() ), |
3466 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ) ), _mm_setzero_si128() ), |
3467 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ) ), _mm_setzero_si128() ), |
3468 | _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ) ), _mm_setzero_si128() ) |
3469 | }; |
3470 | |
3471 | // find selector |
3472 | int err = std::numeric_limits<int>::max(); |
3473 | int sel; |
3474 | for( int r=0; r<16; r++ ) |
3475 | { |
3476 | __m128i err1, err2, minerr; |
3477 | __m128i recVal16 = rangeMul[r]; |
3478 | int rangeErr; |
3479 | |
3480 | err1 = _mm_sub_epi16( sr[0], recVal16 ); |
3481 | err2 = _mm_mullo_epi16( err1, err1 ); |
3482 | minerr = _mm_minpos_epu16( err2 ); |
3483 | rangeErr = _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
3484 | |
3485 | err1 = _mm_sub_epi16( sr[1], recVal16 ); |
3486 | err2 = _mm_mullo_epi16( err1, err1 ); |
3487 | minerr = _mm_minpos_epu16( err2 ); |
3488 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
3489 | |
3490 | err1 = _mm_sub_epi16( sr[2], recVal16 ); |
3491 | err2 = _mm_mullo_epi16( err1, err1 ); |
3492 | minerr = _mm_minpos_epu16( err2 ); |
3493 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
3494 | |
3495 | err1 = _mm_sub_epi16( sr[3], recVal16 ); |
3496 | err2 = _mm_mullo_epi16( err1, err1 ); |
3497 | minerr = _mm_minpos_epu16( err2 ); |
3498 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
3499 | |
3500 | err1 = _mm_sub_epi16( sr[4], recVal16 ); |
3501 | err2 = _mm_mullo_epi16( err1, err1 ); |
3502 | minerr = _mm_minpos_epu16( err2 ); |
3503 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
3504 | |
3505 | err1 = _mm_sub_epi16( sr[5], recVal16 ); |
3506 | err2 = _mm_mullo_epi16( err1, err1 ); |
3507 | minerr = _mm_minpos_epu16( err2 ); |
3508 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
3509 | |
3510 | err1 = _mm_sub_epi16( sr[6], recVal16 ); |
3511 | err2 = _mm_mullo_epi16( err1, err1 ); |
3512 | minerr = _mm_minpos_epu16( err2 ); |
3513 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
3514 | |
3515 | err1 = _mm_sub_epi16( sr[7], recVal16 ); |
3516 | err2 = _mm_mullo_epi16( err1, err1 ); |
3517 | minerr = _mm_minpos_epu16( err2 ); |
3518 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
3519 | |
3520 | err1 = _mm_sub_epi16( sr[8], recVal16 ); |
3521 | err2 = _mm_mullo_epi16( err1, err1 ); |
3522 | minerr = _mm_minpos_epu16( err2 ); |
3523 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
3524 | |
3525 | err1 = _mm_sub_epi16( sr[9], recVal16 ); |
3526 | err2 = _mm_mullo_epi16( err1, err1 ); |
3527 | minerr = _mm_minpos_epu16( err2 ); |
3528 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
3529 | |
3530 | err1 = _mm_sub_epi16( sr[10], recVal16 ); |
3531 | err2 = _mm_mullo_epi16( err1, err1 ); |
3532 | minerr = _mm_minpos_epu16( err2 ); |
3533 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
3534 | |
3535 | err1 = _mm_sub_epi16( sr[11], recVal16 ); |
3536 | err2 = _mm_mullo_epi16( err1, err1 ); |
3537 | minerr = _mm_minpos_epu16( err2 ); |
3538 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
3539 | |
3540 | err1 = _mm_sub_epi16( sr[12], recVal16 ); |
3541 | err2 = _mm_mullo_epi16( err1, err1 ); |
3542 | minerr = _mm_minpos_epu16( err2 ); |
3543 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
3544 | |
3545 | err1 = _mm_sub_epi16( sr[13], recVal16 ); |
3546 | err2 = _mm_mullo_epi16( err1, err1 ); |
3547 | minerr = _mm_minpos_epu16( err2 ); |
3548 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
3549 | |
3550 | err1 = _mm_sub_epi16( sr[14], recVal16 ); |
3551 | err2 = _mm_mullo_epi16( err1, err1 ); |
3552 | minerr = _mm_minpos_epu16( err2 ); |
3553 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
3554 | |
3555 | err1 = _mm_sub_epi16( sr[15], recVal16 ); |
3556 | err2 = _mm_mullo_epi16( err1, err1 ); |
3557 | minerr = _mm_minpos_epu16( err2 ); |
3558 | rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF; |
3559 | |
3560 | if( rangeErr < err ) |
3561 | { |
3562 | err = rangeErr; |
3563 | sel = r; |
3564 | if( err == 0 ) break; |
3565 | } |
3566 | } |
3567 | |
3568 | __m128i recVal16 = rangeMul[sel]; |
3569 | #endif |
3570 | |
3571 | // find indices |
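    // _mm_minpos_epu16 returns the minimum error in bits 0-15 and the index of the winning
    // modifier in bits 16-18; that 3-bit index is packed with pixel 0 in the top position
    // (bits 45-47) down to pixel 15 in bits 0-2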
3572 | __m128i err1, err2, minerr; |
3573 | uint64_t idx = 0, tmp; |
3574 | |
3575 | err1 = _mm_sub_epi16( sr[0], recVal16 ); |
3576 | err2 = _mm_mullo_epi16( err1, err1 ); |
3577 | minerr = _mm_minpos_epu16( err2 ); |
3578 | tmp = _mm_cvtsi128_si64( minerr ); |
3579 | idx |= ( tmp >> 16 ) << 15*3; |
3580 | |
3581 | err1 = _mm_sub_epi16( sr[1], recVal16 ); |
3582 | err2 = _mm_mullo_epi16( err1, err1 ); |
3583 | minerr = _mm_minpos_epu16( err2 ); |
3584 | tmp = _mm_cvtsi128_si64( minerr ); |
3585 | idx |= ( tmp >> 16 ) << 14*3; |
3586 | |
3587 | err1 = _mm_sub_epi16( sr[2], recVal16 ); |
3588 | err2 = _mm_mullo_epi16( err1, err1 ); |
3589 | minerr = _mm_minpos_epu16( err2 ); |
3590 | tmp = _mm_cvtsi128_si64( minerr ); |
3591 | idx |= ( tmp >> 16 ) << 13*3; |
3592 | |
3593 | err1 = _mm_sub_epi16( sr[3], recVal16 ); |
3594 | err2 = _mm_mullo_epi16( err1, err1 ); |
3595 | minerr = _mm_minpos_epu16( err2 ); |
3596 | tmp = _mm_cvtsi128_si64( minerr ); |
3597 | idx |= ( tmp >> 16 ) << 12*3; |
3598 | |
3599 | err1 = _mm_sub_epi16( sr[4], recVal16 ); |
3600 | err2 = _mm_mullo_epi16( err1, err1 ); |
3601 | minerr = _mm_minpos_epu16( err2 ); |
3602 | tmp = _mm_cvtsi128_si64( minerr ); |
3603 | idx |= ( tmp >> 16 ) << 11*3; |
3604 | |
3605 | err1 = _mm_sub_epi16( sr[5], recVal16 ); |
3606 | err2 = _mm_mullo_epi16( err1, err1 ); |
3607 | minerr = _mm_minpos_epu16( err2 ); |
3608 | tmp = _mm_cvtsi128_si64( minerr ); |
3609 | idx |= ( tmp >> 16 ) << 10*3; |
3610 | |
3611 | err1 = _mm_sub_epi16( sr[6], recVal16 ); |
3612 | err2 = _mm_mullo_epi16( err1, err1 ); |
3613 | minerr = _mm_minpos_epu16( err2 ); |
3614 | tmp = _mm_cvtsi128_si64( minerr ); |
3615 | idx |= ( tmp >> 16 ) << 9*3; |
3616 | |
3617 | err1 = _mm_sub_epi16( sr[7], recVal16 ); |
3618 | err2 = _mm_mullo_epi16( err1, err1 ); |
3619 | minerr = _mm_minpos_epu16( err2 ); |
3620 | tmp = _mm_cvtsi128_si64( minerr ); |
3621 | idx |= ( tmp >> 16 ) << 8*3; |
3622 | |
3623 | err1 = _mm_sub_epi16( sr[8], recVal16 ); |
3624 | err2 = _mm_mullo_epi16( err1, err1 ); |
3625 | minerr = _mm_minpos_epu16( err2 ); |
3626 | tmp = _mm_cvtsi128_si64( minerr ); |
3627 | idx |= ( tmp >> 16 ) << 7*3; |
3628 | |
3629 | err1 = _mm_sub_epi16( sr[9], recVal16 ); |
3630 | err2 = _mm_mullo_epi16( err1, err1 ); |
3631 | minerr = _mm_minpos_epu16( err2 ); |
3632 | tmp = _mm_cvtsi128_si64( minerr ); |
3633 | idx |= ( tmp >> 16 ) << 6*3; |
3634 | |
3635 | err1 = _mm_sub_epi16( sr[10], recVal16 ); |
3636 | err2 = _mm_mullo_epi16( err1, err1 ); |
3637 | minerr = _mm_minpos_epu16( err2 ); |
3638 | tmp = _mm_cvtsi128_si64( minerr ); |
3639 | idx |= ( tmp >> 16 ) << 5*3; |
3640 | |
3641 | err1 = _mm_sub_epi16( sr[11], recVal16 ); |
3642 | err2 = _mm_mullo_epi16( err1, err1 ); |
3643 | minerr = _mm_minpos_epu16( err2 ); |
3644 | tmp = _mm_cvtsi128_si64( minerr ); |
3645 | idx |= ( tmp >> 16 ) << 4*3; |
3646 | |
3647 | err1 = _mm_sub_epi16( sr[12], recVal16 ); |
3648 | err2 = _mm_mullo_epi16( err1, err1 ); |
3649 | minerr = _mm_minpos_epu16( err2 ); |
3650 | tmp = _mm_cvtsi128_si64( minerr ); |
3651 | idx |= ( tmp >> 16 ) << 3*3; |
3652 | |
3653 | err1 = _mm_sub_epi16( sr[13], recVal16 ); |
3654 | err2 = _mm_mullo_epi16( err1, err1 ); |
3655 | minerr = _mm_minpos_epu16( err2 ); |
3656 | tmp = _mm_cvtsi128_si64( minerr ); |
3657 | idx |= ( tmp >> 16 ) << 2*3; |
3658 | |
3659 | err1 = _mm_sub_epi16( sr[14], recVal16 ); |
3660 | err2 = _mm_mullo_epi16( err1, err1 ); |
3661 | minerr = _mm_minpos_epu16( err2 ); |
3662 | tmp = _mm_cvtsi128_si64( minerr ); |
3663 | idx |= ( tmp >> 16 ) << 1*3; |
3664 | |
3665 | err1 = _mm_sub_epi16( sr[15], recVal16 ); |
3666 | err2 = _mm_mullo_epi16( err1, err1 ); |
3667 | minerr = _mm_minpos_epu16( err2 ); |
3668 | tmp = _mm_cvtsi128_si64( minerr ); |
3669 | idx |= ( tmp >> 16 ) << 0*3; |
3670 | |
3671 | uint16_t rm[8]; |
3672 | _mm_storeu_si128( (__m128i*)rm, mul ); |
3673 | uint16_t sm = _mm_cvtsi128_si64( srcMid ); |
3674 | |
3675 | uint64_t d = ( uint64_t( sm ) << 56 ) | |
3676 | ( uint64_t( rm[GetMulSel( sel )] ) << 52 ) | |
3677 | ( uint64_t( sel ) << 48 ) | |
3678 | idx; |
3679 | |
3680 | return _bswap64( d ); |
3681 | #elif defined __ARM_NEON |
3682 | |
3683 | int16x8_t srcMidWide, multipliers; |
3684 | int srcMid; |
3685 | uint8x16_t srcAlphaBlock = vld1q_u8( src ); |
3686 | { |
3687 | uint8_t ref = src[0]; |
3688 | uint8x16_t a0 = vdupq_n_u8( ref ); |
3689 | uint8x16_t r = vceqq_u8( srcAlphaBlock, a0 ); |
3690 | int64x2_t m = vreinterpretq_s64_u8( r ); |
3691 | if( m[0] == -1 && m[1] == -1 ) |
3692 | return ref; |
3693 | |
3694 | // srcRange |
3695 | #ifdef __aarch64__ |
3696 | uint8_t min = vminvq_u8( srcAlphaBlock ); |
3697 | uint8_t max = vmaxvq_u8( srcAlphaBlock ); |
3698 | uint8_t srcRange = max - min; |
3699 | multipliers = vqaddq_s16( vshrq_n_s16( vqdmulhq_n_s16( g_alphaRange_NEON, srcRange ), 1 ), vdupq_n_s16( 1 ) ); |
3700 | srcMid = min + srcRange / 2; |
3701 | srcMidWide = vdupq_n_s16( srcMid ); |
3702 | #else |
3703 | uint8x8_t vmin = vpmin_u8( vget_low_u8( srcAlphaBlock ), vget_high_u8( srcAlphaBlock ) ); |
3704 | vmin = vpmin_u8( vmin, vmin ); |
3705 | vmin = vpmin_u8( vmin, vmin ); |
3706 | vmin = vpmin_u8( vmin, vmin ); |
3707 | uint8x8_t vmax = vpmax_u8( vget_low_u8( srcAlphaBlock ), vget_high_u8( srcAlphaBlock ) ); |
3708 | vmax = vpmax_u8( vmax, vmax ); |
3709 | vmax = vpmax_u8( vmax, vmax ); |
3710 | vmax = vpmax_u8( vmax, vmax ); |
3711 | |
3712 | int16x8_t srcRangeWide = vreinterpretq_s16_u16( vsubl_u8( vmax, vmin ) ); |
3713 | multipliers = vqaddq_s16( vshrq_n_s16( vqdmulhq_s16( g_alphaRange_NEON, srcRangeWide ), 1 ), vdupq_n_s16( 1 ) ); |
3714 | srcMidWide = vsraq_n_s16( vreinterpretq_s16_u16(vmovl_u8(vmin)), srcRangeWide, 1); |
3715 | srcMid = vgetq_lane_s16( srcMidWide, 0 ); |
3716 | #endif |
3717 | } |
3718 | |
3719 | // calculate reconstructed values |
3720 | #define EAC_APPLY_16X( m ) m( 0 ) m( 1 ) m( 2 ) m( 3 ) m( 4 ) m( 5 ) m( 6 ) m( 7 ) m( 8 ) m( 9 ) m( 10 ) m( 11 ) m( 12 ) m( 13 ) m( 14 ) m( 15 ) |
3721 | |
3722 | #define EAC_RECONSTRUCT_VALUE( n ) vqmovun_s16( vmlaq_s16( srcMidWide, g_alpha_NEON[n], WidenMultiplier_EAC_NEON<n>( multipliers ) ) ), |
3723 | uint8x8_t recVals[16] = { EAC_APPLY_16X( EAC_RECONSTRUCT_VALUE ) }; |
3724 | |
3725 | // find selector |
3726 | int err = std::numeric_limits<int>::max(); |
3727 | int sel = 0; |
3728 | for( int r = 0; r < 16; r++ ) |
3729 | { |
3730 | uint8x8_t recVal = recVals[r]; |
3731 | |
3732 | int rangeErr = 0; |
3733 | #define EAC_ACCUMULATE_ERROR( n ) rangeErr += MinError_EAC_NEON( ErrorProbe_EAC_NEON<n>( recVal, srcAlphaBlock ) ); |
3734 | EAC_APPLY_16X( EAC_ACCUMULATE_ERROR ) |
3735 | |
3736 | if( rangeErr < err ) |
3737 | { |
3738 | err = rangeErr; |
3739 | sel = r; |
3740 | if ( err == 0 ) break; |
3741 | } |
3742 | } |
3743 | |
3744 | // combine results |
3745 | uint64_t d = ( uint64_t( srcMid ) << 56 ) | |
3746 | ( uint64_t( multipliers[GetMulSel( sel )] ) << 52 ) | |
3747 | ( uint64_t( sel ) << 48); |
3748 | |
3749 | // generate indices |
3750 | uint8x8_t recVal = recVals[sel]; |
3751 | #define EAC_INSERT_INDEX(n) d |= MinErrorIndex_EAC_NEON<n>( recVal, srcAlphaBlock ); |
3752 | EAC_APPLY_16X( EAC_INSERT_INDEX ) |
3753 | |
3754 | return _bswap64( d ); |
3755 | |
3756 | #undef EAC_APPLY_16X |
3757 | #undef EAC_INSERT_INDEX |
3758 | #undef EAC_ACCUMULATE_ERROR |
3759 | #undef EAC_RECONSTRUCT_VALUE |
3760 | |
3761 | #else |
3762 | { |
3763 | bool solid = true; |
3764 | const uint8_t* ptr = src + 1; |
3765 | const uint8_t ref = *src; |
3766 | for( int i=1; i<16; i++ ) |
3767 | { |
3768 | if( ref != *ptr++ ) |
3769 | { |
3770 | solid = false; |
3771 | break; |
3772 | } |
3773 | } |
3774 | if( solid ) |
3775 | { |
3776 | return ref; |
3777 | } |
3778 | } |
3779 | |
3780 | uint8_t min = src[0]; |
3781 | uint8_t max = src[0]; |
3782 | for( int i=1; i<16; i++ ) |
3783 | { |
3784 | if( min > src[i] ) min = src[i]; |
3785 | else if( max < src[i] ) max = src[i]; |
3786 | } |
3787 | int srcRange = max - min; |
3788 | int srcMid = min + srcRange / 2; |
3789 | |
3790 | uint8_t buf[16][16]; |
3791 | int err = std::numeric_limits<int>::max(); |
3792 | int sel; |
3793 | int selmul; |
3794 | for( int r=0; r<16; r++ ) |
3795 | { |
3796 | int mul = ( ( srcRange * g_alphaRange[r] ) >> 16 ) + 1; |
3797 | |
3798 | int rangeErr = 0; |
3799 | for( int i=0; i<16; i++ ) |
3800 | { |
3801 | const auto srcVal = src[i]; |
3802 | |
3803 | int idx = 0; |
3804 | const auto modVal = g_alpha[r][0] * mul; |
3805 | const auto recVal = clampu8( srcMid + modVal ); |
3806 | int localErr = sq( srcVal - recVal ); |
3807 | |
3808 | if( localErr != 0 ) |
3809 | { |
3810 | for( int j=1; j<8; j++ ) |
3811 | { |
3812 | const auto modVal = g_alpha[r][j] * mul; |
3813 | const auto recVal = clampu8( srcMid + modVal ); |
3814 | const auto errProbe = sq( srcVal - recVal ); |
3815 | if( errProbe < localErr ) |
3816 | { |
3817 | localErr = errProbe; |
3818 | idx = j; |
3819 | } |
3820 | } |
3821 | } |
3822 | |
3823 | buf[r][i] = idx; |
3824 | rangeErr += localErr; |
3825 | } |
3826 | |
3827 | if( rangeErr < err ) |
3828 | { |
3829 | err = rangeErr; |
3830 | sel = r; |
3831 | selmul = mul; |
3832 | if( err == 0 ) break; |
3833 | } |
3834 | } |
3835 | |
    // pack the block: base codeword in bits 56..63, multiplier in bits 52..55,
    // modifier-table index in bits 48..51, then the sixteen 3-bit selectors
    // packed MSB-first starting at bit 45
    uint64_t d = ( uint64_t( srcMid ) << 56 ) |
        ( uint64_t( selmul ) << 52 ) |
        ( uint64_t( sel ) << 48 );

    int offset = 45;
    auto ptr = buf[sel];
    for( int i=0; i<16; i++ )
    {
        d |= uint64_t( *ptr++ ) << offset;
        offset -= 3;
    }

    // byte-swap to the big-endian order used by the EAC block format
    return _bswap64( d );
3849 | #endif |
3850 | } |
3851 | |
3852 | |
3853 | void CompressEtc1Alpha( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ) |
3854 | { |
3855 | int w = 0; |
3856 | uint32_t buf[4*4]; |
3857 | do |
3858 | { |
3859 | #ifdef __SSE4_1__ |
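        // load four rows of the block and transpose them into the column-major
        // order ProcessRGB() expects, then broadcast each pixel's alpha byte
        // (byte 3) across its four channel bytes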
3860 | __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); |
3861 | __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); |
3862 | __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); |
3863 | __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); |
3864 | |
3865 | _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); |
3866 | |
3867 | __m128i c0 = _mm_castps_si128( px0 ); |
3868 | __m128i c1 = _mm_castps_si128( px1 ); |
3869 | __m128i c2 = _mm_castps_si128( px2 ); |
3870 | __m128i c3 = _mm_castps_si128( px3 ); |
3871 | |
3872 | __m128i mask = _mm_setr_epi32( 0x03030303, 0x07070707, 0x0b0b0b0b, 0x0f0f0f0f ); |
3873 | __m128i p0 = _mm_shuffle_epi8( c0, mask ); |
3874 | __m128i p1 = _mm_shuffle_epi8( c1, mask ); |
3875 | __m128i p2 = _mm_shuffle_epi8( c2, mask ); |
3876 | __m128i p3 = _mm_shuffle_epi8( c3, mask ); |
3877 | |
3878 | _mm_store_si128( (__m128i*)(buf + 0), p0 ); |
3879 | _mm_store_si128( (__m128i*)(buf + 4), p1 ); |
3880 | _mm_store_si128( (__m128i*)(buf + 8), p2 ); |
3881 | _mm_store_si128( (__m128i*)(buf + 12), p3 ); |
3882 | |
3883 | src += 4; |
3884 | #else |
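        // scalar gather: walk the block column by column, replicating each
        // pixel's alpha into a gray RGB value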
3885 | auto ptr = buf; |
3886 | for( int x=0; x<4; x++ ) |
3887 | { |
3888 | unsigned int a = *src >> 24; |
3889 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
3890 | src += width; |
3891 | a = *src >> 24; |
3892 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
3893 | src += width; |
3894 | a = *src >> 24; |
3895 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
3896 | src += width; |
3897 | a = *src >> 24; |
3898 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
3899 | src -= width * 3 - 1; |
3900 | } |
3901 | #endif |
3902 | if( ++w == width/4 ) |
3903 | { |
3904 | src += width * 3; |
3905 | w = 0; |
3906 | } |
3907 | *dst++ = ProcessRGB( (uint8_t*)buf ); |
3908 | } |
3909 | while( --blocks ); |
3910 | } |
3911 | |
3912 | void CompressEtc2Alpha( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics ) |
3913 | { |
3914 | int w = 0; |
3915 | uint32_t buf[4*4]; |
3916 | do |
3917 | { |
3918 | #ifdef __SSE4_1__ |
3919 | __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); |
3920 | __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); |
3921 | __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); |
3922 | __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); |
3923 | |
3924 | _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); |
3925 | |
3926 | __m128i c0 = _mm_castps_si128( px0 ); |
3927 | __m128i c1 = _mm_castps_si128( px1 ); |
3928 | __m128i c2 = _mm_castps_si128( px2 ); |
3929 | __m128i c3 = _mm_castps_si128( px3 ); |
3930 | |
3931 | __m128i mask = _mm_setr_epi32( 0x03030303, 0x07070707, 0x0b0b0b0b, 0x0f0f0f0f ); |
3932 | __m128i p0 = _mm_shuffle_epi8( c0, mask ); |
3933 | __m128i p1 = _mm_shuffle_epi8( c1, mask ); |
3934 | __m128i p2 = _mm_shuffle_epi8( c2, mask ); |
3935 | __m128i p3 = _mm_shuffle_epi8( c3, mask ); |
3936 | |
3937 | _mm_store_si128( (__m128i*)(buf + 0), p0 ); |
3938 | _mm_store_si128( (__m128i*)(buf + 4), p1 ); |
3939 | _mm_store_si128( (__m128i*)(buf + 8), p2 ); |
3940 | _mm_store_si128( (__m128i*)(buf + 12), p3 ); |
3941 | |
3942 | src += 4; |
3943 | #else |
3944 | auto ptr = buf; |
3945 | for( int x=0; x<4; x++ ) |
3946 | { |
3947 | unsigned int a = *src >> 24; |
3948 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
3949 | src += width; |
3950 | a = *src >> 24; |
3951 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
3952 | src += width; |
3953 | a = *src >> 24; |
3954 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
3955 | src += width; |
3956 | a = *src >> 24; |
3957 | *ptr++ = a | ( a << 8 ) | ( a << 16 ); |
3958 | src -= width * 3 - 1; |
3959 | } |
3960 | #endif |
3961 | if( ++w == width/4 ) |
3962 | { |
3963 | src += width * 3; |
3964 | w = 0; |
3965 | } |
3966 | *dst++ = ProcessRGB_ETC2( (uint8_t*)buf, useHeuristics ); |
3967 | } |
3968 | while( --blocks ); |
3969 | } |
3970 | |
3974 | void CompressEtc1Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ) |
3975 | { |
3976 | int w = 0; |
3977 | uint32_t buf[4*4]; |
3978 | do |
3979 | { |
3980 | #ifdef __SSE4_1__ |
3981 | __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); |
3982 | __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); |
3983 | __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); |
3984 | __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); |
3985 | |
3986 | _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); |
3987 | |
3988 | _mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) ); |
3989 | _mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) ); |
3990 | _mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) ); |
3991 | _mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) ); |
3992 | |
3993 | src += 4; |
3994 | #else |
3995 | auto ptr = buf; |
3996 | for( int x=0; x<4; x++ ) |
3997 | { |
3998 | *ptr++ = *src; |
3999 | src += width; |
4000 | *ptr++ = *src; |
4001 | src += width; |
4002 | *ptr++ = *src; |
4003 | src += width; |
4004 | *ptr++ = *src; |
4005 | src -= width * 3 - 1; |
4006 | } |
4007 | #endif |
4008 | if( ++w == width/4 ) |
4009 | { |
4010 | src += width * 3; |
4011 | w = 0; |
4012 | } |
4013 | *dst++ = ProcessRGB( (uint8_t*)buf ); |
4014 | } |
4015 | while( --blocks ); |
4016 | } |
4017 | |
4018 | void CompressEtc1RgbDither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width ) |
4019 | { |
4020 | int w = 0; |
4021 | uint32_t buf[4*4]; |
4022 | do |
4023 | { |
4024 | #ifdef __SSE4_1__ |
4025 | __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); |
4026 | __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); |
4027 | __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); |
4028 | __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); |
4029 | |
4030 | _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); |
4031 | |
4032 | # ifdef __AVX2__ |
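        // the AVX2 build dithers the transposed registers directly; otherwise
        // store them and run the generic Dither() on the buffer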
4033 | DitherAvx2( (uint8_t*)buf, _mm_castps_si128( px0 ), _mm_castps_si128( px1 ), _mm_castps_si128( px2 ), _mm_castps_si128( px3 ) ); |
4034 | # else |
4035 | _mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) ); |
4036 | _mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) ); |
4037 | _mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) ); |
4038 | _mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) ); |
4039 | |
4040 | Dither( (uint8_t*)buf ); |
4041 | # endif |
4042 | |
4043 | src += 4; |
4044 | #else |
4045 | auto ptr = buf; |
4046 | for( int x=0; x<4; x++ ) |
4047 | { |
4048 | *ptr++ = *src; |
4049 | src += width; |
4050 | *ptr++ = *src; |
4051 | src += width; |
4052 | *ptr++ = *src; |
4053 | src += width; |
4054 | *ptr++ = *src; |
4055 | src -= width * 3 - 1; |
4056 | } |
4057 | #endif |
4058 | if( ++w == width/4 ) |
4059 | { |
4060 | src += width * 3; |
4061 | w = 0; |
4062 | } |
4063 | *dst++ = ProcessRGB( (uint8_t*)buf ); |
4064 | } |
4065 | while( --blocks ); |
4066 | } |
4067 | |
4068 | void CompressEtc2Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics ) |
4069 | { |
4070 | int w = 0; |
4071 | uint32_t buf[4*4]; |
4072 | do |
4073 | { |
4074 | #ifdef __SSE4_1__ |
4075 | __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); |
4076 | __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); |
4077 | __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); |
4078 | __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); |
4079 | |
4080 | _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); |
4081 | |
4082 | _mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) ); |
4083 | _mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) ); |
4084 | _mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) ); |
4085 | _mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) ); |
4086 | |
4087 | src += 4; |
4088 | #else |
4089 | auto ptr = buf; |
4090 | for( int x=0; x<4; x++ ) |
4091 | { |
4092 | *ptr++ = *src; |
4093 | src += width; |
4094 | *ptr++ = *src; |
4095 | src += width; |
4096 | *ptr++ = *src; |
4097 | src += width; |
4098 | *ptr++ = *src; |
4099 | src -= width * 3 - 1; |
4100 | } |
4101 | #endif |
4102 | if( ++w == width/4 ) |
4103 | { |
4104 | src += width * 3; |
4105 | w = 0; |
4106 | } |
4107 | *dst++ = ProcessRGB_ETC2( (uint8_t*)buf, useHeuristics ); |
4108 | } |
4109 | while( --blocks ); |
4110 | } |
4111 | |
4112 | void CompressEtc2Rgba( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics ) |
4113 | { |
4114 | int w = 0; |
4115 | uint32_t rgba[4*4]; |
4116 | uint8_t alpha[4*4]; |
4117 | do |
4118 | { |
4119 | #ifdef __SSE4_1__ |
4120 | __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) ); |
4121 | __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) ); |
4122 | __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) ); |
4123 | __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) ); |
4124 | |
4125 | _MM_TRANSPOSE4_PS( px0, px1, px2, px3 ); |
4126 | |
4127 | __m128i c0 = _mm_castps_si128( px0 ); |
4128 | __m128i c1 = _mm_castps_si128( px1 ); |
4129 | __m128i c2 = _mm_castps_si128( px2 ); |
4130 | __m128i c3 = _mm_castps_si128( px3 ); |
4131 | |
4132 | _mm_store_si128( (__m128i*)(rgba + 0), c0 ); |
4133 | _mm_store_si128( (__m128i*)(rgba + 4), c1 ); |
4134 | _mm_store_si128( (__m128i*)(rgba + 8), c2 ); |
4135 | _mm_store_si128( (__m128i*)(rgba + 12), c3 ); |
4136 | |
4137 | __m128i mask = _mm_setr_epi32( 0x0f0b0703, -1, -1, -1 ); |
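        // each shuffle gathers the four alpha bytes (offsets 3, 7, 11, 15) of
        // one column into a different 32-bit lane (the -1 selectors zero the
        // rest), so OR-ing the four results yields all 16 alpha values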
4138 | |
4139 | __m128i a0 = _mm_shuffle_epi8( c0, mask ); |
4140 | __m128i a1 = _mm_shuffle_epi8( c1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) ); |
4141 | __m128i a2 = _mm_shuffle_epi8( c2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) ); |
4142 | __m128i a3 = _mm_shuffle_epi8( c3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) ); |
4143 | |
4144 | __m128i s0 = _mm_or_si128( a0, a1 ); |
4145 | __m128i s1 = _mm_or_si128( a2, a3 ); |
4146 | __m128i s2 = _mm_or_si128( s0, s1 ); |
4147 | |
4148 | _mm_store_si128( (__m128i*)alpha, s2 ); |
4149 | |
4150 | src += 4; |
4151 | #else |
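        // scalar gather: store each pixel and, separately, its alpha byte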
4152 | auto ptr = rgba; |
4153 | auto ptr8 = alpha; |
4154 | for( int x=0; x<4; x++ ) |
4155 | { |
4156 | auto v = *src; |
4157 | *ptr++ = v; |
4158 | *ptr8++ = v >> 24; |
4159 | src += width; |
4160 | v = *src; |
4161 | *ptr++ = v; |
4162 | *ptr8++ = v >> 24; |
4163 | src += width; |
4164 | v = *src; |
4165 | *ptr++ = v; |
4166 | *ptr8++ = v >> 24; |
4167 | src += width; |
4168 | v = *src; |
4169 | *ptr++ = v; |
4170 | *ptr8++ = v >> 24; |
4171 | src -= width * 3 - 1; |
4172 | } |
4173 | #endif |
4174 | if( ++w == width/4 ) |
4175 | { |
4176 | src += width * 3; |
4177 | w = 0; |
4178 | } |
4179 | *dst++ = ProcessAlpha_ETC2( alpha ); |
4180 | *dst++ = ProcessRGB_ETC2( (uint8_t*)rgba, useHeuristics ); |
4181 | } |
4182 | while( --blocks ); |
4183 | } |
4184 | |