#include "Dither.hpp"
#include "ForceInline.hpp"
#include "ProcessDxtc.hpp"

#include <assert.h>
#include <stdint.h>
#include <string.h>

#ifdef __ARM_NEON
#  include <arm_neon.h>
#endif

#if defined __AVX__ && !defined __SSE4_1__
#  define __SSE4_1__
#endif

#if defined __SSE4_1__ || defined __AVX2__
#  ifdef _MSC_VER
#    include <intrin.h>
#  else
#    include <x86intrin.h>
#    ifndef _mm256_cvtsi256_si32
#      define _mm256_cvtsi256_si32( v ) ( _mm_cvtsi128_si32( _mm256_castsi256_si128( v ) ) )
#    endif
#  endif
#endif


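// DXT1/DXT5 (BC1/BC3) block compression. Each 4x4 pixel block is reduced to
// two RGB565 endpoint colors plus per-pixel palette indices; DXT5 adds an
// 8-byte alpha block. Vectorized paths are provided for SSE4.1, AVX2 and
// NEON, with a scalar fallback.

// Pack 8-bit RGB components into RGB565 (5 bits red, 6 bits green, 5 bits
// blue); to565( 255, 255, 255 ) == 0xFFFF. The second overload takes a pixel
// as loaded from memory, with red in the low byte.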
static etcpak_force_inline uint16_t to565( uint8_t r, uint8_t g, uint8_t b )
{
    return ( ( r & 0xF8 ) << 8 ) | ( ( g & 0xFC ) << 3 ) | ( b >> 3 );
}

static etcpak_force_inline uint16_t to565( uint32_t c )
{
    return
        ( ( c & 0xF80000 ) >> 19 ) |
        ( ( c & 0x00FC00 ) >> 5 ) |
        ( ( c & 0x0000F8 ) << 8 );
}

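// Remaps a byte holding four 2-bit color indices from the internal ramp order
// produced by ProcessRGB (0 = min ... 3 = max) to the DXT1 index convention
// (0 = color0/max, 1 = color1/min, 2 and 3 = interpolants). Per 2-bit field
// the mapping is 0->1, 1->3, 2->2, 3->0; the table applies it to all four
// fields of a byte at once.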
static const uint8_t DxtcIndexTable[256] = {
    85, 87, 86, 84, 93, 95, 94, 92, 89, 91, 90, 88, 81, 83, 82, 80,
    117, 119, 118, 116, 125, 127, 126, 124, 121, 123, 122, 120, 113, 115, 114, 112,
    101, 103, 102, 100, 109, 111, 110, 108, 105, 107, 106, 104, 97, 99, 98, 96,
    69, 71, 70, 68, 77, 79, 78, 76, 73, 75, 74, 72, 65, 67, 66, 64,
    213, 215, 214, 212, 221, 223, 222, 220, 217, 219, 218, 216, 209, 211, 210, 208,
    245, 247, 246, 244, 253, 255, 254, 252, 249, 251, 250, 248, 241, 243, 242, 240,
    229, 231, 230, 228, 237, 239, 238, 236, 233, 235, 234, 232, 225, 227, 226, 224,
    197, 199, 198, 196, 205, 207, 206, 204, 201, 203, 202, 200, 193, 195, 194, 192,
    149, 151, 150, 148, 157, 159, 158, 156, 153, 155, 154, 152, 145, 147, 146, 144,
    181, 183, 182, 180, 189, 191, 190, 188, 185, 187, 186, 184, 177, 179, 178, 176,
    165, 167, 166, 164, 173, 175, 174, 172, 169, 171, 170, 168, 161, 163, 162, 160,
    133, 135, 134, 132, 141, 143, 142, 140, 137, 139, 138, 136, 129, 131, 130, 128,
    21, 23, 22, 20, 29, 31, 30, 28, 25, 27, 26, 24, 17, 19, 18, 16,
    53, 55, 54, 52, 61, 63, 62, 60, 57, 59, 58, 56, 49, 51, 50, 48,
    37, 39, 38, 36, 45, 47, 46, 44, 41, 43, 42, 40, 33, 35, 34, 32,
    5, 7, 6, 4, 13, 15, 14, 12, 9, 11, 10, 8, 1, 3, 2, 0
};

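// Same remapping as AlphaIndexTable further below, but applied to a pair of
// 3-bit alpha indices packed into a 6-bit value, as produced by
// ProcessAlpha_SSE.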
static const uint8_t AlphaIndexTable_SSE[64] = {
    9, 15, 14, 13, 12, 11, 10, 8, 57, 63, 62, 61, 60, 59, 58, 56,
    49, 55, 54, 53, 52, 51, 50, 48, 41, 47, 46, 45, 44, 43, 42, 40,
    33, 39, 38, 37, 36, 35, 34, 32, 25, 31, 30, 29, 28, 27, 26, 24,
    17, 23, 22, 21, 20, 19, 18, 16, 1, 7, 6, 5, 4, 3, 2, 0,
};

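// Fixed-point reciprocals: DivTable[x] == min( 0xFFFF, ( 4 << 16 ) / ( x + 1 ) ).
// Multiplying a value in 0..x by DivTable[x] and taking the high 16 bits
// yields a 2-bit index; e.g. ( 765 * DivTable[765] ) >> 16 == 3.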
static const uint16_t DivTable[255*3+1] = {
    0xffff, 0xffff, 0xffff, 0xffff, 0xcccc, 0xaaaa, 0x9249, 0x8000, 0x71c7, 0x6666, 0x5d17, 0x5555, 0x4ec4, 0x4924, 0x4444, 0x4000,
    0x3c3c, 0x38e3, 0x35e5, 0x3333, 0x30c3, 0x2e8b, 0x2c85, 0x2aaa, 0x28f5, 0x2762, 0x25ed, 0x2492, 0x234f, 0x2222, 0x2108, 0x2000,
    0x1f07, 0x1e1e, 0x1d41, 0x1c71, 0x1bac, 0x1af2, 0x1a41, 0x1999, 0x18f9, 0x1861, 0x17d0, 0x1745, 0x16c1, 0x1642, 0x15c9, 0x1555,
    0x14e5, 0x147a, 0x1414, 0x13b1, 0x1352, 0x12f6, 0x129e, 0x1249, 0x11f7, 0x11a7, 0x115b, 0x1111, 0x10c9, 0x1084, 0x1041, 0x1000,
    0x0fc0, 0x0f83, 0x0f48, 0x0f0f, 0x0ed7, 0x0ea0, 0x0e6c, 0x0e38, 0x0e07, 0x0dd6, 0x0da7, 0x0d79, 0x0d4c, 0x0d20, 0x0cf6, 0x0ccc,
    0x0ca4, 0x0c7c, 0x0c56, 0x0c30, 0x0c0c, 0x0be8, 0x0bc5, 0x0ba2, 0x0b81, 0x0b60, 0x0b40, 0x0b21, 0x0b02, 0x0ae4, 0x0ac7, 0x0aaa,
    0x0a8e, 0x0a72, 0x0a57, 0x0a3d, 0x0a23, 0x0a0a, 0x09f1, 0x09d8, 0x09c0, 0x09a9, 0x0991, 0x097b, 0x0964, 0x094f, 0x0939, 0x0924,
    0x090f, 0x08fb, 0x08e7, 0x08d3, 0x08c0, 0x08ad, 0x089a, 0x0888, 0x0876, 0x0864, 0x0853, 0x0842, 0x0831, 0x0820, 0x0810, 0x0800,
    0x07f0, 0x07e0, 0x07d1, 0x07c1, 0x07b3, 0x07a4, 0x0795, 0x0787, 0x0779, 0x076b, 0x075d, 0x0750, 0x0743, 0x0736, 0x0729, 0x071c,
    0x070f, 0x0703, 0x06f7, 0x06eb, 0x06df, 0x06d3, 0x06c8, 0x06bc, 0x06b1, 0x06a6, 0x069b, 0x0690, 0x0685, 0x067b, 0x0670, 0x0666,
    0x065c, 0x0652, 0x0648, 0x063e, 0x0634, 0x062b, 0x0621, 0x0618, 0x060f, 0x0606, 0x05fd, 0x05f4, 0x05eb, 0x05e2, 0x05d9, 0x05d1,
    0x05c9, 0x05c0, 0x05b8, 0x05b0, 0x05a8, 0x05a0, 0x0598, 0x0590, 0x0588, 0x0581, 0x0579, 0x0572, 0x056b, 0x0563, 0x055c, 0x0555,
    0x054e, 0x0547, 0x0540, 0x0539, 0x0532, 0x052b, 0x0525, 0x051e, 0x0518, 0x0511, 0x050b, 0x0505, 0x04fe, 0x04f8, 0x04f2, 0x04ec,
    0x04e6, 0x04e0, 0x04da, 0x04d4, 0x04ce, 0x04c8, 0x04c3, 0x04bd, 0x04b8, 0x04b2, 0x04ad, 0x04a7, 0x04a2, 0x049c, 0x0497, 0x0492,
    0x048d, 0x0487, 0x0482, 0x047d, 0x0478, 0x0473, 0x046e, 0x0469, 0x0465, 0x0460, 0x045b, 0x0456, 0x0452, 0x044d, 0x0448, 0x0444,
    0x043f, 0x043b, 0x0436, 0x0432, 0x042d, 0x0429, 0x0425, 0x0421, 0x041c, 0x0418, 0x0414, 0x0410, 0x040c, 0x0408, 0x0404, 0x0400,
    0x03fc, 0x03f8, 0x03f4, 0x03f0, 0x03ec, 0x03e8, 0x03e4, 0x03e0, 0x03dd, 0x03d9, 0x03d5, 0x03d2, 0x03ce, 0x03ca, 0x03c7, 0x03c3,
    0x03c0, 0x03bc, 0x03b9, 0x03b5, 0x03b2, 0x03ae, 0x03ab, 0x03a8, 0x03a4, 0x03a1, 0x039e, 0x039b, 0x0397, 0x0394, 0x0391, 0x038e,
    0x038b, 0x0387, 0x0384, 0x0381, 0x037e, 0x037b, 0x0378, 0x0375, 0x0372, 0x036f, 0x036c, 0x0369, 0x0366, 0x0364, 0x0361, 0x035e,
    0x035b, 0x0358, 0x0355, 0x0353, 0x0350, 0x034d, 0x034a, 0x0348, 0x0345, 0x0342, 0x0340, 0x033d, 0x033a, 0x0338, 0x0335, 0x0333,
    0x0330, 0x032e, 0x032b, 0x0329, 0x0326, 0x0324, 0x0321, 0x031f, 0x031c, 0x031a, 0x0317, 0x0315, 0x0313, 0x0310, 0x030e, 0x030c,
    0x0309, 0x0307, 0x0305, 0x0303, 0x0300, 0x02fe, 0x02fc, 0x02fa, 0x02f7, 0x02f5, 0x02f3, 0x02f1, 0x02ef, 0x02ec, 0x02ea, 0x02e8,
    0x02e6, 0x02e4, 0x02e2, 0x02e0, 0x02de, 0x02dc, 0x02da, 0x02d8, 0x02d6, 0x02d4, 0x02d2, 0x02d0, 0x02ce, 0x02cc, 0x02ca, 0x02c8,
    0x02c6, 0x02c4, 0x02c2, 0x02c0, 0x02be, 0x02bc, 0x02bb, 0x02b9, 0x02b7, 0x02b5, 0x02b3, 0x02b1, 0x02b0, 0x02ae, 0x02ac, 0x02aa,
    0x02a8, 0x02a7, 0x02a5, 0x02a3, 0x02a1, 0x02a0, 0x029e, 0x029c, 0x029b, 0x0299, 0x0297, 0x0295, 0x0294, 0x0292, 0x0291, 0x028f,
    0x028d, 0x028c, 0x028a, 0x0288, 0x0287, 0x0285, 0x0284, 0x0282, 0x0280, 0x027f, 0x027d, 0x027c, 0x027a, 0x0279, 0x0277, 0x0276,
    0x0274, 0x0273, 0x0271, 0x0270, 0x026e, 0x026d, 0x026b, 0x026a, 0x0268, 0x0267, 0x0265, 0x0264, 0x0263, 0x0261, 0x0260, 0x025e,
    0x025d, 0x025c, 0x025a, 0x0259, 0x0257, 0x0256, 0x0255, 0x0253, 0x0252, 0x0251, 0x024f, 0x024e, 0x024d, 0x024b, 0x024a, 0x0249,
    0x0247, 0x0246, 0x0245, 0x0243, 0x0242, 0x0241, 0x0240, 0x023e, 0x023d, 0x023c, 0x023b, 0x0239, 0x0238, 0x0237, 0x0236, 0x0234,
    0x0233, 0x0232, 0x0231, 0x0230, 0x022e, 0x022d, 0x022c, 0x022b, 0x022a, 0x0229, 0x0227, 0x0226, 0x0225, 0x0224, 0x0223, 0x0222,
    0x0220, 0x021f, 0x021e, 0x021d, 0x021c, 0x021b, 0x021a, 0x0219, 0x0218, 0x0216, 0x0215, 0x0214, 0x0213, 0x0212, 0x0211, 0x0210,
    0x020f, 0x020e, 0x020d, 0x020c, 0x020b, 0x020a, 0x0209, 0x0208, 0x0207, 0x0206, 0x0205, 0x0204, 0x0203, 0x0202, 0x0201, 0x0200,
    0x01ff, 0x01fe, 0x01fd, 0x01fc, 0x01fb, 0x01fa, 0x01f9, 0x01f8, 0x01f7, 0x01f6, 0x01f5, 0x01f4, 0x01f3, 0x01f2, 0x01f1, 0x01f0,
    0x01ef, 0x01ee, 0x01ed, 0x01ec, 0x01eb, 0x01ea, 0x01e9, 0x01e9, 0x01e8, 0x01e7, 0x01e6, 0x01e5, 0x01e4, 0x01e3, 0x01e2, 0x01e1,
    0x01e0, 0x01e0, 0x01df, 0x01de, 0x01dd, 0x01dc, 0x01db, 0x01da, 0x01da, 0x01d9, 0x01d8, 0x01d7, 0x01d6, 0x01d5, 0x01d4, 0x01d4,
    0x01d3, 0x01d2, 0x01d1, 0x01d0, 0x01cf, 0x01cf, 0x01ce, 0x01cd, 0x01cc, 0x01cb, 0x01cb, 0x01ca, 0x01c9, 0x01c8, 0x01c7, 0x01c7,
    0x01c6, 0x01c5, 0x01c4, 0x01c3, 0x01c3, 0x01c2, 0x01c1, 0x01c0, 0x01c0, 0x01bf, 0x01be, 0x01bd, 0x01bd, 0x01bc, 0x01bb, 0x01ba,
    0x01ba, 0x01b9, 0x01b8, 0x01b7, 0x01b7, 0x01b6, 0x01b5, 0x01b4, 0x01b4, 0x01b3, 0x01b2, 0x01b2, 0x01b1, 0x01b0, 0x01af, 0x01af,
    0x01ae, 0x01ad, 0x01ad, 0x01ac, 0x01ab, 0x01aa, 0x01aa, 0x01a9, 0x01a8, 0x01a8, 0x01a7, 0x01a6, 0x01a6, 0x01a5, 0x01a4, 0x01a4,
    0x01a3, 0x01a2, 0x01a2, 0x01a1, 0x01a0, 0x01a0, 0x019f, 0x019e, 0x019e, 0x019d, 0x019c, 0x019c, 0x019b, 0x019a, 0x019a, 0x0199,
    0x0198, 0x0198, 0x0197, 0x0197, 0x0196, 0x0195, 0x0195, 0x0194, 0x0193, 0x0193, 0x0192, 0x0192, 0x0191, 0x0190, 0x0190, 0x018f,
    0x018f, 0x018e, 0x018d, 0x018d, 0x018c, 0x018b, 0x018b, 0x018a, 0x018a, 0x0189, 0x0189, 0x0188, 0x0187, 0x0187, 0x0186, 0x0186,
    0x0185, 0x0184, 0x0184, 0x0183, 0x0183, 0x0182, 0x0182, 0x0181, 0x0180, 0x0180, 0x017f, 0x017f, 0x017e, 0x017e, 0x017d, 0x017d,
    0x017c, 0x017b, 0x017b, 0x017a, 0x017a, 0x0179, 0x0179, 0x0178, 0x0178, 0x0177, 0x0177, 0x0176, 0x0175, 0x0175, 0x0174, 0x0174,
    0x0173, 0x0173, 0x0172, 0x0172, 0x0171, 0x0171, 0x0170, 0x0170, 0x016f, 0x016f, 0x016e, 0x016e, 0x016d, 0x016d, 0x016c, 0x016c,
    0x016b, 0x016b, 0x016a, 0x016a, 0x0169, 0x0169, 0x0168, 0x0168, 0x0167, 0x0167, 0x0166, 0x0166, 0x0165, 0x0165, 0x0164, 0x0164,
    0x0163, 0x0163, 0x0162, 0x0162, 0x0161, 0x0161, 0x0160, 0x0160, 0x015f, 0x015f, 0x015e, 0x015e, 0x015d, 0x015d, 0x015d, 0x015c,
    0x015c, 0x015b, 0x015b, 0x015a, 0x015a, 0x0159, 0x0159, 0x0158, 0x0158, 0x0158, 0x0157, 0x0157, 0x0156, 0x0156
};
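// NEON variant of DivTable with halved entries, compensating for
// vqdmulhq_s16 doubling the product before it takes the high half. Entries
// for ranges below 17 are zero, so every pixel of a near-solid block maps to
// the same index.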
static const uint16_t DivTableNEON[255*3+1] = {
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x1c71, 0x1af2, 0x1999, 0x1861, 0x1745, 0x1642, 0x1555, 0x147a, 0x13b1, 0x12f6, 0x1249, 0x11a7, 0x1111, 0x1084, 0x1000,
    0x0f83, 0x0f0f, 0x0ea0, 0x0e38, 0x0dd6, 0x0d79, 0x0d20, 0x0ccc, 0x0c7c, 0x0c30, 0x0be8, 0x0ba2, 0x0b60, 0x0b21, 0x0ae4, 0x0aaa,
    0x0a72, 0x0a3d, 0x0a0a, 0x09d8, 0x09a9, 0x097b, 0x094f, 0x0924, 0x08fb, 0x08d3, 0x08ad, 0x0888, 0x0864, 0x0842, 0x0820, 0x0800,
    0x07e0, 0x07c1, 0x07a4, 0x0787, 0x076b, 0x0750, 0x0736, 0x071c, 0x0703, 0x06eb, 0x06d3, 0x06bc, 0x06a6, 0x0690, 0x067b, 0x0666,
    0x0652, 0x063e, 0x062b, 0x0618, 0x0606, 0x05f4, 0x05e2, 0x05d1, 0x05c0, 0x05b0, 0x05a0, 0x0590, 0x0581, 0x0572, 0x0563, 0x0555,
    0x0547, 0x0539, 0x052b, 0x051e, 0x0511, 0x0505, 0x04f8, 0x04ec, 0x04e0, 0x04d4, 0x04c8, 0x04bd, 0x04b2, 0x04a7, 0x049c, 0x0492,
    0x0487, 0x047d, 0x0473, 0x0469, 0x0460, 0x0456, 0x044d, 0x0444, 0x043b, 0x0432, 0x0429, 0x0421, 0x0418, 0x0410, 0x0408, 0x0400,
    0x03f8, 0x03f0, 0x03e8, 0x03e0, 0x03d9, 0x03d2, 0x03ca, 0x03c3, 0x03bc, 0x03b5, 0x03ae, 0x03a8, 0x03a1, 0x039b, 0x0394, 0x038e,
    0x0387, 0x0381, 0x037b, 0x0375, 0x036f, 0x0369, 0x0364, 0x035e, 0x0358, 0x0353, 0x034d, 0x0348, 0x0342, 0x033d, 0x0338, 0x0333,
    0x032e, 0x0329, 0x0324, 0x031f, 0x031a, 0x0315, 0x0310, 0x030c, 0x0307, 0x0303, 0x02fe, 0x02fa, 0x02f5, 0x02f1, 0x02ec, 0x02e8,
    0x02e4, 0x02e0, 0x02dc, 0x02d8, 0x02d4, 0x02d0, 0x02cc, 0x02c8, 0x02c4, 0x02c0, 0x02bc, 0x02b9, 0x02b5, 0x02b1, 0x02ae, 0x02aa,
    0x02a7, 0x02a3, 0x02a0, 0x029c, 0x0299, 0x0295, 0x0292, 0x028f, 0x028c, 0x0288, 0x0285, 0x0282, 0x027f, 0x027c, 0x0279, 0x0276,
    0x0273, 0x0270, 0x026d, 0x026a, 0x0267, 0x0264, 0x0261, 0x025e, 0x025c, 0x0259, 0x0256, 0x0253, 0x0251, 0x024e, 0x024b, 0x0249,
    0x0246, 0x0243, 0x0241, 0x023e, 0x023c, 0x0239, 0x0237, 0x0234, 0x0232, 0x0230, 0x022d, 0x022b, 0x0229, 0x0226, 0x0224, 0x0222,
    0x021f, 0x021d, 0x021b, 0x0219, 0x0216, 0x0214, 0x0212, 0x0210, 0x020e, 0x020c, 0x020a, 0x0208, 0x0206, 0x0204, 0x0202, 0x0200,
    0x01fe, 0x01fc, 0x01fa, 0x01f8, 0x01f6, 0x01f4, 0x01f2, 0x01f0, 0x01ee, 0x01ec, 0x01ea, 0x01e9, 0x01e7, 0x01e5, 0x01e3, 0x01e1,
    0x01e0, 0x01de, 0x01dc, 0x01da, 0x01d9, 0x01d7, 0x01d5, 0x01d4, 0x01d2, 0x01d0, 0x01cf, 0x01cd, 0x01cb, 0x01ca, 0x01c8, 0x01c7,
    0x01c5, 0x01c3, 0x01c2, 0x01c0, 0x01bf, 0x01bd, 0x01bc, 0x01ba, 0x01b9, 0x01b7, 0x01b6, 0x01b4, 0x01b3, 0x01b2, 0x01b0, 0x01af,
    0x01ad, 0x01ac, 0x01aa, 0x01a9, 0x01a8, 0x01a6, 0x01a5, 0x01a4, 0x01a2, 0x01a1, 0x01a0, 0x019e, 0x019d, 0x019c, 0x019a, 0x0199,
    0x0198, 0x0197, 0x0195, 0x0194, 0x0193, 0x0192, 0x0190, 0x018f, 0x018e, 0x018d, 0x018b, 0x018a, 0x0189, 0x0188, 0x0187, 0x0186,
    0x0184, 0x0183, 0x0182, 0x0181, 0x0180, 0x017f, 0x017e, 0x017d, 0x017b, 0x017a, 0x0179, 0x0178, 0x0177, 0x0176, 0x0175, 0x0174,
    0x0173, 0x0172, 0x0171, 0x0170, 0x016f, 0x016e, 0x016d, 0x016c, 0x016b, 0x016a, 0x0169, 0x0168, 0x0167, 0x0166, 0x0165, 0x0164,
    0x0163, 0x0162, 0x0161, 0x0160, 0x015f, 0x015e, 0x015d, 0x015c, 0x015b, 0x015a, 0x0159, 0x0158, 0x0158, 0x0157, 0x0156, 0x0155,
    0x0154, 0x0153, 0x0152, 0x0151, 0x0150, 0x0150, 0x014f, 0x014e, 0x014d, 0x014c, 0x014b, 0x014a, 0x014a, 0x0149, 0x0148, 0x0147,
    0x0146, 0x0146, 0x0145, 0x0144, 0x0143, 0x0142, 0x0142, 0x0141, 0x0140, 0x013f, 0x013e, 0x013e, 0x013d, 0x013c, 0x013b, 0x013b,
    0x013a, 0x0139, 0x0138, 0x0138, 0x0137, 0x0136, 0x0135, 0x0135, 0x0134, 0x0133, 0x0132, 0x0132, 0x0131, 0x0130, 0x0130, 0x012f,
    0x012e, 0x012e, 0x012d, 0x012c, 0x012b, 0x012b, 0x012a, 0x0129, 0x0129, 0x0128, 0x0127, 0x0127, 0x0126, 0x0125, 0x0125, 0x0124,
    0x0123, 0x0123, 0x0122, 0x0121, 0x0121, 0x0120, 0x0120, 0x011f, 0x011e, 0x011e, 0x011d, 0x011c, 0x011c, 0x011b, 0x011b, 0x011a,
    0x0119, 0x0119, 0x0118, 0x0118, 0x0117, 0x0116, 0x0116, 0x0115, 0x0115, 0x0114, 0x0113, 0x0113, 0x0112, 0x0112, 0x0111, 0x0111,
    0x0110, 0x010f, 0x010f, 0x010e, 0x010e, 0x010d, 0x010d, 0x010c, 0x010c, 0x010b, 0x010a, 0x010a, 0x0109, 0x0109, 0x0108, 0x0108,
    0x0107, 0x0107, 0x0106, 0x0106, 0x0105, 0x0105, 0x0104, 0x0104, 0x0103, 0x0103, 0x0102, 0x0102, 0x0101, 0x0101, 0x0100, 0x0100,
    0x00ff, 0x00ff, 0x00fe, 0x00fe, 0x00fd, 0x00fd, 0x00fc, 0x00fc, 0x00fb, 0x00fb, 0x00fa, 0x00fa, 0x00f9, 0x00f9, 0x00f8, 0x00f8,
    0x00f7, 0x00f7, 0x00f6, 0x00f6, 0x00f5, 0x00f5, 0x00f4, 0x00f4, 0x00f4, 0x00f3, 0x00f3, 0x00f2, 0x00f2, 0x00f1, 0x00f1, 0x00f0,
    0x00f0, 0x00f0, 0x00ef, 0x00ef, 0x00ee, 0x00ee, 0x00ed, 0x00ed, 0x00ed, 0x00ec, 0x00ec, 0x00eb, 0x00eb, 0x00ea, 0x00ea, 0x00ea,
    0x00e9, 0x00e9, 0x00e8, 0x00e8, 0x00e7, 0x00e7, 0x00e7, 0x00e6, 0x00e6, 0x00e5, 0x00e5, 0x00e5, 0x00e4, 0x00e4, 0x00e3, 0x00e3,
    0x00e3, 0x00e2, 0x00e2, 0x00e1, 0x00e1, 0x00e1, 0x00e0, 0x00e0, 0x00e0, 0x00df, 0x00df, 0x00de, 0x00de, 0x00de, 0x00dd, 0x00dd,
    0x00dd, 0x00dc, 0x00dc, 0x00db, 0x00db, 0x00db, 0x00da, 0x00da, 0x00da, 0x00d9, 0x00d9, 0x00d9, 0x00d8, 0x00d8, 0x00d7, 0x00d7,
    0x00d7, 0x00d6, 0x00d6, 0x00d6, 0x00d5, 0x00d5, 0x00d5, 0x00d4, 0x00d4, 0x00d4, 0x00d3, 0x00d3, 0x00d3, 0x00d2, 0x00d2, 0x00d2,
    0x00d1, 0x00d1, 0x00d1, 0x00d0, 0x00d0, 0x00d0, 0x00cf, 0x00cf, 0x00cf, 0x00ce, 0x00ce, 0x00ce, 0x00cd, 0x00cd, 0x00cd, 0x00cc,
    0x00cc, 0x00cc, 0x00cb, 0x00cb, 0x00cb, 0x00ca, 0x00ca, 0x00ca, 0x00c9, 0x00c9, 0x00c9, 0x00c9, 0x00c8, 0x00c8, 0x00c8, 0x00c7,
    0x00c7, 0x00c7, 0x00c6, 0x00c6, 0x00c6, 0x00c5, 0x00c5, 0x00c5, 0x00c5, 0x00c4, 0x00c4, 0x00c4, 0x00c3, 0x00c3, 0x00c3, 0x00c3,
    0x00c2, 0x00c2, 0x00c2, 0x00c1, 0x00c1, 0x00c1, 0x00c1, 0x00c0, 0x00c0, 0x00c0, 0x00bf, 0x00bf, 0x00bf, 0x00bf, 0x00be, 0x00be,
    0x00be, 0x00bd, 0x00bd, 0x00bd, 0x00bd, 0x00bc, 0x00bc, 0x00bc, 0x00bc, 0x00bb, 0x00bb, 0x00bb, 0x00ba, 0x00ba, 0x00ba, 0x00ba,
    0x00b9, 0x00b9, 0x00b9, 0x00b9, 0x00b8, 0x00b8, 0x00b8, 0x00b8, 0x00b7, 0x00b7, 0x00b7, 0x00b7, 0x00b6, 0x00b6, 0x00b6, 0x00b6,
    0x00b5, 0x00b5, 0x00b5, 0x00b5, 0x00b4, 0x00b4, 0x00b4, 0x00b4, 0x00b3, 0x00b3, 0x00b3, 0x00b3, 0x00b2, 0x00b2, 0x00b2, 0x00b2,
    0x00b1, 0x00b1, 0x00b1, 0x00b1, 0x00b0, 0x00b0, 0x00b0, 0x00b0, 0x00af, 0x00af, 0x00af, 0x00af, 0x00ae, 0x00ae, 0x00ae, 0x00ae,
    0x00ae, 0x00ad, 0x00ad, 0x00ad, 0x00ad, 0x00ac, 0x00ac, 0x00ac, 0x00ac, 0x00ac, 0x00ab, 0x00ab, 0x00ab, 0x00ab,
};

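// Reciprocals for the alpha encoders: DivTableAlpha[x] == min( 0xFFFF, ( 8 << 16 ) / ( x + 1 ) ),
// turning a value in 0..x into a 3-bit index via a 16-bit multiply-high.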
static const uint16_t DivTableAlpha[256] = {
    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xe38e, 0xcccc, 0xba2e, 0xaaaa, 0x9d89, 0x9249, 0x8888, 0x8000,
    0x7878, 0x71c7, 0x6bca, 0x6666, 0x6186, 0x5d17, 0x590b, 0x5555, 0x51eb, 0x4ec4, 0x4bda, 0x4924, 0x469e, 0x4444, 0x4210, 0x4000,
    0x3e0f, 0x3c3c, 0x3a83, 0x38e3, 0x3759, 0x35e5, 0x3483, 0x3333, 0x31f3, 0x30c3, 0x2fa0, 0x2e8b, 0x2d82, 0x2c85, 0x2b93, 0x2aaa,
    0x29cb, 0x28f5, 0x2828, 0x2762, 0x26a4, 0x25ed, 0x253c, 0x2492, 0x23ee, 0x234f, 0x22b6, 0x2222, 0x2192, 0x2108, 0x2082, 0x2000,
    0x1f81, 0x1f07, 0x1e91, 0x1e1e, 0x1dae, 0x1d41, 0x1cd8, 0x1c71, 0x1c0e, 0x1bac, 0x1b4e, 0x1af2, 0x1a98, 0x1a41, 0x19ec, 0x1999,
    0x1948, 0x18f9, 0x18ac, 0x1861, 0x1818, 0x17d0, 0x178a, 0x1745, 0x1702, 0x16c1, 0x1681, 0x1642, 0x1605, 0x15c9, 0x158e, 0x1555,
    0x151d, 0x14e5, 0x14af, 0x147a, 0x1446, 0x1414, 0x13e2, 0x13b1, 0x1381, 0x1352, 0x1323, 0x12f6, 0x12c9, 0x129e, 0x1273, 0x1249,
    0x121f, 0x11f7, 0x11cf, 0x11a7, 0x1181, 0x115b, 0x1135, 0x1111, 0x10ec, 0x10c9, 0x10a6, 0x1084, 0x1062, 0x1041, 0x1020, 0x1000,
    0x0fe0, 0x0fc0, 0x0fa2, 0x0f83, 0x0f66, 0x0f48, 0x0f2b, 0x0f0f, 0x0ef2, 0x0ed7, 0x0ebb, 0x0ea0, 0x0e86, 0x0e6c, 0x0e52, 0x0e38,
    0x0e1f, 0x0e07, 0x0dee, 0x0dd6, 0x0dbe, 0x0da7, 0x0d90, 0x0d79, 0x0d62, 0x0d4c, 0x0d36, 0x0d20, 0x0d0b, 0x0cf6, 0x0ce1, 0x0ccc,
    0x0cb8, 0x0ca4, 0x0c90, 0x0c7c, 0x0c69, 0x0c56, 0x0c43, 0x0c30, 0x0c1e, 0x0c0c, 0x0bfa, 0x0be8, 0x0bd6, 0x0bc5, 0x0bb3, 0x0ba2,
    0x0b92, 0x0b81, 0x0b70, 0x0b60, 0x0b50, 0x0b40, 0x0b30, 0x0b21, 0x0b11, 0x0b02, 0x0af3, 0x0ae4, 0x0ad6, 0x0ac7, 0x0ab8, 0x0aaa,
    0x0a9c, 0x0a8e, 0x0a80, 0x0a72, 0x0a65, 0x0a57, 0x0a4a, 0x0a3d, 0x0a30, 0x0a23, 0x0a16, 0x0a0a, 0x09fd, 0x09f1, 0x09e4, 0x09d8,
    0x09cc, 0x09c0, 0x09b4, 0x09a9, 0x099d, 0x0991, 0x0986, 0x097b, 0x0970, 0x0964, 0x095a, 0x094f, 0x0944, 0x0939, 0x092f, 0x0924,
    0x091a, 0x090f, 0x0905, 0x08fb, 0x08f1, 0x08e7, 0x08dd, 0x08d3, 0x08ca, 0x08c0, 0x08b7, 0x08ad, 0x08a4, 0x089a, 0x0891, 0x0888,
    0x087f, 0x0876, 0x086d, 0x0864, 0x085b, 0x0853, 0x084a, 0x0842, 0x0839, 0x0831, 0x0828, 0x0820, 0x0818, 0x0810, 0x0808, 0x0800,
};

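// Compress one 4x4 RGBA block to a 64-bit DXT1 block. If all 16 pixels
// quantize to the same RGB565 color, a solid block is emitted early.
// Otherwise the color bounding box is computed, inset by 1/16th of its
// extent, and each pixel's R+G+B sum is scaled by the reciprocal of the box
// diagonal to get a 2-bit index. The result is max565 | min565 << 16 |
// indices << 32, with the indices still in internal ramp order; callers
// remap them through DxtcIndexTable.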
static etcpak_force_inline uint64_t ProcessRGB( const uint8_t* src )
{
#ifdef __SSE4_1__
    __m128i px0 = _mm_loadu_si128(((__m128i*)src) + 0);
    __m128i px1 = _mm_loadu_si128(((__m128i*)src) + 1);
    __m128i px2 = _mm_loadu_si128(((__m128i*)src) + 2);
    __m128i px3 = _mm_loadu_si128(((__m128i*)src) + 3);

    __m128i smask = _mm_set1_epi32( 0xF8FCF8 );
    __m128i sd0 = _mm_and_si128( px0, smask );
    __m128i sd1 = _mm_and_si128( px1, smask );
    __m128i sd2 = _mm_and_si128( px2, smask );
    __m128i sd3 = _mm_and_si128( px3, smask );

    __m128i sc = _mm_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0));

    __m128i sc0 = _mm_cmpeq_epi8(sd0, sc);
    __m128i sc1 = _mm_cmpeq_epi8(sd1, sc);
    __m128i sc2 = _mm_cmpeq_epi8(sd2, sc);
    __m128i sc3 = _mm_cmpeq_epi8(sd3, sc);

    __m128i sm0 = _mm_and_si128(sc0, sc1);
    __m128i sm1 = _mm_and_si128(sc2, sc3);
    __m128i sm = _mm_and_si128(sm0, sm1);

    if( _mm_testc_si128(sm, _mm_set1_epi32(-1)) )
    {
        uint32_t c;
        memcpy( &c, src, 4 );
        return uint64_t( to565( c ) ) << 16;
    }

    __m128i min0 = _mm_min_epu8( px0, px1 );
    __m128i min1 = _mm_min_epu8( px2, px3 );
    __m128i min2 = _mm_min_epu8( min0, min1 );

    __m128i max0 = _mm_max_epu8( px0, px1 );
    __m128i max1 = _mm_max_epu8( px2, px3 );
    __m128i max2 = _mm_max_epu8( max0, max1 );

    __m128i min3 = _mm_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m128i max3 = _mm_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m128i min4 = _mm_min_epu8( min2, min3 );
    __m128i max4 = _mm_max_epu8( max2, max3 );

    __m128i min5 = _mm_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m128i max5 = _mm_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m128i rmin = _mm_min_epu8( min4, min5 );
    __m128i rmax = _mm_max_epu8( max4, max5 );

    __m128i range1 = _mm_subs_epu8( rmax, rmin );
    __m128i range2 = _mm_sad_epu8( rmax, rmin );

    uint32_t vrange = _mm_cvtsi128_si32( range2 ) >> 1;
    __m128i range = _mm_set1_epi16( DivTable[vrange] );

    __m128i inset1 = _mm_srli_epi16( range1, 4 );
    __m128i inset = _mm_and_si128( inset1, _mm_set1_epi8( 0xF ) );
    __m128i min = _mm_adds_epu8( rmin, inset );
    __m128i max = _mm_subs_epu8( rmax, inset );

    __m128i c0 = _mm_subs_epu8( px0, rmin );
    __m128i c1 = _mm_subs_epu8( px1, rmin );
    __m128i c2 = _mm_subs_epu8( px2, rmin );
    __m128i c3 = _mm_subs_epu8( px3, rmin );

    __m128i is0 = _mm_maddubs_epi16( c0, _mm_set1_epi8( 1 ) );
    __m128i is1 = _mm_maddubs_epi16( c1, _mm_set1_epi8( 1 ) );
    __m128i is2 = _mm_maddubs_epi16( c2, _mm_set1_epi8( 1 ) );
    __m128i is3 = _mm_maddubs_epi16( c3, _mm_set1_epi8( 1 ) );

    __m128i s0 = _mm_hadd_epi16( is0, is1 );
    __m128i s1 = _mm_hadd_epi16( is2, is3 );

    __m128i m0 = _mm_mulhi_epu16( s0, range );
    __m128i m1 = _mm_mulhi_epu16( s1, range );

    __m128i p0 = _mm_packus_epi16( m0, m1 );

    __m128i p1 = _mm_or_si128( _mm_srai_epi32( p0, 6 ), _mm_srai_epi32( p0, 12 ) );
    __m128i p2 = _mm_or_si128( _mm_srai_epi32( p0, 18 ), p0 );
    __m128i p3 = _mm_or_si128( p1, p2 );
    __m128i p = _mm_shuffle_epi8( p3, _mm_set1_epi32( 0x0C080400 ) );

    uint32_t vmin = _mm_cvtsi128_si32( min );
    uint32_t vmax = _mm_cvtsi128_si32( max );
    uint32_t vp = _mm_cvtsi128_si32( p );

    return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) );
#elif defined __ARM_NEON
#  ifdef __aarch64__
    uint8x16x4_t px = vld4q_u8( src );

    uint8x16_t lr = px.val[0];
    uint8x16_t lg = px.val[1];
    uint8x16_t lb = px.val[2];

    uint8_t rmaxr = vmaxvq_u8( lr );
    uint8_t rmaxg = vmaxvq_u8( lg );
    uint8_t rmaxb = vmaxvq_u8( lb );

    uint8_t rminr = vminvq_u8( lr );
    uint8_t rming = vminvq_u8( lg );
    uint8_t rminb = vminvq_u8( lb );

    int rr = rmaxr - rminr;
    int rg = rmaxg - rming;
    int rb = rmaxb - rminb;

    int vrange1 = rr + rg + rb;
    uint16_t vrange2 = DivTableNEON[vrange1];

    uint8_t insetr = rr >> 4;
    uint8_t insetg = rg >> 4;
    uint8_t insetb = rb >> 4;

    uint8_t minr = rminr + insetr;
    uint8_t ming = rming + insetg;
    uint8_t minb = rminb + insetb;

    uint8_t maxr = rmaxr - insetr;
    uint8_t maxg = rmaxg - insetg;
    uint8_t maxb = rmaxb - insetb;

    uint8x16_t cr = vsubq_u8( lr, vdupq_n_u8( rminr ) );
    uint8x16_t cg = vsubq_u8( lg, vdupq_n_u8( rming ) );
    uint8x16_t cb = vsubq_u8( lb, vdupq_n_u8( rminb ) );

    uint16x8_t is0l = vaddl_u8( vget_low_u8( cr ), vget_low_u8( cg ) );
    uint16x8_t is0h = vaddl_u8( vget_high_u8( cr ), vget_high_u8( cg ) );
    uint16x8_t is1l = vaddw_u8( is0l, vget_low_u8( cb ) );
    uint16x8_t is1h = vaddw_u8( is0h, vget_high_u8( cb ) );

    int16x8_t range = vdupq_n_s16( vrange2 );
    uint16x8_t m0 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( is1l ), range ) );
    uint16x8_t m1 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( is1h ), range ) );

    uint8x8_t p00 = vmovn_u16( m0 );
    uint8x8_t p01 = vmovn_u16( m1 );
    uint8x16_t p0 = vcombine_u8( p00, p01 );

    uint32x4_t p1 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 6 ), vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 12 ) );
    uint32x4_t p2 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 18 ), vreinterpretq_u32_u8( p0 ) );
    uint32x4_t p3 = vaddq_u32( p1, p2 );

    uint16x4x2_t p4 = vuzp_u16( vget_low_u16( vreinterpretq_u16_u32( p3 ) ), vget_high_u16( vreinterpretq_u16_u32( p3 ) ) );
    uint8x8x2_t p = vuzp_u8( vreinterpret_u8_u16( p4.val[0] ), vreinterpret_u8_u16( p4.val[0] ) );

    uint32_t vp;
    vst1_lane_u32( &vp, vreinterpret_u32_u8( p.val[0] ), 0 );

    return uint64_t( ( uint64_t( to565( minr, ming, minb ) ) << 16 ) | to565( maxr, maxg, maxb ) | ( uint64_t( vp ) << 32 ) );
#  else
    uint32x4_t px0 = vld1q_u32( (uint32_t*)src );
    uint32x4_t px1 = vld1q_u32( (uint32_t*)src + 4 );
    uint32x4_t px2 = vld1q_u32( (uint32_t*)src + 8 );
    uint32x4_t px3 = vld1q_u32( (uint32_t*)src + 12 );

    uint32x4_t smask = vdupq_n_u32( 0xF8FCF8 );
    uint32x4_t sd0 = vandq_u32( smask, px0 );
    uint32x4_t sd1 = vandq_u32( smask, px1 );
    uint32x4_t sd2 = vandq_u32( smask, px2 );
    uint32x4_t sd3 = vandq_u32( smask, px3 );

    uint32x4_t sc = vdupq_n_u32( sd0[0] );

    uint32x4_t sc0 = vceqq_u32( sd0, sc );
    uint32x4_t sc1 = vceqq_u32( sd1, sc );
    uint32x4_t sc2 = vceqq_u32( sd2, sc );
    uint32x4_t sc3 = vceqq_u32( sd3, sc );

    uint32x4_t sm0 = vandq_u32( sc0, sc1 );
    uint32x4_t sm1 = vandq_u32( sc2, sc3 );
    int64x2_t sm = vreinterpretq_s64_u32( vandq_u32( sm0, sm1 ) );

    if( sm[0] == -1 && sm[1] == -1 )
    {
        return uint64_t( to565( src[0], src[1], src[2] ) ) << 16;
    }

    uint32x4_t mask = vdupq_n_u32( 0xFFFFFF );
    uint8x16_t l0 = vreinterpretq_u8_u32( vandq_u32( mask, px0 ) );
    uint8x16_t l1 = vreinterpretq_u8_u32( vandq_u32( mask, px1 ) );
    uint8x16_t l2 = vreinterpretq_u8_u32( vandq_u32( mask, px2 ) );
    uint8x16_t l3 = vreinterpretq_u8_u32( vandq_u32( mask, px3 ) );

    uint8x16_t min0 = vminq_u8( l0, l1 );
    uint8x16_t min1 = vminq_u8( l2, l3 );
    uint8x16_t min2 = vminq_u8( min0, min1 );

    uint8x16_t max0 = vmaxq_u8( l0, l1 );
    uint8x16_t max1 = vmaxq_u8( l2, l3 );
    uint8x16_t max2 = vmaxq_u8( max0, max1 );

    uint8x16_t min3 = vreinterpretq_u8_u32( vrev64q_u32( vreinterpretq_u32_u8( min2 ) ) );
    uint8x16_t max3 = vreinterpretq_u8_u32( vrev64q_u32( vreinterpretq_u32_u8( max2 ) ) );

    uint8x16_t min4 = vminq_u8( min2, min3 );
    uint8x16_t max4 = vmaxq_u8( max2, max3 );

    uint8x16_t min5 = vcombine_u8( vget_high_u8( min4 ), vget_low_u8( min4 ) );
    uint8x16_t max5 = vcombine_u8( vget_high_u8( max4 ), vget_low_u8( max4 ) );

    uint8x16_t rmin = vminq_u8( min4, min5 );
    uint8x16_t rmax = vmaxq_u8( max4, max5 );

    uint8x16_t range1 = vsubq_u8( rmax, rmin );
    uint8x8_t range2 = vget_low_u8( range1 );
    uint8x8x2_t range3 = vzip_u8( range2, vdup_n_u8( 0 ) );
    uint16x4_t range4 = vreinterpret_u16_u8( range3.val[0] );

    uint16_t vrange1;
    uint16x4_t range5 = vpadd_u16( range4, range4 );
    uint16x4_t range6 = vpadd_u16( range5, range5 );
    vst1_lane_u16( &vrange1, range6, 0 );

    uint32_t vrange2 = ( 2 << 16 ) / uint32_t( vrange1 + 1 );
    uint16x8_t range = vdupq_n_u16( vrange2 );

    uint8x16_t inset = vshrq_n_u8( range1, 4 );
    uint8x16_t min = vaddq_u8( rmin, inset );
    uint8x16_t max = vsubq_u8( rmax, inset );

    uint8x16_t c0 = vsubq_u8( l0, rmin );
    uint8x16_t c1 = vsubq_u8( l1, rmin );
    uint8x16_t c2 = vsubq_u8( l2, rmin );
    uint8x16_t c3 = vsubq_u8( l3, rmin );

    uint16x8_t is0 = vpaddlq_u8( c0 );
    uint16x8_t is1 = vpaddlq_u8( c1 );
    uint16x8_t is2 = vpaddlq_u8( c2 );
    uint16x8_t is3 = vpaddlq_u8( c3 );

    uint16x4_t is4 = vpadd_u16( vget_low_u16( is0 ), vget_high_u16( is0 ) );
    uint16x4_t is5 = vpadd_u16( vget_low_u16( is1 ), vget_high_u16( is1 ) );
    uint16x4_t is6 = vpadd_u16( vget_low_u16( is2 ), vget_high_u16( is2 ) );
    uint16x4_t is7 = vpadd_u16( vget_low_u16( is3 ), vget_high_u16( is3 ) );

    uint16x8_t s0 = vcombine_u16( is4, is5 );
    uint16x8_t s1 = vcombine_u16( is6, is7 );

    uint16x8_t m0 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( s0 ), vreinterpretq_s16_u16( range ) ) );
    uint16x8_t m1 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( s1 ), vreinterpretq_s16_u16( range ) ) );

    uint8x8_t p00 = vmovn_u16( m0 );
    uint8x8_t p01 = vmovn_u16( m1 );
    uint8x16_t p0 = vcombine_u8( p00, p01 );

    uint32x4_t p1 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 6 ), vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 12 ) );
    uint32x4_t p2 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 18 ), vreinterpretq_u32_u8( p0 ) );
    uint32x4_t p3 = vaddq_u32( p1, p2 );

    uint16x4x2_t p4 = vuzp_u16( vget_low_u16( vreinterpretq_u16_u32( p3 ) ), vget_high_u16( vreinterpretq_u16_u32( p3 ) ) );
    uint8x8x2_t p = vuzp_u8( vreinterpret_u8_u16( p4.val[0] ), vreinterpret_u8_u16( p4.val[0] ) );

    uint32_t vmin, vmax, vp;
    vst1q_lane_u32( &vmin, vreinterpretq_u32_u8( min ), 0 );
    vst1q_lane_u32( &vmax, vreinterpretq_u32_u8( max ), 0 );
    vst1_lane_u32( &vp, vreinterpret_u32_u8( p.val[0] ), 0 );

    return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) );
#  endif
#else
    uint32_t ref;
    memcpy( &ref, src, 4 );
    uint32_t refMask = ref & 0xF8FCF8;
    auto stmp = src + 4;
    for( int i=1; i<16; i++ )
    {
        uint32_t px;
        memcpy( &px, stmp, 4 );
        if( ( px & 0xF8FCF8 ) != refMask ) break;
        stmp += 4;
    }
    if( stmp == src + 64 )
    {
        return uint64_t( to565( ref ) ) << 16;
    }

    uint8_t min[3] = { src[0], src[1], src[2] };
    uint8_t max[3] = { src[0], src[1], src[2] };
    auto tmp = src + 4;
    for( int i=1; i<16; i++ )
    {
        for( int j=0; j<3; j++ )
        {
            if( tmp[j] < min[j] ) min[j] = tmp[j];
            else if( tmp[j] > max[j] ) max[j] = tmp[j];
        }
        tmp += 4;
    }

    const uint32_t range = DivTable[max[0] - min[0] + max[1] - min[1] + max[2] - min[2]];
    const uint32_t rmin = min[0] + min[1] + min[2];
    for( int i=0; i<3; i++ )
    {
        const uint8_t inset = ( max[i] - min[i] ) >> 4;
        min[i] += inset;
        max[i] -= inset;
    }

    uint32_t data = 0;
    for( int i=0; i<16; i++ )
    {
        const uint32_t c = src[0] + src[1] + src[2] - rmin;
        const uint8_t idx = ( c * range ) >> 16;
        data |= idx << (i*2);
        src += 4;
    }

    return uint64_t( ( uint64_t( to565( min[0], min[1], min[2] ) ) << 16 ) | to565( max[0], max[1], max[2] ) | ( uint64_t( data ) << 32 ) );
#endif
}

#ifdef __AVX2__
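// AVX2 variant of ProcessRGB: compresses two horizontally adjacent 4x4
// blocks per call and writes the finished 16 bytes straight to dst, applying
// both the solid-block masking and the DxtcIndexTable remap itself.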
static etcpak_force_inline void ProcessRGB_AVX( const uint8_t* src, char*& dst )
{
    __m256i px0 = _mm256_loadu_si256(((__m256i*)src) + 0);
    __m256i px1 = _mm256_loadu_si256(((__m256i*)src) + 1);
    __m256i px2 = _mm256_loadu_si256(((__m256i*)src) + 2);
    __m256i px3 = _mm256_loadu_si256(((__m256i*)src) + 3);

    __m256i smask = _mm256_set1_epi32( 0xF8FCF8 );
    __m256i sd0 = _mm256_and_si256( px0, smask );
    __m256i sd1 = _mm256_and_si256( px1, smask );
    __m256i sd2 = _mm256_and_si256( px2, smask );
    __m256i sd3 = _mm256_and_si256( px3, smask );

    __m256i sc = _mm256_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0));

    __m256i sc0 = _mm256_cmpeq_epi8(sd0, sc);
    __m256i sc1 = _mm256_cmpeq_epi8(sd1, sc);
    __m256i sc2 = _mm256_cmpeq_epi8(sd2, sc);
    __m256i sc3 = _mm256_cmpeq_epi8(sd3, sc);

    __m256i sm0 = _mm256_and_si256(sc0, sc1);
    __m256i sm1 = _mm256_and_si256(sc2, sc3);
    __m256i sm = _mm256_and_si256(sm0, sm1);

    const int64_t solid0 = 1 - _mm_testc_si128( _mm256_castsi256_si128( sm ), _mm_set1_epi32( -1 ) );
    const int64_t solid1 = 1 - _mm_testc_si128( _mm256_extracti128_si256( sm, 1 ), _mm_set1_epi32( -1 ) );

    if( solid0 + solid1 == 0 )
    {
        const auto c0 = uint64_t( to565( src[0], src[1], src[2] ) );
        const auto c1 = uint64_t( to565( src[16], src[17], src[18] ) );
        memcpy( dst, &c0, 8 );
        memcpy( dst+8, &c1, 8 );
        dst += 16;
        return;
    }

    __m256i min0 = _mm256_min_epu8( px0, px1 );
    __m256i min1 = _mm256_min_epu8( px2, px3 );
    __m256i min2 = _mm256_min_epu8( min0, min1 );

    __m256i max0 = _mm256_max_epu8( px0, px1 );
    __m256i max1 = _mm256_max_epu8( px2, px3 );
    __m256i max2 = _mm256_max_epu8( max0, max1 );

    __m256i min3 = _mm256_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m256i max3 = _mm256_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m256i min4 = _mm256_min_epu8( min2, min3 );
    __m256i max4 = _mm256_max_epu8( max2, max3 );

    __m256i min5 = _mm256_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m256i max5 = _mm256_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m256i rmin = _mm256_min_epu8( min4, min5 );
    __m256i rmax = _mm256_max_epu8( max4, max5 );

    __m256i range1 = _mm256_subs_epu8( rmax, rmin );
    __m256i range2 = _mm256_sad_epu8( rmax, rmin );

    uint16_t vrange0 = DivTable[_mm256_cvtsi256_si32( range2 ) >> 1];
    uint16_t vrange1 = DivTable[_mm256_extract_epi16( range2, 8 ) >> 1];
    __m256i range00 = _mm256_set1_epi16( vrange0 );
    __m256i range = _mm256_inserti128_si256( range00, _mm_set1_epi16( vrange1 ), 1 );

    __m256i inset1 = _mm256_srli_epi16( range1, 4 );
    __m256i inset = _mm256_and_si256( inset1, _mm256_set1_epi8( 0xF ) );
    __m256i min = _mm256_adds_epu8( rmin, inset );
    __m256i max = _mm256_subs_epu8( rmax, inset );

    __m256i c0 = _mm256_subs_epu8( px0, rmin );
    __m256i c1 = _mm256_subs_epu8( px1, rmin );
    __m256i c2 = _mm256_subs_epu8( px2, rmin );
    __m256i c3 = _mm256_subs_epu8( px3, rmin );

    __m256i is0 = _mm256_maddubs_epi16( c0, _mm256_set1_epi8( 1 ) );
    __m256i is1 = _mm256_maddubs_epi16( c1, _mm256_set1_epi8( 1 ) );
    __m256i is2 = _mm256_maddubs_epi16( c2, _mm256_set1_epi8( 1 ) );
    __m256i is3 = _mm256_maddubs_epi16( c3, _mm256_set1_epi8( 1 ) );

    __m256i s0 = _mm256_hadd_epi16( is0, is1 );
    __m256i s1 = _mm256_hadd_epi16( is2, is3 );

    __m256i m0 = _mm256_mulhi_epu16( s0, range );
    __m256i m1 = _mm256_mulhi_epu16( s1, range );

    __m256i p0 = _mm256_packus_epi16( m0, m1 );

    __m256i p1 = _mm256_or_si256( _mm256_srai_epi32( p0, 6 ), _mm256_srai_epi32( p0, 12 ) );
    __m256i p2 = _mm256_or_si256( _mm256_srai_epi32( p0, 18 ), p0 );
    __m256i p3 = _mm256_or_si256( p1, p2 );
    __m256i p = _mm256_shuffle_epi8( p3, _mm256_set1_epi32( 0x0C080400 ) );

    __m256i mm0 = _mm256_unpacklo_epi8( _mm256_setzero_si256(), min );
    __m256i mm1 = _mm256_unpacklo_epi8( _mm256_setzero_si256(), max );
    __m256i mm2 = _mm256_unpacklo_epi64( mm1, mm0 );
    __m256i mmr = _mm256_slli_epi64( _mm256_srli_epi64( mm2, 11 ), 11 );
    __m256i mmg = _mm256_slli_epi64( _mm256_srli_epi64( mm2, 26 ), 5 );
    __m256i mmb = _mm256_srli_epi64( _mm256_slli_epi64( mm2, 16 ), 59 );
    __m256i mm3 = _mm256_or_si256( mmr, mmg );
    __m256i mm4 = _mm256_or_si256( mm3, mmb );
    __m256i mm5 = _mm256_shuffle_epi8( mm4, _mm256_set1_epi32( 0x09080100 ) );

    __m256i d0 = _mm256_unpacklo_epi32( mm5, p );
    __m256i d1 = _mm256_permute4x64_epi64( d0, _MM_SHUFFLE( 3, 2, 2, 0 ) );
    __m128i d2 = _mm256_castsi256_si128( d1 );

    __m128i mask = _mm_set_epi64x( 0xFFFF0000 | -solid1, 0xFFFF0000 | -solid0 );
    __m128i d3 = _mm_and_si128( d2, mask );
    _mm_storeu_si128( (__m128i*)dst, d3 );

    for( int j=4; j<8; j++ ) dst[j] = (char)DxtcIndexTable[(uint8_t)dst[j]];
    for( int j=12; j<16; j++ ) dst[j] = (char)DxtcIndexTable[(uint8_t)dst[j]];

    dst += 16;
}
#endif

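// Remaps a 3-bit alpha index from internal ramp order (0 = min ... 7 = max)
// to the DXT5 convention (0 = alpha0/max, 1 = alpha1/min, 2..7 = interpolants
// descending from alpha0 to alpha1).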
static const uint8_t AlphaIndexTable[8] = { 1, 7, 6, 5, 4, 3, 2, 0 };

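// Compress the 16 alpha values of a block to a 64-bit DXT5 alpha block.
// Solid alpha is detected with two 8-byte compares; otherwise min/max are
// found and each value is scaled into a 3-bit index, remapped through
// AlphaIndexTable. The result is max | min << 8 | indices << 16.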
static etcpak_force_inline uint64_t ProcessAlpha( const uint8_t* src )
{
    uint8_t solid8 = *src;
    uint16_t solid16 = uint16_t( solid8 ) | ( uint16_t( solid8 ) << 8 );
    uint32_t solid32 = uint32_t( solid16 ) | ( uint32_t( solid16 ) << 16 );
    uint64_t solid64 = uint64_t( solid32 ) | ( uint64_t( solid32 ) << 32 );
    if( memcmp( src, &solid64, 8 ) == 0 && memcmp( src+8, &solid64, 8 ) == 0 )
    {
        return solid8;
    }

    uint8_t min = src[0];
    uint8_t max = min;
    for( int i=1; i<16; i++ )
    {
        const auto v = src[i];
        if( v > max ) max = v;
        else if( v < min ) min = v;
    }

    uint32_t range = ( 8 << 13 ) / ( 1 + max - min );
    uint64_t data = 0;
    for( int i=0; i<16; i++ )
    {
        uint8_t a = src[i] - min;
        uint64_t idx = AlphaIndexTable[( a * range ) >> 13];
        data |= idx << (i*3);
    }

    return max | ( min << 8 ) | ( data << 16 );
}

#ifdef __SSE4_1__
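// Register-based variants used by CompressDxt5 below: the caller loads the
// four pixel rows once and feeds the same registers to both the color and
// the alpha encoder.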
static etcpak_force_inline uint64_t ProcessRGB_SSE( __m128i px0, __m128i px1, __m128i px2, __m128i px3 )
{
    __m128i smask = _mm_set1_epi32( 0xF8FCF8 );
    __m128i sd0 = _mm_and_si128( px0, smask );
    __m128i sd1 = _mm_and_si128( px1, smask );
    __m128i sd2 = _mm_and_si128( px2, smask );
    __m128i sd3 = _mm_and_si128( px3, smask );

    __m128i sc = _mm_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0));

    __m128i sc0 = _mm_cmpeq_epi8(sd0, sc);
    __m128i sc1 = _mm_cmpeq_epi8(sd1, sc);
    __m128i sc2 = _mm_cmpeq_epi8(sd2, sc);
    __m128i sc3 = _mm_cmpeq_epi8(sd3, sc);

    __m128i sm0 = _mm_and_si128(sc0, sc1);
    __m128i sm1 = _mm_and_si128(sc2, sc3);
    __m128i sm = _mm_and_si128(sm0, sm1);

    if( _mm_testc_si128(sm, _mm_set1_epi32(-1)) )
    {
        return uint64_t( to565( _mm_cvtsi128_si32( px0 ) ) ) << 16;
    }

    px0 = _mm_and_si128( px0, _mm_set1_epi32( 0xFFFFFF ) );
    px1 = _mm_and_si128( px1, _mm_set1_epi32( 0xFFFFFF ) );
    px2 = _mm_and_si128( px2, _mm_set1_epi32( 0xFFFFFF ) );
    px3 = _mm_and_si128( px3, _mm_set1_epi32( 0xFFFFFF ) );

    __m128i min0 = _mm_min_epu8( px0, px1 );
    __m128i min1 = _mm_min_epu8( px2, px3 );
    __m128i min2 = _mm_min_epu8( min0, min1 );

    __m128i max0 = _mm_max_epu8( px0, px1 );
    __m128i max1 = _mm_max_epu8( px2, px3 );
    __m128i max2 = _mm_max_epu8( max0, max1 );

    __m128i min3 = _mm_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m128i max3 = _mm_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m128i min4 = _mm_min_epu8( min2, min3 );
    __m128i max4 = _mm_max_epu8( max2, max3 );

    __m128i min5 = _mm_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m128i max5 = _mm_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m128i rmin = _mm_min_epu8( min4, min5 );
    __m128i rmax = _mm_max_epu8( max4, max5 );

    __m128i range1 = _mm_subs_epu8( rmax, rmin );
    __m128i range2 = _mm_sad_epu8( rmax, rmin );

    uint32_t vrange = _mm_cvtsi128_si32( range2 ) >> 1;
    __m128i range = _mm_set1_epi16( DivTable[vrange] );

    __m128i inset1 = _mm_srli_epi16( range1, 4 );
    __m128i inset = _mm_and_si128( inset1, _mm_set1_epi8( 0xF ) );
    __m128i min = _mm_adds_epu8( rmin, inset );
    __m128i max = _mm_subs_epu8( rmax, inset );

    __m128i c0 = _mm_subs_epu8( px0, rmin );
    __m128i c1 = _mm_subs_epu8( px1, rmin );
    __m128i c2 = _mm_subs_epu8( px2, rmin );
    __m128i c3 = _mm_subs_epu8( px3, rmin );

    __m128i is0 = _mm_maddubs_epi16( c0, _mm_set1_epi8( 1 ) );
    __m128i is1 = _mm_maddubs_epi16( c1, _mm_set1_epi8( 1 ) );
    __m128i is2 = _mm_maddubs_epi16( c2, _mm_set1_epi8( 1 ) );
    __m128i is3 = _mm_maddubs_epi16( c3, _mm_set1_epi8( 1 ) );

    __m128i s0 = _mm_hadd_epi16( is0, is1 );
    __m128i s1 = _mm_hadd_epi16( is2, is3 );

    __m128i m0 = _mm_mulhi_epu16( s0, range );
    __m128i m1 = _mm_mulhi_epu16( s1, range );

    __m128i p0 = _mm_packus_epi16( m0, m1 );

    __m128i p1 = _mm_or_si128( _mm_srai_epi32( p0, 6 ), _mm_srai_epi32( p0, 12 ) );
    __m128i p2 = _mm_or_si128( _mm_srai_epi32( p0, 18 ), p0 );
    __m128i p3 = _mm_or_si128( p1, p2 );
    __m128i p = _mm_shuffle_epi8( p3, _mm_set1_epi32( 0x0C080400 ) );

    uint32_t vmin = _mm_cvtsi128_si32( min );
    uint32_t vmax = _mm_cvtsi128_si32( max );
    uint32_t vp = _mm_cvtsi128_si32( p );

    return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) );
}

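// Gathers the 16 alpha bytes (byte 3 of every pixel) into one register,
// early-outs on solid alpha, reduces to min/max, then derives all 3-bit
// indices with one multiply-high and remaps them two at a time through
// AlphaIndexTable_SSE.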
static etcpak_force_inline uint64_t ProcessAlpha_SSE( __m128i px0, __m128i px1, __m128i px2, __m128i px3 )
{
    __m128i mask = _mm_setr_epi32( 0x0f0b0703, -1, -1, -1 );

    __m128i m0 = _mm_shuffle_epi8( px0, mask );
    __m128i m1 = _mm_shuffle_epi8( px1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
    __m128i m2 = _mm_shuffle_epi8( px2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
    __m128i m3 = _mm_shuffle_epi8( px3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
    __m128i m4 = _mm_or_si128( m0, m1 );
    __m128i m5 = _mm_or_si128( m2, m3 );
    __m128i a = _mm_or_si128( m4, m5 );

    __m128i solidCmp = _mm_shuffle_epi8( a, _mm_setzero_si128() );
    __m128i cmpRes = _mm_cmpeq_epi8( a, solidCmp );
    if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) )
    {
        return _mm_cvtsi128_si32( a ) & 0xFF;
    }

    __m128i a1 = _mm_shuffle_epi32( a, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m128i max1 = _mm_max_epu8( a, a1 );
    __m128i min1 = _mm_min_epu8( a, a1 );
    __m128i amax2 = _mm_shuffle_epi32( max1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m128i amin2 = _mm_shuffle_epi32( min1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m128i max2 = _mm_max_epu8( max1, amax2 );
    __m128i min2 = _mm_min_epu8( min1, amin2 );
    __m128i amax3 = _mm_alignr_epi8( max2, max2, 2 );
    __m128i amin3 = _mm_alignr_epi8( min2, min2, 2 );
    __m128i max3 = _mm_max_epu8( max2, amax3 );
    __m128i min3 = _mm_min_epu8( min2, amin3 );
    __m128i amax4 = _mm_alignr_epi8( max3, max3, 1 );
    __m128i amin4 = _mm_alignr_epi8( min3, min3, 1 );
    __m128i max = _mm_max_epu8( max3, amax4 );
    __m128i min = _mm_min_epu8( min3, amin4 );
    __m128i minmax = _mm_unpacklo_epi8( max, min );

    __m128i r = _mm_sub_epi8( max, min );
    int range = _mm_cvtsi128_si32( r ) & 0xFF;
    __m128i rv = _mm_set1_epi16( DivTableAlpha[range] );

    __m128i v = _mm_sub_epi8( a, min );

    __m128i lo16 = _mm_unpacklo_epi8( v, _mm_setzero_si128() );
    __m128i hi16 = _mm_unpackhi_epi8( v, _mm_setzero_si128() );

    __m128i lomul = _mm_mulhi_epu16( lo16, rv );
    __m128i himul = _mm_mulhi_epu16( hi16, rv );

    __m128i p0 = _mm_packus_epi16( lomul, himul );
    __m128i p1 = _mm_or_si128( _mm_and_si128( p0, _mm_set1_epi16( 0x3F ) ), _mm_srai_epi16( _mm_and_si128( p0, _mm_set1_epi16( 0x3F00 ) ), 5 ) );
    __m128i p2 = _mm_packus_epi16( p1, p1 );

    uint64_t pi = _mm_cvtsi128_si64( p2 );
    uint64_t data = 0;
    for( int i=0; i<8; i++ )
    {
        uint64_t idx = AlphaIndexTable_SSE[(pi>>(i*8)) & 0x3F];
        data |= idx << (i*6);
    }
    return (uint64_t)(uint16_t)_mm_cvtsi128_si32( minmax ) | ( data << 16 );
}
#endif

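// Compress an RGBA image to DXT1. src points to the first pixel, dst receives
// one 64-bit block per 4x4 tile, blocks is the total number of blocks and
// width is the image width in pixels. Four rows are gathered into a local
// buffer per block; once a row of blocks is done, src skips the three pixel
// rows that were already consumed.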
void CompressDxt1( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
{
#ifdef __AVX2__
    if( width%8 == 0 )
    {
        blocks /= 2;
        uint32_t buf[8*4];
        int i = 0;
        char* dst8 = (char*)dst;

        do
        {
            auto tmp = (char*)buf;
            memcpy( tmp, src + width * 0, 8*4 );
            memcpy( tmp + 8*4, src + width * 1, 8*4 );
            memcpy( tmp + 16*4, src + width * 2, 8*4 );
            memcpy( tmp + 24*4, src + width * 3, 8*4 );
            src += 8;
            if( ++i == width/8 )
            {
                src += width * 3;
                i = 0;
            }

            ProcessRGB_AVX( (uint8_t*)buf, dst8 );
        }
        while( --blocks );
    }
    else
#endif
    {
        uint32_t buf[4*4];
        int i = 0;

        auto ptr = dst;
        do
        {
            auto tmp = (char*)buf;
            memcpy( tmp, src + width * 0, 4*4 );
            memcpy( tmp + 4*4, src + width * 1, 4*4 );
            memcpy( tmp + 8*4, src + width * 2, 4*4 );
            memcpy( tmp + 12*4, src + width * 3, 4*4 );
            src += 4;
            if( ++i == width/4 )
            {
                src += width * 3;
                i = 0;
            }

            const auto c = ProcessRGB( (uint8_t*)buf );
            uint8_t fix[8];
            memcpy( fix, &c, 8 );
            for( int j=4; j<8; j++ ) fix[j] = DxtcIndexTable[fix[j]];
            memcpy( ptr, fix, sizeof( uint64_t ) );
            ptr++;
        }
        while( --blocks );
    }
}
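// Usage sketch (hypothetical caller, not part of this file), assuming a
// 256x256 RGBA image in `pixels`:
//
//     std::vector<uint64_t> out( ( 256 / 4 ) * ( 256 / 4 ) );
//     CompressDxt1( pixels, out.data(), uint32_t( out.size() ), 256 );
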
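// Same as the generic path of CompressDxt1, but each block is run through
// Dither() from Dither.hpp before encoding, which trades banding in smooth
// gradients for noise.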
void CompressDxt1Dither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
{
    uint32_t buf[4*4];
    int i = 0;

    auto ptr = dst;
    do
    {
        auto tmp = (char*)buf;
        memcpy( tmp, src + width * 0, 4*4 );
        memcpy( tmp + 4*4, src + width * 1, 4*4 );
        memcpy( tmp + 8*4, src + width * 2, 4*4 );
        memcpy( tmp + 12*4, src + width * 3, 4*4 );
        src += 4;
        if( ++i == width/4 )
        {
            src += width * 3;
            i = 0;
        }

        Dither( (uint8_t*)buf );

        const auto c = ProcessRGB( (uint8_t*)buf );
        uint8_t fix[8];
        memcpy( fix, &c, 8 );
        for( int j=4; j<8; j++ ) fix[j] = DxtcIndexTable[fix[j]];
        memcpy( ptr, fix, sizeof( uint64_t ) );
        ptr++;
    }
    while( --blocks );
}

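// Compress an RGBA image to DXT5: each 16-byte output block is a 64-bit
// alpha block followed by a 64-bit color block. The SSE4.1 path loads the
// four pixel rows once and shares the registers between ProcessAlpha_SSE and
// ProcessRGB_SSE; the fallback splits the block into separate alpha and RGB
// buffers.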
void CompressDxt5( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
{
    int i = 0;
    auto ptr = dst;
    do
    {
#ifdef __SSE4_1__
        __m128i px0 = _mm_loadu_si128( (__m128i*)( src + width * 0 ) );
        __m128i px1 = _mm_loadu_si128( (__m128i*)( src + width * 1 ) );
        __m128i px2 = _mm_loadu_si128( (__m128i*)( src + width * 2 ) );
        __m128i px3 = _mm_loadu_si128( (__m128i*)( src + width * 3 ) );

        src += 4;
        if( ++i == width/4 )
        {
            src += width * 3;
            i = 0;
        }

        *ptr++ = ProcessAlpha_SSE( px0, px1, px2, px3 );

        const auto c = ProcessRGB_SSE( px0, px1, px2, px3 );
        uint8_t fix[8];
        memcpy( fix, &c, 8 );
        for( int j=4; j<8; j++ ) fix[j] = DxtcIndexTable[fix[j]];
        memcpy( ptr, fix, sizeof( uint64_t ) );
        ptr++;
#else
        uint32_t rgba[4*4];
        uint8_t alpha[4*4];

        auto tmp = (char*)rgba;
        memcpy( tmp, src + width * 0, 4*4 );
        memcpy( tmp + 4*4, src + width * 1, 4*4 );
        memcpy( tmp + 8*4, src + width * 2, 4*4 );
        memcpy( tmp + 12*4, src + width * 3, 4*4 );
        src += 4;
        if( ++i == width/4 )
        {
            src += width * 3;
            i = 0;
        }

        for( int j=0; j<16; j++ )
        {
            alpha[j] = rgba[j] >> 24;
            rgba[j] &= 0xFFFFFF;
        }
        *ptr++ = ProcessAlpha( alpha );

        const auto c = ProcessRGB( (uint8_t*)rgba );
        uint8_t fix[8];
        memcpy( fix, &c, 8 );
        for( int j=4; j<8; j++ ) fix[j] = DxtcIndexTable[fix[j]];
        memcpy( ptr, fix, sizeof( uint64_t ) );
        ptr++;
#endif
    }
    while( --blocks );
}