// Copyright 2009 Intel Corporation
// All Rights Reserved
//
// Permission is granted to use, copy, distribute and prepare derivative works of this
// software for any purpose and without fee, provided, that the above copyright notice
// and this statement appear in all copies. Intel makes no representations about the
// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS."
// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY,
// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE,
// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not
// assume any responsibility for any errors which may appear in this software nor any
// responsibility to update it.
//
// From:
// https://software.intel.com/sites/default/files/m/d/4/1/d/8/UsingIntelAVXToImplementIDCT-r1_5.pdf
// https://software.intel.com/file/29048
//
// Requires SSE2 (integer __m128i intrinsics)
//
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <immintrin.h>
#include <stdint.h>

#ifdef _MSC_VER
#define JPGD_SIMD_ALIGN(type, name) __declspec(align(16)) type name
#else
#define JPGD_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
#endif
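// Note: JPGD_SIMD_ALIGN requests 16-byte alignment because the tables below are
// read with aligned SSE2 loads (_mm_load_si128 and direct __m128i dereferences),
// which fault on unaligned addresses.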

#define BITS_INV_ACC 4
#define SHIFT_INV_ROW (16 - BITS_INV_ACC)
#define SHIFT_INV_COL (1 + BITS_INV_ACC)
const short IRND_INV_ROW = 1024 * (6 - BITS_INV_ACC); // 1 << (SHIFT_INV_ROW - 1)
const short IRND_INV_COL = 16 * (BITS_INV_ACC - 3);   // 1 << (SHIFT_INV_COL - 1)
const short IRND_INV_CORR = IRND_INV_COL - 1;         // correction -1.0 and round
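
// Hedged sanity check (assumption: compiled as C++11 or later, otherwise skipped):
// restates the relationships documented in the comments above at compile time.
#if defined(__cplusplus) && (__cplusplus >= 201103L)
static_assert(IRND_INV_ROW == (1 << (SHIFT_INV_ROW - 1)), "row-pass rounding constant mismatch");
static_assert(IRND_INV_COL == (1 << (SHIFT_INV_COL - 1)), "column-pass rounding constant mismatch");
#endif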

JPGD_SIMD_ALIGN(short, shortM128_one_corr[8]) = {1, 1, 1, 1, 1, 1, 1, 1};
JPGD_SIMD_ALIGN(short, shortM128_round_inv_row[8]) = {IRND_INV_ROW, 0, IRND_INV_ROW, 0, IRND_INV_ROW, 0, IRND_INV_ROW, 0};
JPGD_SIMD_ALIGN(short, shortM128_round_inv_col[8]) = {IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL};
JPGD_SIMD_ALIGN(short, shortM128_round_inv_corr[8]) = {IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR};
JPGD_SIMD_ALIGN(short, shortM128_tg_1_16[8]) = {13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036};         // tan(1*pi/16) * (1 << 16)
JPGD_SIMD_ALIGN(short, shortM128_tg_2_16[8]) = {27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146};         // tan(2*pi/16) * (1 << 16)
JPGD_SIMD_ALIGN(short, shortM128_tg_3_16[8]) = {-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746}; // (tan(3*pi/16) - 1) * (1 << 16)
JPGD_SIMD_ALIGN(short, shortM128_cos_4_16[8]) = {-19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195};// (cos(4*pi/16) - 1) * (1 << 16)
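// Worked example of the fixed-point scaling above: tan(pi/16) = 0.198912, and
// 0.198912 * 65536 = 13036 (rounded); likewise (cos(pi/4) - 1) * 65536 = -19195.
// _mm_mulhi_epi16 later keeps the top 16 bits of each 32-bit product, which is
// equivalent to multiplying by the real-valued constant.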

//-----------------------------------------------------------------------------
// Table for rows 0,4 - constants are multiplied by cos_4_16
// w15 w14 w11 w10 w07 w06 w03 w02
// w29 w28 w25 w24 w21 w20 w17 w16
// w31 w30 w27 w26 w23 w22 w19 w18
//movq -> w05 w04 w01 w00
JPGD_SIMD_ALIGN(short, shortM128_tab_i_04[]) = {
    16384, 21407, 16384, 8867,
    16384, -8867, 16384, -21407,    // w13 w12 w09 w08
    16384, 8867, -16384, -21407,    // w07 w06 w03 w02
    -16384, 21407, 16384, -8867,    // w15 w14 w11 w10
    22725, 19266, 19266, -4520,     // w21 w20 w17 w16
    12873, -22725, 4520, -12873,    // w29 w28 w25 w24
    12873, 4520, -22725, -12873,    // w23 w22 w19 w18
    4520, 19266, 19266, -22725};    // w31 w30 w27 w26

// Table for rows 1,7 - constants are multiplied by cos_1_16
//movq -> w05 w04 w01 w00
JPGD_SIMD_ALIGN(short, shortM128_tab_i_17[]) = {
    22725, 29692, 22725, 12299,
    22725, -12299, 22725, -29692,   // w13 w12 w09 w08
    22725, 12299, -22725, -29692,   // w07 w06 w03 w02
    -22725, 29692, 22725, -12299,   // w15 w14 w11 w10
    31521, 26722, 26722, -6270,     // w21 w20 w17 w16
    17855, -31521, 6270, -17855,    // w29 w28 w25 w24
    17855, 6270, -31521, -17855,    // w23 w22 w19 w18
    6270, 26722, 26722, -31521};    // w31 w30 w27 w26

// Table for rows 2,6 - constants are multiplied by cos_2_16
//movq -> w05 w04 w01 w00
JPGD_SIMD_ALIGN(short, shortM128_tab_i_26[]) = {
    21407, 27969, 21407, 11585,
    21407, -11585, 21407, -27969,   // w13 w12 w09 w08
    21407, 11585, -21407, -27969,   // w07 w06 w03 w02
    -21407, 27969, 21407, -11585,   // w15 w14 w11 w10
    29692, 25172, 25172, -5906,     // w21 w20 w17 w16
    16819, -29692, 5906, -16819,    // w29 w28 w25 w24
    16819, 5906, -29692, -16819,    // w23 w22 w19 w18
    5906, 25172, 25172, -29692};    // w31 w30 w27 w26

// Table for rows 3,5 - constants are multiplied by cos_3_16
//movq -> w05 w04 w01 w00
JPGD_SIMD_ALIGN(short, shortM128_tab_i_35[]) = {
    19266, 25172, 19266, 10426,
    19266, -10426, 19266, -25172,   // w13 w12 w09 w08
    19266, 10426, -19266, -25172,   // w07 w06 w03 w02
    -19266, 25172, 19266, -10426,   // w15 w14 w11 w10
    26722, 22654, 22654, -5315,     // w21 w20 w17 w16
    15137, -26722, 5315, -15137,    // w29 w28 w25 w24
    15137, 5315, -26722, -15137,    // w23 w22 w19 w18
    5315, 22654, 22654, -26722};    // w31 w30 w27 w26

JPGD_SIMD_ALIGN(short, shortM128_128[8]) = { 128, 128, 128, 128, 128, 128, 128, 128 };

void idctSSEShortU8(const short *pInput, uint8_t * pOutputUB)
{
    __m128i r_xmm0, r_xmm4;
    __m128i r_xmm1, r_xmm2, r_xmm3, r_xmm5, r_xmm6, r_xmm7;
    __m128i row0, row1, row2, row3, row4, row5, row6, row7;

    //Table pointers for the row pass; they start at the rows 0,4 and 2,6 tables
    //and are switched to the rows 3,5 and 1,7 tables partway through.
    short * pTab_i_04 = shortM128_tab_i_04;
    short * pTab_i_26 = shortM128_tab_i_26;

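    // Two fixed-point passes follow. The row pass multiplies pairs of input rows
    // against the precomputed w-coefficient tables with _mm_madd_epi16, adds the
    // row rounding constant, shifts the 32-bit sums right by SHIFT_INV_ROW (12),
    // and packs back to 16 bits as row0..row7. The column pass then combines those
    // results with the tangent/cosine constants, shifts by SHIFT_INV_COL, adds 128,
    // and packs to unsigned bytes.
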
    //Process input rows 0 and 2 -> row0, row2
    r_xmm0 = _mm_load_si128((__m128i *) pInput);
    r_xmm4 = _mm_load_si128((__m128i *) (&pInput[2*8]));

    // *** Work on the data in xmm0
    //low shuffle mask = 0xd8 = 11 01 10 00
    //get short 2 and short 0 into the low 32 bits
    r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);

    // copy short 2 and short 0 to all locations
    r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);

    // multiply-add (pmaddwd) those copies with the first table coefficients
    r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));

    // shuffle mask = 0x55 = 01 01 01 01
    // copy short 3 and short 1 to all locations
    r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);

    // high shuffle mask = 0xd8 = 11 01 10 00
    // get short 6 and short 4 into bit positions 64-95
    // get short 7 and short 5 into bit positions 96-127
    r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);

    // multiply-add the short 3 / short 1 copies with the table
    r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));

    // shuffle mask = 0xaa = 10 10 10 10
    // copy short 6 and short 4 to all locations
    r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);

    // shuffle mask = 0xff = 11 11 11 11
    // copy short 7 and short 5 to all locations
    r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);

    // multiply-add the short 6 / short 4 copies with the table
    r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));

    // *** Work on the data in xmm4
    // high shuffle mask = 0xd8 = 11 01 10 00
    // get short 6 and short 4 into bit positions 64-95
    // get short 7 and short 5 into bit positions 96-127
    r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);

    // add the row rounding constant to the accumulated (short 2, short 0) products
    r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
    r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
    r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
    r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
    r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &shortM128_tab_i_26[0]));
    r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
    r_xmm2 = r_xmm1;
    r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
    r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &shortM128_tab_i_26[8]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
    r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
    r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
    r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &shortM128_tab_i_26[16]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
    r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
    r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &shortM128_tab_i_26[24]));
    r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
    r_xmm6 = r_xmm5;
    r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
    r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
    row0 = _mm_packs_epi32(r_xmm0, r_xmm2);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
    r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
    r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
    r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
    r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
    row2 = _mm_packs_epi32(r_xmm4, r_xmm6);

    //Process input rows 4 and 6 -> row4, row6
    r_xmm0 = _mm_load_si128((__m128i *) (&pInput[4*8]));
    r_xmm4 = _mm_load_si128((__m128i *) (&pInput[6*8]));

    r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
    r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
    r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
    r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
    r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
    r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
    r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
    r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
    r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
    r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
    r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
    r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
    r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
    r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
    r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &shortM128_tab_i_26[0]));
    r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
    r_xmm2 = r_xmm1;
    r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
    r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &shortM128_tab_i_26[8]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
    r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
    r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
    r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &shortM128_tab_i_26[16]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
    r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
    r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &shortM128_tab_i_26[24]));
    r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
    r_xmm6 = r_xmm5;
    r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
    r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
    row4 = _mm_packs_epi32(r_xmm0, r_xmm2);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
    r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
    r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
    r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
    r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
    row6 = _mm_packs_epi32(r_xmm4, r_xmm6);

    //Process input rows 3 and 1 -> row3, row1 (switch to the rows 3,5 and 1,7 tables)
    pTab_i_04 = shortM128_tab_i_35;
    pTab_i_26 = shortM128_tab_i_17;
    r_xmm0 = _mm_load_si128((__m128i *) (&pInput[3*8]));
    r_xmm4 = _mm_load_si128((__m128i *) (&pInput[1*8]));

    r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
    r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
    r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
    r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
    r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
    r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
    r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
    r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
    r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
    r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
    r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
    r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
    r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
    r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
    r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &pTab_i_26[0]));
    r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
    r_xmm2 = r_xmm1;
    r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
    r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &pTab_i_26[8]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
    r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
    r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
    r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &pTab_i_26[16]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
    r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
    r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &pTab_i_26[24]));
    r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
    r_xmm6 = r_xmm5;
    r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
    r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
    row3 = _mm_packs_epi32(r_xmm0, r_xmm2);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
    r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
    r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
    r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
    r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
    row1 = _mm_packs_epi32(r_xmm4, r_xmm6);

    //Process input rows 5 and 7 -> row5, row7
    r_xmm0 = _mm_load_si128((__m128i *) (&pInput[5*8]));
    r_xmm4 = _mm_load_si128((__m128i *) (&pInput[7*8]));

    r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
    r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
    r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
    r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
    r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
    r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
    r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
    r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
    r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
    r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
    r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
    r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
    r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
    r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
    r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &pTab_i_26[0]));
    r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
    r_xmm2 = r_xmm1;
    r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
    r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &pTab_i_26[8]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
    r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
    r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
    r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &pTab_i_26[16]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
    r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
    r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &pTab_i_26[24]));
    r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
    r_xmm6 = r_xmm5;
    r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
    r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
    row5 = _mm_packs_epi32(r_xmm0, r_xmm2);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
    r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
    r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
    r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
    r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
    row7 = _mm_packs_epi32(r_xmm4, r_xmm6);

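    // Column pass: combine the eight 16-bit row results using the tangent/cosine
    // constants (via _mm_mulhi_epi16, i.e. the top 16 bits of the 32-bit products),
    // add the column rounding constants, and shift right by SHIFT_INV_COL.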
    r_xmm1 = _mm_load_si128((__m128i *) shortM128_tg_3_16);
    r_xmm2 = row5;
    r_xmm3 = row3;
    r_xmm0 = _mm_mulhi_epi16(row5, r_xmm1);

    r_xmm1 = _mm_mulhi_epi16(r_xmm1, r_xmm3);
    r_xmm5 = _mm_load_si128((__m128i *) shortM128_tg_1_16);
    r_xmm6 = row7;
    r_xmm4 = _mm_mulhi_epi16(row7, r_xmm5);

    r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm2);
    r_xmm5 = _mm_mulhi_epi16(r_xmm5, row1);
    r_xmm1 = _mm_adds_epi16(r_xmm1, r_xmm3);
    r_xmm7 = row6;

    r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm3);
    r_xmm3 = _mm_load_si128((__m128i *) shortM128_tg_2_16);
    r_xmm2 = _mm_subs_epi16(r_xmm2, r_xmm1);
    r_xmm7 = _mm_mulhi_epi16(r_xmm7, r_xmm3);
    r_xmm1 = r_xmm0;
    r_xmm3 = _mm_mulhi_epi16(r_xmm3, row2);
    r_xmm5 = _mm_subs_epi16(r_xmm5, r_xmm6);
    r_xmm4 = _mm_adds_epi16(r_xmm4, row1);
    r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm4);
    r_xmm0 = _mm_adds_epi16(r_xmm0, *((__m128i *) shortM128_one_corr));
    r_xmm4 = _mm_subs_epi16(r_xmm4, r_xmm1);
    r_xmm6 = r_xmm5;
    r_xmm5 = _mm_subs_epi16(r_xmm5, r_xmm2);
    r_xmm5 = _mm_adds_epi16(r_xmm5, *((__m128i *) shortM128_one_corr));
    r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm2);

    //Intermediate results, needed later
    __m128i temp3, temp7;
    temp7 = r_xmm0;

    r_xmm1 = r_xmm4;
    r_xmm0 = _mm_load_si128((__m128i *) shortM128_cos_4_16);
    r_xmm4 = _mm_adds_epi16(r_xmm4, r_xmm5);
    r_xmm2 = _mm_load_si128((__m128i *) shortM128_cos_4_16);
    r_xmm2 = _mm_mulhi_epi16(r_xmm2, r_xmm4);

    //Intermediate results, needed later
    temp3 = r_xmm6;

    r_xmm1 = _mm_subs_epi16(r_xmm1, r_xmm5);
    r_xmm7 = _mm_adds_epi16(r_xmm7, row2);
    r_xmm3 = _mm_subs_epi16(r_xmm3, row6);
    r_xmm6 = row0;
    r_xmm0 = _mm_mulhi_epi16(r_xmm0, r_xmm1);
    r_xmm5 = row4;
    r_xmm5 = _mm_adds_epi16(r_xmm5, r_xmm6);
    r_xmm6 = _mm_subs_epi16(r_xmm6, row4);
    r_xmm4 = _mm_adds_epi16(r_xmm4, r_xmm2);

    r_xmm4 = _mm_or_si128(r_xmm4, *((__m128i *) shortM128_one_corr));
    r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm1);
    r_xmm0 = _mm_or_si128(r_xmm0, *((__m128i *) shortM128_one_corr));

    r_xmm2 = r_xmm5;
    r_xmm5 = _mm_adds_epi16(r_xmm5, r_xmm7);
    r_xmm1 = r_xmm6;
    r_xmm5 = _mm_adds_epi16(r_xmm5, *((__m128i *) shortM128_round_inv_col));
    r_xmm2 = _mm_subs_epi16(r_xmm2, r_xmm7);
    r_xmm7 = temp7;
    r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm3);
    r_xmm6 = _mm_adds_epi16(r_xmm6, *((__m128i *) shortM128_round_inv_col));
    r_xmm7 = _mm_adds_epi16(r_xmm7, r_xmm5);
    r_xmm7 = _mm_srai_epi16(r_xmm7, SHIFT_INV_COL);
    r_xmm1 = _mm_subs_epi16(r_xmm1, r_xmm3);
    r_xmm1 = _mm_adds_epi16(r_xmm1, *((__m128i *) shortM128_round_inv_corr));
    r_xmm3 = r_xmm6;
    r_xmm2 = _mm_adds_epi16(r_xmm2, *((__m128i *) shortM128_round_inv_corr));
    r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm4);

    //Store results for row 0
    //_mm_store_si128((__m128i *) pOutput, r_xmm7);
    __m128i r0 = r_xmm7;

    r_xmm6 = _mm_srai_epi16(r_xmm6, SHIFT_INV_COL);
    r_xmm7 = r_xmm1;
    r_xmm1 = _mm_adds_epi16(r_xmm1, r_xmm0);

    //Store results for row 1
    //_mm_store_si128((__m128i *) (&pOutput[1*8]), r_xmm6);
    __m128i r1 = r_xmm6;

    r_xmm1 = _mm_srai_epi16(r_xmm1, SHIFT_INV_COL);
    r_xmm6 = temp3;
    r_xmm7 = _mm_subs_epi16(r_xmm7, r_xmm0);
    r_xmm7 = _mm_srai_epi16(r_xmm7, SHIFT_INV_COL);

    //Store results for row 2
    //_mm_store_si128((__m128i *) (&pOutput[2*8]), r_xmm1);
    __m128i r2 = r_xmm1;

    r_xmm5 = _mm_subs_epi16(r_xmm5, temp7);
    r_xmm5 = _mm_srai_epi16(r_xmm5, SHIFT_INV_COL);

    //Store results for row 7
    //_mm_store_si128((__m128i *) (&pOutput[7*8]), r_xmm5);
    __m128i r7 = r_xmm5;

    r_xmm3 = _mm_subs_epi16(r_xmm3, r_xmm4);
    r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm2);
    r_xmm2 = _mm_subs_epi16(r_xmm2, temp3);
    r_xmm6 = _mm_srai_epi16(r_xmm6, SHIFT_INV_COL);
    r_xmm2 = _mm_srai_epi16(r_xmm2, SHIFT_INV_COL);

    //Store results for row 3
    //_mm_store_si128((__m128i *) (&pOutput[3*8]), r_xmm6);
    __m128i r3 = r_xmm6;

    r_xmm3 = _mm_srai_epi16(r_xmm3, SHIFT_INV_COL);

    //Store results for rows 4, 5, and 6
    //_mm_store_si128((__m128i *) (&pOutput[4*8]), r_xmm2);
    //_mm_store_si128((__m128i *) (&pOutput[5*8]), r_xmm7);
    //_mm_store_si128((__m128i *) (&pOutput[6*8]), r_xmm3);

    __m128i r4 = r_xmm2;
    __m128i r5 = r_xmm7;
    __m128i r6 = r_xmm3;

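    // Undo the JPEG level shift: add 128 to every sample, then pack each pair of
    // rows to unsigned bytes with saturation.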
    r0 = _mm_add_epi16(*(const __m128i *)shortM128_128, r0);
    r1 = _mm_add_epi16(*(const __m128i *)shortM128_128, r1);
    r2 = _mm_add_epi16(*(const __m128i *)shortM128_128, r2);
    r3 = _mm_add_epi16(*(const __m128i *)shortM128_128, r3);
    r4 = _mm_add_epi16(*(const __m128i *)shortM128_128, r4);
    r5 = _mm_add_epi16(*(const __m128i *)shortM128_128, r5);
    r6 = _mm_add_epi16(*(const __m128i *)shortM128_128, r6);
    r7 = _mm_add_epi16(*(const __m128i *)shortM128_128, r7);

    ((__m128i *)pOutputUB)[0] = _mm_packus_epi16(r0, r1);
    ((__m128i *)pOutputUB)[1] = _mm_packus_epi16(r2, r3);
    ((__m128i *)pOutputUB)[2] = _mm_packus_epi16(r4, r5);
    ((__m128i *)pOutputUB)[3] = _mm_packus_epi16(r6, r7);
}

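// Usage sketch (not part of the original Intel sample; the function and variable
// names below are illustrative only): runs the IDCT on one dequantized 8x8
// coefficient block. Both buffers must be 16-byte aligned, because idctSSEShortU8
// uses aligned SSE2 loads and stores throughout.
static inline void example_idct_block()
{
    JPGD_SIMD_ALIGN(short, coeffs[64]) = { 0 }; // dequantized DCT coefficients, row-major
    JPGD_SIMD_ALIGN(uint8_t, pixels[64]);       // reconstructed samples, level-shifted to 0..255

    coeffs[0] = 256;                            // DC-only block, which should reconstruct to a flat block
    idctSSEShortU8(coeffs, pixels);             // pixels[0..63] now hold the output samples
}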