1 | // Copyright 2016 Adrien Descamps |
2 | // Distributed under BSD 3-Clause License |
3 | |
4 | /* You need to define the following macros before including this file: |
5 | SSE_FUNCTION_NAME |
6 | STD_FUNCTION_NAME |
7 | YUV_FORMAT |
8 | RGB_FORMAT |
9 | */ |
10 | /* You may define the following macro, which affects generated code: |
11 | SSE_ALIGNED |
12 | */ |
13 | |
/*
 * Load/store primitives used by the macros below.
 *
 * The unaligned intrinsics are selected whether or not SSE_ALIGNED is
 * defined: unaligned instructions seemed to be faster even on aligned data,
 * so the aligned pair (_mm_load_si128 / _mm_stream_si128) is deliberately
 * left unused.
 */
#define LOAD_SI128 _mm_loadu_si128
#define SAVE_SI128 _mm_storeu_si128
26 | |
/*
 * UV2RGB_16: chroma contribution to R, G and B for 16 pixels.
 *
 * U and V each hold eight 16-bit chroma values. Each product is duplicated
 * into two adjacent 16-bit lanes (unpack of a register with itself), so one
 * chroma value is shared by two neighbouring pixels: lanes 0-3 end up in
 * R1/G1/B1 and lanes 4-7 in R2/G2/B2.
 *
 * The expansion scope must provide `param` and the __m128i scratch
 * variables r_tmp, g_tmp and b_tmp.
 */
#define UV2RGB_16(U,V,R1,G1,B1,R2,G2,B2) \
    r_tmp = _mm_mullo_epi16(V, _mm_set1_epi16(param->v_r_factor)); \
    b_tmp = _mm_mullo_epi16(U, _mm_set1_epi16(param->u_b_factor)); \
    g_tmp = _mm_add_epi16( \
        _mm_mullo_epi16(V, _mm_set1_epi16(param->v_g_factor)), \
        _mm_mullo_epi16(U, _mm_set1_epi16(param->u_g_factor))); \
    R1 = _mm_unpacklo_epi16(r_tmp, r_tmp); \
    R2 = _mm_unpackhi_epi16(r_tmp, r_tmp); \
    G1 = _mm_unpacklo_epi16(g_tmp, g_tmp); \
    G2 = _mm_unpackhi_epi16(g_tmp, g_tmp); \
    B1 = _mm_unpacklo_epi16(b_tmp, b_tmp); \
    B2 = _mm_unpackhi_epi16(b_tmp, b_tmp);
39 | |
/*
 * ADD_Y2RGB_16: add the luma term to previously computed chroma terms.
 *
 * Y1/Y2 (eight 16-bit luma values each) are rewritten in place as
 * (Y - param->y_shift) * param->y_factor. Each of R/G/B then becomes
 * (channel + scaled Y) arithmetically shifted right by PRECISION.
 * The results stay 16-bit; nothing is clamped here.
 */
#define ADD_Y2RGB_16(Y1,Y2,R1,G1,B1,R2,G2,B2) \
    /* pixels 0-7 */ \
    Y1 = _mm_mullo_epi16(_mm_sub_epi16(Y1, _mm_set1_epi16(param->y_shift)), _mm_set1_epi16(param->y_factor)); \
    R1 = _mm_srai_epi16(_mm_add_epi16(Y1, R1), PRECISION); \
    G1 = _mm_srai_epi16(_mm_add_epi16(Y1, G1), PRECISION); \
    B1 = _mm_srai_epi16(_mm_add_epi16(Y1, B1), PRECISION); \
    /* pixels 8-15 */ \
    Y2 = _mm_mullo_epi16(_mm_sub_epi16(Y2, _mm_set1_epi16(param->y_shift)), _mm_set1_epi16(param->y_factor)); \
    R2 = _mm_srai_epi16(_mm_add_epi16(Y2, R2), PRECISION); \
    G2 = _mm_srai_epi16(_mm_add_epi16(Y2, G2), PRECISION); \
    B2 = _mm_srai_epi16(_mm_add_epi16(Y2, B2), PRECISION);
50 | |
/*
 * PACK_RGB565_32: pack 32 pixels of 8-bit R/G/B into 16-bit 5:6:5 words.
 *
 * R1/G1/B1 carry 16 pixels and R2/G2/B2 the next 16. Each output vector
 * holds eight pixels: RGB1/RGB2 come from the low/high half of the first
 * set, RGB3/RGB4 from the low/high half of the second set.
 *
 * Per pixel: red is placed in the high byte and masked to its top five bits,
 * green is reduced to six bits and moved to bits 5-10, blue is reduced to
 * five bits in bits 0-4.
 */
#define PACK_RGB565_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4) \
{ \
    const __m128i zero_565 = _mm_setzero_si128(); \
    const __m128i r_keep_565 = _mm_set1_epi16((short)0xF800); \
    __m128i g_bits_565, b_bits_565; \
    \
    /* pixels 0-7 */ \
    RGB1 = _mm_and_si128(_mm_unpacklo_epi8(zero_565, R1), r_keep_565); \
    g_bits_565 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G1, zero_565), 2), 5); \
    b_bits_565 = _mm_srli_epi16(_mm_unpacklo_epi8(B1, zero_565), 3); \
    RGB1 = _mm_or_si128(_mm_or_si128(RGB1, g_bits_565), b_bits_565); \
    \
    /* pixels 8-15 */ \
    RGB2 = _mm_and_si128(_mm_unpackhi_epi8(zero_565, R1), r_keep_565); \
    g_bits_565 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G1, zero_565), 2), 5); \
    b_bits_565 = _mm_srli_epi16(_mm_unpackhi_epi8(B1, zero_565), 3); \
    RGB2 = _mm_or_si128(_mm_or_si128(RGB2, g_bits_565), b_bits_565); \
    \
    /* pixels 16-23 */ \
    RGB3 = _mm_and_si128(_mm_unpacklo_epi8(zero_565, R2), r_keep_565); \
    g_bits_565 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G2, zero_565), 2), 5); \
    b_bits_565 = _mm_srli_epi16(_mm_unpacklo_epi8(B2, zero_565), 3); \
    RGB3 = _mm_or_si128(_mm_or_si128(RGB3, g_bits_565), b_bits_565); \
    \
    /* pixels 24-31 */ \
    RGB4 = _mm_and_si128(_mm_unpackhi_epi8(zero_565, R2), r_keep_565); \
    g_bits_565 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G2, zero_565), 2), 5); \
    b_bits_565 = _mm_srli_epi16(_mm_unpackhi_epi8(B2, zero_565), 3); \
    RGB4 = _mm_or_si128(_mm_or_si128(RGB4, g_bits_565), b_bits_565); \
}
77 | |
/*
 * PACK_RGB24_32_STEP1: one byte-shuffling pass from the six "plane"
 * registers (R1, R2, G1, G2, B1, B2) into the six RGBn registers.
 *
 * For each input pair the low byte of every 16-bit lane is packed into
 * RGB1..RGB3, then the high byte of every 16-bit lane into RGB4..RGB6.
 */
#define PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
    RGB1 = _mm_packus_epi16(_mm_and_si128(R1,_mm_set1_epi16(0xFF)), _mm_and_si128(R2,_mm_set1_epi16(0xFF))); \
    RGB2 = _mm_packus_epi16(_mm_and_si128(G1,_mm_set1_epi16(0xFF)), _mm_and_si128(G2,_mm_set1_epi16(0xFF))); \
    RGB3 = _mm_packus_epi16(_mm_and_si128(B1,_mm_set1_epi16(0xFF)), _mm_and_si128(B2,_mm_set1_epi16(0xFF))); \
    RGB4 = _mm_packus_epi16(_mm_srli_epi16(R1,8), _mm_srli_epi16(R2,8)); \
    RGB5 = _mm_packus_epi16(_mm_srli_epi16(G1,8), _mm_srli_epi16(G2,8)); \
    RGB6 = _mm_packus_epi16(_mm_srli_epi16(B1,8), _mm_srli_epi16(B2,8));

/*
 * PACK_RGB24_32_STEP2: the same pass in the opposite direction. It reads
 * RGB1..RGB6 and writes R1, R2, G1, G2, B1, B2, which is STEP1 with the
 * input and output register groups exchanged.
 */
#define PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
    PACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2)

/*
 * PACK_RGB24_32: turn 32 pixels of separate 8-bit R/G/B registers into six
 * packed output vectors by ping-ponging between the two register groups for
 * five passes. The last pass is a STEP1, so the result is left in
 * RGB1..RGB6; the R/G/B arguments are overwritten along the way.
 */
#define PACK_RGB24_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
    PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
    PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
    PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
    PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
    PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6)
100 | |
/*
 * PACK_RGBA_32: interleave four 8-bit channel planes into 32-bit pixels.
 *
 * The "1" registers carry 16 pixels and produce RGB1..RGB4 (four pixels per
 * vector); the "2" registers carry the next 16 and produce RGB5..RGB8.
 * Within each pixel the bytes are stored in the order A, B, G, R (lowest
 * address first), so callers choose the channel layout by deciding which
 * register they pass in each slot.
 */
#define PACK_RGBA_32(R1, R2, G1, G2, B1, B2, A1, A2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, RGB7, RGB8) \
{ \
    __m128i ab_pairs_32, gr_pairs_32; \
    \
    /* first 16 pixels, low eight */ \
    ab_pairs_32 = _mm_unpacklo_epi8( A1, B1 ); \
    gr_pairs_32 = _mm_unpacklo_epi8( G1, R1 ); \
    RGB1 = _mm_unpacklo_epi16( ab_pairs_32, gr_pairs_32 ); \
    RGB2 = _mm_unpackhi_epi16( ab_pairs_32, gr_pairs_32 ); \
    /* first 16 pixels, high eight */ \
    ab_pairs_32 = _mm_unpackhi_epi8( A1, B1 ); \
    gr_pairs_32 = _mm_unpackhi_epi8( G1, R1 ); \
    RGB3 = _mm_unpacklo_epi16( ab_pairs_32, gr_pairs_32 ); \
    RGB4 = _mm_unpackhi_epi16( ab_pairs_32, gr_pairs_32 ); \
    \
    /* second 16 pixels, low eight */ \
    ab_pairs_32 = _mm_unpacklo_epi8( A2, B2 ); \
    gr_pairs_32 = _mm_unpacklo_epi8( G2, R2 ); \
    RGB5 = _mm_unpacklo_epi16( ab_pairs_32, gr_pairs_32 ); \
    RGB6 = _mm_unpackhi_epi16( ab_pairs_32, gr_pairs_32 ); \
    /* second 16 pixels, high eight */ \
    ab_pairs_32 = _mm_unpackhi_epi8( A2, B2 ); \
    gr_pairs_32 = _mm_unpackhi_epi8( G2, R2 ); \
    RGB7 = _mm_unpacklo_epi16( ab_pairs_32, gr_pairs_32 ); \
    RGB8 = _mm_unpackhi_epi16( ab_pairs_32, gr_pairs_32 ); \
}
123 | |
/*
 * PACK_PIXEL: declare the rgb_N output registers in the enclosing scope and
 * fill them from the 8-bit channel registers produced by YUV2RGB_32.
 *
 * r/g/b_8_11 and _12 are the first and last 16 pixels of line 1; _21 and
 * _22 are the same for line 2. The first half of the rgb_N registers
 * belongs to line 1 and the second half to line 2.
 *
 * The four 32-bit layouts share PACK_RGBA_32 and differ only in which
 * register is passed in which of its R/G/B/A slots; alpha is all 0xFF bytes.
 */
#if RGB_FORMAT == RGB_FORMAT_RGB565

#define PACK_PIXEL \
    __m128i rgb_1, rgb_2, rgb_3, rgb_4; \
    __m128i rgb_5, rgb_6, rgb_7, rgb_8; \
    \
    /* line 1 */ \
    PACK_RGB565_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4) \
    /* line 2 */ \
    PACK_RGB565_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_5, rgb_6, rgb_7, rgb_8)

#elif RGB_FORMAT == RGB_FORMAT_RGB24

#define PACK_PIXEL \
    __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6; \
    __m128i rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12; \
    \
    /* line 1 */ \
    PACK_RGB24_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6) \
    /* line 2 */ \
    PACK_RGB24_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12)

#elif RGB_FORMAT == RGB_FORMAT_RGBA

#define PACK_PIXEL \
    __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
    __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
    __m128i a = _mm_set1_epi8((char)0xFF); \
    \
    /* line 1 */ \
    PACK_RGBA_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, a, a, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
    /* line 2 */ \
    PACK_RGBA_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, a, a, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16)

#elif RGB_FORMAT == RGB_FORMAT_BGRA

#define PACK_PIXEL \
    __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
    __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
    __m128i a = _mm_set1_epi8((char)0xFF); \
    \
    /* line 1 */ \
    PACK_RGBA_32(b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, a, a, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
    /* line 2 */ \
    PACK_RGBA_32(b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, a, a, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16)

#elif RGB_FORMAT == RGB_FORMAT_ARGB

#define PACK_PIXEL \
    __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
    __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
    __m128i a = _mm_set1_epi8((char)0xFF); \
    \
    /* line 1 */ \
    PACK_RGBA_32(a, a, r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
    /* line 2 */ \
    PACK_RGBA_32(a, a, r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16)

#elif RGB_FORMAT == RGB_FORMAT_ABGR

#define PACK_PIXEL \
    __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
    __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
    __m128i a = _mm_set1_epi8((char)0xFF); \
    \
    /* line 1 */ \
    PACK_RGBA_32(a, a, b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
    /* line 2 */ \
    PACK_RGBA_32(a, a, b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16)

#else
#error PACK_PIXEL unimplemented
#endif
190 | |
/*
 * SAVE_LINE1 / SAVE_LINE2: store the packed registers of one output line.
 *
 * Each macro writes consecutive 16-byte vectors starting at rgb_ptr1
 * (line 1) or rgb_ptr2 (line 2): four vectors for RGB565, six for RGB24 and
 * eight for the 32-bit formats. Line 1 uses the first half of the rgb_N
 * registers declared by PACK_PIXEL, line 2 the second half.
 */
#if RGB_FORMAT == RGB_FORMAT_RGB565

#define SAVE_LINE1 \
    SAVE_SI128((__m128i*)rgb_ptr1 + 0, rgb_1); \
    SAVE_SI128((__m128i*)rgb_ptr1 + 1, rgb_2); \
    SAVE_SI128((__m128i*)rgb_ptr1 + 2, rgb_3); \
    SAVE_SI128((__m128i*)rgb_ptr1 + 3, rgb_4);

#define SAVE_LINE2 \
    SAVE_SI128((__m128i*)rgb_ptr2 + 0, rgb_5); \
    SAVE_SI128((__m128i*)rgb_ptr2 + 1, rgb_6); \
    SAVE_SI128((__m128i*)rgb_ptr2 + 2, rgb_7); \
    SAVE_SI128((__m128i*)rgb_ptr2 + 3, rgb_8);

#elif RGB_FORMAT == RGB_FORMAT_RGB24

#define SAVE_LINE1 \
    SAVE_SI128((__m128i*)rgb_ptr1 + 0, rgb_1); \
    SAVE_SI128((__m128i*)rgb_ptr1 + 1, rgb_2); \
    SAVE_SI128((__m128i*)rgb_ptr1 + 2, rgb_3); \
    SAVE_SI128((__m128i*)rgb_ptr1 + 3, rgb_4); \
    SAVE_SI128((__m128i*)rgb_ptr1 + 4, rgb_5); \
    SAVE_SI128((__m128i*)rgb_ptr1 + 5, rgb_6);

#define SAVE_LINE2 \
    SAVE_SI128((__m128i*)rgb_ptr2 + 0, rgb_7); \
    SAVE_SI128((__m128i*)rgb_ptr2 + 1, rgb_8); \
    SAVE_SI128((__m128i*)rgb_ptr2 + 2, rgb_9); \
    SAVE_SI128((__m128i*)rgb_ptr2 + 3, rgb_10); \
    SAVE_SI128((__m128i*)rgb_ptr2 + 4, rgb_11); \
    SAVE_SI128((__m128i*)rgb_ptr2 + 5, rgb_12);

#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
      RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR

#define SAVE_LINE1 \
    SAVE_SI128((__m128i*)rgb_ptr1 + 0, rgb_1); \
    SAVE_SI128((__m128i*)rgb_ptr1 + 1, rgb_2); \
    SAVE_SI128((__m128i*)rgb_ptr1 + 2, rgb_3); \
    SAVE_SI128((__m128i*)rgb_ptr1 + 3, rgb_4); \
    SAVE_SI128((__m128i*)rgb_ptr1 + 4, rgb_5); \
    SAVE_SI128((__m128i*)rgb_ptr1 + 5, rgb_6); \
    SAVE_SI128((__m128i*)rgb_ptr1 + 6, rgb_7); \
    SAVE_SI128((__m128i*)rgb_ptr1 + 7, rgb_8);

#define SAVE_LINE2 \
    SAVE_SI128((__m128i*)rgb_ptr2 + 0, rgb_9); \
    SAVE_SI128((__m128i*)rgb_ptr2 + 1, rgb_10); \
    SAVE_SI128((__m128i*)rgb_ptr2 + 2, rgb_11); \
    SAVE_SI128((__m128i*)rgb_ptr2 + 3, rgb_12); \
    SAVE_SI128((__m128i*)rgb_ptr2 + 4, rgb_13); \
    SAVE_SI128((__m128i*)rgb_ptr2 + 5, rgb_14); \
    SAVE_SI128((__m128i*)rgb_ptr2 + 6, rgb_15); \
    SAVE_SI128((__m128i*)rgb_ptr2 + 7, rgb_16);

#else
#error SAVE_LINE unimplemented
#endif
249 | |
/*
 * READ_Y(y_ptr) / READ_UV: gather 16 contiguous 8-bit samples per plane
 * into the __m128i variables y, u and v of the enclosing scope.
 *
 *  - 4:2:0 : samples are already contiguous; one load each.
 *  - 4:2:2 : luma is every 2nd byte from y_ptr (two loads); chroma is every
 *            4th byte from u_ptr / v_ptr (four loads each).
 *  - NV12  : luma is contiguous; chroma is every 2nd byte from
 *            u_ptr / v_ptr (two loads each).
 *
 * Unwanted bytes are masked off before the lanes are narrowed, so the
 * saturating packs never clamp.
 */
#if YUV_FORMAT == YUV_FORMAT_420

#define READ_Y(y_ptr) \
    y = LOAD_SI128((const __m128i*)(y_ptr));

#define READ_UV \
    u = LOAD_SI128((const __m128i*)(u_ptr)); \
    v = LOAD_SI128((const __m128i*)(v_ptr));

#elif YUV_FORMAT == YUV_FORMAT_422

#define READ_Y(y_ptr) \
{ \
    const __m128i low_byte_16 = _mm_set1_epi16(0x00FF); \
    y = _mm_packus_epi16( \
        _mm_and_si128(LOAD_SI128((const __m128i*)(y_ptr)), low_byte_16), \
        _mm_and_si128(LOAD_SI128((const __m128i*)((y_ptr)+16)), low_byte_16)); \
}

#define READ_UV \
{ \
    const __m128i low_byte_32 = _mm_set1_epi32(0x000000FF); \
    __m128i q0, q1, q2, q3; \
    \
    q0 = _mm_and_si128(LOAD_SI128((const __m128i*)(u_ptr)), low_byte_32); \
    q1 = _mm_and_si128(LOAD_SI128((const __m128i*)(u_ptr+16)), low_byte_32); \
    q2 = _mm_and_si128(LOAD_SI128((const __m128i*)(u_ptr+32)), low_byte_32); \
    q3 = _mm_and_si128(LOAD_SI128((const __m128i*)(u_ptr+48)), low_byte_32); \
    u = _mm_packus_epi16(_mm_packs_epi32(q0, q1), _mm_packs_epi32(q2, q3)); \
    \
    q0 = _mm_and_si128(LOAD_SI128((const __m128i*)(v_ptr)), low_byte_32); \
    q1 = _mm_and_si128(LOAD_SI128((const __m128i*)(v_ptr+16)), low_byte_32); \
    q2 = _mm_and_si128(LOAD_SI128((const __m128i*)(v_ptr+32)), low_byte_32); \
    q3 = _mm_and_si128(LOAD_SI128((const __m128i*)(v_ptr+48)), low_byte_32); \
    v = _mm_packus_epi16(_mm_packs_epi32(q0, q1), _mm_packs_epi32(q2, q3)); \
}

#elif YUV_FORMAT == YUV_FORMAT_NV12

#define READ_Y(y_ptr) \
    y = LOAD_SI128((const __m128i*)(y_ptr));

#define READ_UV \
{ \
    const __m128i low_byte_16 = _mm_set1_epi16(0x00FF); \
    u = _mm_packus_epi16( \
        _mm_and_si128(LOAD_SI128((const __m128i*)(u_ptr)), low_byte_16), \
        _mm_and_si128(LOAD_SI128((const __m128i*)(u_ptr+16)), low_byte_16)); \
    v = _mm_packus_epi16( \
        _mm_and_si128(LOAD_SI128((const __m128i*)(v_ptr)), low_byte_16), \
        _mm_and_si128(LOAD_SI128((const __m128i*)(v_ptr+16)), low_byte_16)); \
}

#else
#error READ_UV unimplemented
#endif
303 | |
/*
 * YUV2RGB_32: convert a block of 32 pixels on two lines to 8-bit R/G/B.
 *
 * Reads 16 chroma samples per plane (READ_UV) and, for each of y_ptr1 and
 * y_ptr2, 2 x 16 luma samples (READ_Y). The chroma terms for each group of
 * 16 pixels are computed once and reused for both lines.
 *
 * All working registers are declared in the enclosing scope. The results
 * are the saturated 8-bit registers r/g/b_8_LH, where L is the line (1 or
 * 2) and H the half of the block (1 = first 16 pixels, 2 = last 16).
 *
 * Requires y_ptr1, y_ptr2, u_ptr, v_ptr, y_pixel_stride and param in scope.
 */
#define YUV2RGB_32 \
    __m128i y, u, v; \
    __m128i u_16, v_16, y_16_1, y_16_2; \
    __m128i r_tmp, g_tmp, b_tmp; \
    __m128i r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2; \
    __m128i r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2; \
    __m128i r_8_11, g_8_11, b_8_11, r_8_21, g_8_21, b_8_21; \
    __m128i r_8_12, g_8_12, b_8_12, r_8_22, g_8_22, b_8_22; \
    \
    READ_UV \
    \
    /* chroma terms for the first 16 pixels: low 8 samples, centred on zero */ \
    u_16 = _mm_add_epi16(_mm_unpacklo_epi8(u, _mm_setzero_si128()), _mm_set1_epi16(-128)); \
    v_16 = _mm_add_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_set1_epi16(-128)); \
    UV2RGB_16(u_16, v_16, r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2) \
    \
    /* first 16 pixels, line 1 */ \
    r_16_1 = r_uv_16_1; g_16_1 = g_uv_16_1; b_16_1 = b_uv_16_1; \
    r_16_2 = r_uv_16_2; g_16_2 = g_uv_16_2; b_16_2 = b_uv_16_2; \
    READ_Y(y_ptr1) \
    y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
    y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
    ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
    r_8_11 = _mm_packus_epi16(r_16_1, r_16_2); \
    g_8_11 = _mm_packus_epi16(g_16_1, g_16_2); \
    b_8_11 = _mm_packus_epi16(b_16_1, b_16_2); \
    \
    /* first 16 pixels, line 2 (same chroma terms) */ \
    r_16_1 = r_uv_16_1; g_16_1 = g_uv_16_1; b_16_1 = b_uv_16_1; \
    r_16_2 = r_uv_16_2; g_16_2 = g_uv_16_2; b_16_2 = b_uv_16_2; \
    READ_Y(y_ptr2) \
    y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
    y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
    ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
    r_8_21 = _mm_packus_epi16(r_16_1, r_16_2); \
    g_8_21 = _mm_packus_epi16(g_16_1, g_16_2); \
    b_8_21 = _mm_packus_epi16(b_16_1, b_16_2); \
    \
    /* chroma terms for the last 16 pixels: high 8 samples, centred on zero */ \
    u_16 = _mm_add_epi16(_mm_unpackhi_epi8(u, _mm_setzero_si128()), _mm_set1_epi16(-128)); \
    v_16 = _mm_add_epi16(_mm_unpackhi_epi8(v, _mm_setzero_si128()), _mm_set1_epi16(-128)); \
    UV2RGB_16(u_16, v_16, r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2) \
    \
    /* last 16 pixels, line 1 */ \
    r_16_1 = r_uv_16_1; g_16_1 = g_uv_16_1; b_16_1 = b_uv_16_1; \
    r_16_2 = r_uv_16_2; g_16_2 = g_uv_16_2; b_16_2 = b_uv_16_2; \
    READ_Y(y_ptr1+16*y_pixel_stride) \
    y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
    y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
    ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
    r_8_12 = _mm_packus_epi16(r_16_1, r_16_2); \
    g_8_12 = _mm_packus_epi16(g_16_1, g_16_2); \
    b_8_12 = _mm_packus_epi16(b_16_1, b_16_2); \
    \
    /* last 16 pixels, line 2 (same chroma terms) */ \
    r_16_1 = r_uv_16_1; g_16_1 = g_uv_16_1; b_16_1 = b_uv_16_1; \
    r_16_2 = r_uv_16_2; g_16_2 = g_uv_16_2; b_16_2 = b_uv_16_2; \
    READ_Y(y_ptr2+16*y_pixel_stride) \
    y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
    y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
    ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
    r_8_22 = _mm_packus_epi16(r_16_1, r_16_2); \
    g_8_22 = _mm_packus_epi16(g_16_1, g_16_2); \
    b_8_22 = _mm_packus_epi16(b_16_1, b_16_2);
383 | |
384 | |
385 | void SSE_FUNCTION_NAME(uint32_t width, uint32_t height, |
386 | const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, |
387 | uint8_t *RGB, uint32_t RGB_stride, |
388 | YCbCrType yuv_type) |
389 | { |
390 | const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]); |
391 | #if YUV_FORMAT == YUV_FORMAT_420 |
392 | const int y_pixel_stride = 1; |
393 | const int uv_pixel_stride = 1; |
394 | const int uv_x_sample_interval = 2; |
395 | const int uv_y_sample_interval = 2; |
396 | #elif YUV_FORMAT == YUV_FORMAT_422 |
397 | const int y_pixel_stride = 2; |
398 | const int uv_pixel_stride = 4; |
399 | const int uv_x_sample_interval = 2; |
400 | const int uv_y_sample_interval = 1; |
401 | #elif YUV_FORMAT == YUV_FORMAT_NV12 |
402 | const int y_pixel_stride = 1; |
403 | const int uv_pixel_stride = 2; |
404 | const int uv_x_sample_interval = 2; |
405 | const int uv_y_sample_interval = 2; |
406 | #endif |
407 | #if RGB_FORMAT == RGB_FORMAT_RGB565 |
408 | const int rgb_pixel_stride = 2; |
409 | #elif RGB_FORMAT == RGB_FORMAT_RGB24 |
410 | const int rgb_pixel_stride = 3; |
411 | #elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \ |
412 | RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR |
413 | const int rgb_pixel_stride = 4; |
414 | #else |
415 | #error Unknown RGB pixel size |
416 | #endif |
417 | |
418 | if (width >= 32) { |
419 | uint32_t xpos, ypos; |
420 | for(ypos=0; ypos<(height-(uv_y_sample_interval-1)); ypos+=uv_y_sample_interval) |
421 | { |
422 | const uint8_t *y_ptr1=Y+ypos*Y_stride, |
423 | *y_ptr2=Y+(ypos+1)*Y_stride, |
424 | *u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride, |
425 | *v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride; |
426 | |
427 | uint8_t *rgb_ptr1=RGB+ypos*RGB_stride, |
428 | *rgb_ptr2=RGB+(ypos+1)*RGB_stride; |
429 | |
430 | for(xpos=0; xpos<(width-31); xpos+=32) |
431 | { |
432 | YUV2RGB_32 |
433 | { |
434 | PACK_PIXEL |
435 | SAVE_LINE1 |
436 | if (uv_y_sample_interval > 1) |
437 | { |
438 | SAVE_LINE2 |
439 | } |
440 | } |
441 | |
442 | y_ptr1+=32*y_pixel_stride; |
443 | y_ptr2+=32*y_pixel_stride; |
444 | u_ptr+=32*uv_pixel_stride/uv_x_sample_interval; |
445 | v_ptr+=32*uv_pixel_stride/uv_x_sample_interval; |
446 | rgb_ptr1+=32*rgb_pixel_stride; |
447 | rgb_ptr2+=32*rgb_pixel_stride; |
448 | } |
449 | } |
450 | |
451 | /* Catch the last line, if needed */ |
452 | if (uv_y_sample_interval == 2 && ypos == (height-1)) |
453 | { |
454 | const uint8_t *y_ptr=Y+ypos*Y_stride, |
455 | *u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride, |
456 | *v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride; |
457 | |
458 | uint8_t *rgb_ptr=RGB+ypos*RGB_stride; |
459 | |
460 | STD_FUNCTION_NAME(width, 1, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type); |
461 | } |
462 | } |
463 | |
464 | /* Catch the right column, if needed */ |
465 | { |
466 | int converted = (width & ~31); |
467 | if (converted != width) |
468 | { |
469 | const uint8_t *y_ptr=Y+converted*y_pixel_stride, |
470 | *u_ptr=U+converted*uv_pixel_stride/uv_x_sample_interval, |
471 | *v_ptr=V+converted*uv_pixel_stride/uv_x_sample_interval; |
472 | |
473 | uint8_t *rgb_ptr=RGB+converted*rgb_pixel_stride; |
474 | |
475 | STD_FUNCTION_NAME(width-converted, height, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type); |
476 | } |
477 | } |
478 | } |
479 | |
/*
 * Drop every macro this template consumed or defined, so the file can be
 * included again with a different function name / format combination.
 */

/* configuration supplied by the including file */
#undef SSE_FUNCTION_NAME
#undef STD_FUNCTION_NAME
#undef YUV_FORMAT
#undef RGB_FORMAT
#undef SSE_ALIGNED

/* helpers defined above */
#undef LOAD_SI128
#undef SAVE_SI128
#undef UV2RGB_16
#undef ADD_Y2RGB_16
#undef PACK_RGB565_32
#undef PACK_RGB24_32_STEP1
#undef PACK_RGB24_32_STEP2
#undef PACK_RGB24_32
#undef PACK_RGBA_32
#undef PACK_PIXEL
#undef SAVE_LINE1
#undef SAVE_LINE2
#undef READ_Y
#undef READ_UV
#undef YUV2RGB_32
499 | |