1// Copyright 2016 Adrien Descamps
2// Distributed under BSD 3-Clause License
3
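/* You need to define the following macros before including this file:
	SSE_FUNCTION_NAME	name of the SSE conversion function generated by this file
	STD_FUNCTION_NAME	name of the fallback function called for the rows and
				columns that the 32-pixel SSE loop does not cover
	YUV_FORMAT		one of YUV_FORMAT_420, YUV_FORMAT_422, YUV_FORMAT_NV12
	RGB_FORMAT		one of RGB_FORMAT_RGB565, RGB_FORMAT_RGB24, RGB_FORMAT_RGBA,
				RGB_FORMAT_BGRA, RGB_FORMAT_ARGB, RGB_FORMAT_ABGR
   All four are #undef'd at the end of this file, so they have to be defined
   again before every inclusion.
   This file also uses, without defining them: PRECISION, YCbCrType,
   YUV2RGBParam and the YUV2RGB parameter table.
*/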
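/* You may define the following macro:
	SSE_ALIGNED
   It is accepted (and #undef'd at the end of this file), but LOAD_SI128 and
   SAVE_SI128 below currently map to the unaligned intrinsics whether or not
   it is defined, so it does not change the generated code.
*/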
13
/*
 * 128-bit load/store primitives used by the READ_* and SAVE_LINE* macros.
 *
 * Defining SSE_ALIGNED makes no difference here: unaligned instructions
 * seemed faster even on aligned data, so the unaligned intrinsics are used
 * in both configurations.  The aligned alternative would be
 * _mm_load_si128 / _mm_stream_si128.
 */
#define LOAD_SI128 _mm_loadu_si128
#define SAVE_SI128 _mm_storeu_si128
26
/*
 * UV2RGB_16: chroma-only part of the conversion for one register of U and V.
 *
 * U and V are treated as eight 16-bit lanes.  Each lane is multiplied by the
 * matching factor from `param` (only the low 16 bits of each product are
 * kept), and every product is then written to two adjacent output lanes:
 * R1/G1/B1 receive the four low products, R2/G2/B2 the four high ones.
 *
 * Expects `param` and the scratch registers r_tmp, g_tmp and b_tmp to be
 * declared where the macro is expanded.
 */
#define UV2RGB_16(U,V,R1,G1,B1,R2,G2,B2) \
	r_tmp = _mm_mullo_epi16((V), _mm_set1_epi16(param->v_r_factor)); \
	g_tmp = _mm_add_epi16( \
		_mm_mullo_epi16((U), _mm_set1_epi16(param->u_g_factor)), \
		_mm_mullo_epi16((V), _mm_set1_epi16(param->v_g_factor))); \
	b_tmp = _mm_mullo_epi16((U), _mm_set1_epi16(param->u_b_factor)); \
	/* unpacking a register with itself doubles every lane */ \
	R1 = _mm_unpacklo_epi16(r_tmp, r_tmp); \
	R2 = _mm_unpackhi_epi16(r_tmp, r_tmp); \
	G1 = _mm_unpacklo_epi16(g_tmp, g_tmp); \
	G2 = _mm_unpackhi_epi16(g_tmp, g_tmp); \
	B1 = _mm_unpacklo_epi16(b_tmp, b_tmp); \
	B2 = _mm_unpackhi_epi16(b_tmp, b_tmp);
39
/*
 * ADD_Y2RGB_16: add the luma term to six registers of chroma terms.
 *
 * Y1 and Y2 are overwritten with (Y - param->y_shift) * param->y_factor
 * (low 16 bits of the product).  Each colour register then becomes
 * (colour + scaled Y) >> PRECISION, using an arithmetic shift.
 * Registers ending in 1 are combined with Y1, those ending in 2 with Y2.
 *
 * Expects `param` and PRECISION to be available where the macro is expanded.
 */
#define ADD_Y2RGB_16(Y1,Y2,R1,G1,B1,R2,G2,B2) \
	/* first register of luma */ \
	Y1 = _mm_sub_epi16(Y1, _mm_set1_epi16(param->y_shift)); \
	Y1 = _mm_mullo_epi16(Y1, _mm_set1_epi16(param->y_factor)); \
	R1 = _mm_srai_epi16(_mm_add_epi16(R1, Y1), PRECISION); \
	G1 = _mm_srai_epi16(_mm_add_epi16(G1, Y1), PRECISION); \
	B1 = _mm_srai_epi16(_mm_add_epi16(B1, Y1), PRECISION); \
	/* second register of luma */ \
	Y2 = _mm_sub_epi16(Y2, _mm_set1_epi16(param->y_shift)); \
	Y2 = _mm_mullo_epi16(Y2, _mm_set1_epi16(param->y_factor)); \
	R2 = _mm_srai_epi16(_mm_add_epi16(R2, Y2), PRECISION); \
	G2 = _mm_srai_epi16(_mm_add_epi16(G2, Y2), PRECISION); \
	B2 = _mm_srai_epi16(_mm_add_epi16(B2, Y2), PRECISION);
50
/*
 * PACK_RGB565_32: pack 32 pixels of 8-bit R, G, B into 16-bit 5:6:5 values.
 *
 * R1/G1/B1 hold pixels 0-15 and R2/G2/B2 pixels 16-31, one byte per pixel.
 * Each output register receives eight 16-bit pixels:
 *   RGB1 <- low half of the *1 inputs,  RGB2 <- high half of the *1 inputs,
 *   RGB3 <- low half of the *2 inputs,  RGB4 <- high half of the *2 inputs.
 * Per pixel: the red byte is placed in the high byte and masked to its top
 * five bits (0xF800), green is reduced to six bits and moved to bits 5-10,
 * blue is reduced to five bits in bits 0-4.
 */
#define PACK_RGB565_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4) \
{ \
	const __m128i zero_565 = _mm_setzero_si128(); \
	const __m128i red_field = _mm_set1_epi16((short)0xF800); \
	__m128i green_field, blue_field; \
\
	green_field = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G1, zero_565), 2), 5); \
	blue_field = _mm_srli_epi16(_mm_unpacklo_epi8(B1, zero_565), 3); \
	RGB1 = _mm_and_si128(_mm_unpacklo_epi8(zero_565, R1), red_field); \
	RGB1 = _mm_or_si128(_mm_or_si128(RGB1, green_field), blue_field); \
\
	green_field = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G1, zero_565), 2), 5); \
	blue_field = _mm_srli_epi16(_mm_unpackhi_epi8(B1, zero_565), 3); \
	RGB2 = _mm_and_si128(_mm_unpackhi_epi8(zero_565, R1), red_field); \
	RGB2 = _mm_or_si128(_mm_or_si128(RGB2, green_field), blue_field); \
\
	green_field = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G2, zero_565), 2), 5); \
	blue_field = _mm_srli_epi16(_mm_unpacklo_epi8(B2, zero_565), 3); \
	RGB3 = _mm_and_si128(_mm_unpacklo_epi8(zero_565, R2), red_field); \
	RGB3 = _mm_or_si128(_mm_or_si128(RGB3, green_field), blue_field); \
\
	green_field = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G2, zero_565), 2), 5); \
	blue_field = _mm_srli_epi16(_mm_unpackhi_epi8(B2, zero_565), 3); \
	RGB4 = _mm_and_si128(_mm_unpackhi_epi8(zero_565, R2), red_field); \
	RGB4 = _mm_or_si128(_mm_or_si128(RGB4, green_field), blue_field); \
}
77
/*
 * PACK_RGB24_32_STEP1: one byte de-interleaving pass over six registers.
 *
 * The six inputs are taken as three pairs (R1,R2), (G1,G2), (B1,B2).  For
 * every pair, the even-numbered bytes of both registers are packed into one
 * output (RGB1..RGB3) and the odd-numbered bytes into another (RGB4..RGB6).
 * The inputs are left untouched.
 */
#define PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
{ \
	const __m128i even_bytes_24 = _mm_set1_epi16(0xFF); \
\
	RGB1 = _mm_packus_epi16(_mm_and_si128(R1, even_bytes_24), _mm_and_si128(R2, even_bytes_24)); \
	RGB2 = _mm_packus_epi16(_mm_and_si128(G1, even_bytes_24), _mm_and_si128(G2, even_bytes_24)); \
	RGB3 = _mm_packus_epi16(_mm_and_si128(B1, even_bytes_24), _mm_and_si128(B2, even_bytes_24)); \
	RGB4 = _mm_packus_epi16(_mm_srli_epi16(R1, 8), _mm_srli_epi16(R2, 8)); \
	RGB5 = _mm_packus_epi16(_mm_srli_epi16(G1, 8), _mm_srli_epi16(G2, 8)); \
	RGB6 = _mm_packus_epi16(_mm_srli_epi16(B1, 8), _mm_srli_epi16(B2, 8)); \
}

/*
 * PACK_RGB24_32_STEP2: the same pass in the opposite direction.  It reads
 * RGB1..RGB6 (paired as (RGB1,RGB2), (RGB3,RGB4), (RGB5,RGB6)) and writes
 * R1, R2, G1, G2, B1, B2, which is exactly STEP1 with the input and output
 * argument groups exchanged.
 */
#define PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
	PACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2)

/*
 * PACK_RGB24_32: five alternating passes (1, 2, 1, 2, 1).  The final result
 * is left in RGB1..RGB6; the R/G/B arguments are used as scratch space and
 * are overwritten along the way.
 */
#define PACK_RGB24_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
	PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
	PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
	PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
	PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
	PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6)
100
/*
 * PACK_RGBA_32: interleave four byte planes into 32 four-byte pixels.
 *
 * The *1 arguments hold pixels 0-15 and the *2 arguments pixels 16-31, one
 * byte per pixel.  Every pixel is written as the four bytes A, B, G, R in
 * increasing address order, i.e. the first argument pair ends up in the last
 * byte of the pixel.  RGB1..RGB4 receive pixels 0-15 (four pixels each),
 * RGB5..RGB8 pixels 16-31.
 */
#define PACK_RGBA_32(R1, R2, G1, G2, B1, B2, A1, A2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, RGB7, RGB8) \
{ \
	__m128i ab_low, ab_high, gr_low, gr_high; \
\
	/* pixels 0-15: byte pairs first, then pairs of pairs */ \
	gr_low  = _mm_unpacklo_epi8( G1, R1 ); \
	gr_high = _mm_unpackhi_epi8( G1, R1 ); \
	ab_low  = _mm_unpacklo_epi8( A1, B1 ); \
	ab_high = _mm_unpackhi_epi8( A1, B1 ); \
	RGB1 = _mm_unpacklo_epi16( ab_low,  gr_low ); \
	RGB2 = _mm_unpackhi_epi16( ab_low,  gr_low ); \
	RGB3 = _mm_unpacklo_epi16( ab_high, gr_high ); \
	RGB4 = _mm_unpackhi_epi16( ab_high, gr_high ); \
\
	/* pixels 16-31 */ \
	gr_low  = _mm_unpacklo_epi8( G2, R2 ); \
	gr_high = _mm_unpackhi_epi8( G2, R2 ); \
	ab_low  = _mm_unpacklo_epi8( A2, B2 ); \
	ab_high = _mm_unpackhi_epi8( A2, B2 ); \
	RGB5 = _mm_unpacklo_epi16( ab_low,  gr_low ); \
	RGB6 = _mm_unpackhi_epi16( ab_low,  gr_low ); \
	RGB7 = _mm_unpacklo_epi16( ab_high, gr_high ); \
	RGB8 = _mm_unpackhi_epi16( ab_high, gr_high ); \
}
123
/*
 * PACK_PIXEL: declare the output registers rgb_N and fill them from the
 * 8-bit planes r_8_LH / g_8_LH / b_8_LH (L = line 1 or 2, H = half 1 or 2 of
 * the 32-pixel tile).  The first packing call handles line 1, the second
 * line 2.  It starts with declarations, so it must be expanded at the start
 * of a block.
 *
 * For the four-byte formats the format name lists the channels in the order
 * in which they are passed to PACK_RGBA_32, and alpha is a constant 0xFF.
 */
#if RGB_FORMAT == RGB_FORMAT_RGB565

#define PACK_PIXEL \
	__m128i rgb_1, rgb_2, rgb_3, rgb_4; /* line 1 */ \
	__m128i rgb_5, rgb_6, rgb_7, rgb_8; /* line 2 */ \
	\
	PACK_RGB565_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, \
		rgb_1, rgb_2, rgb_3, rgb_4) \
	PACK_RGB565_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, \
		rgb_5, rgb_6, rgb_7, rgb_8)

#elif RGB_FORMAT == RGB_FORMAT_RGB24

#define PACK_PIXEL \
	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6; /* line 1 */ \
	__m128i rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12; /* line 2 */ \
	\
	PACK_RGB24_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, \
		rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6) \
	PACK_RGB24_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, \
		rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12)

#elif RGB_FORMAT == RGB_FORMAT_RGBA

#define PACK_PIXEL \
	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; /* line 1 */ \
	__m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; /* line 2 */ \
	const __m128i opaque_alpha = _mm_set1_epi8((char)0xFF); \
	\
	PACK_RGBA_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, opaque_alpha, opaque_alpha, \
		rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
	PACK_RGBA_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, opaque_alpha, opaque_alpha, \
		rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16)

#elif RGB_FORMAT == RGB_FORMAT_BGRA

#define PACK_PIXEL \
	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; /* line 1 */ \
	__m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; /* line 2 */ \
	const __m128i opaque_alpha = _mm_set1_epi8((char)0xFF); \
	\
	PACK_RGBA_32(b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, opaque_alpha, opaque_alpha, \
		rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
	PACK_RGBA_32(b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, opaque_alpha, opaque_alpha, \
		rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16)

#elif RGB_FORMAT == RGB_FORMAT_ARGB

#define PACK_PIXEL \
	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; /* line 1 */ \
	__m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; /* line 2 */ \
	const __m128i opaque_alpha = _mm_set1_epi8((char)0xFF); \
	\
	PACK_RGBA_32(opaque_alpha, opaque_alpha, r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, \
		rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
	PACK_RGBA_32(opaque_alpha, opaque_alpha, r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, \
		rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16)

#elif RGB_FORMAT == RGB_FORMAT_ABGR

#define PACK_PIXEL \
	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; /* line 1 */ \
	__m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; /* line 2 */ \
	const __m128i opaque_alpha = _mm_set1_epi8((char)0xFF); \
	\
	PACK_RGBA_32(opaque_alpha, opaque_alpha, b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, \
		rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
	PACK_RGBA_32(opaque_alpha, opaque_alpha, b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, \
		rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16)

#else
#error PACK_PIXEL unimplemented
#endif
190
/*
 * SAVE_LINE1 / SAVE_LINE2: store the registers filled by PACK_PIXEL.
 * SAVE_LINE1 writes consecutive 16-byte registers starting at rgb_ptr1,
 * SAVE_LINE2 does the same at rgb_ptr2 with the second half of the rgb_N
 * registers.  The number of registers per line is 32 pixels times the
 * pixel size, divided by 16 bytes.
 */
#if RGB_FORMAT == RGB_FORMAT_RGB565

/* 2 bytes per pixel: 4 registers per line */
#define SAVE_LINE1 \
	SAVE_SI128((__m128i*)(rgb_ptr1) + 0, rgb_1); \
	SAVE_SI128((__m128i*)(rgb_ptr1) + 1, rgb_2); \
	SAVE_SI128((__m128i*)(rgb_ptr1) + 2, rgb_3); \
	SAVE_SI128((__m128i*)(rgb_ptr1) + 3, rgb_4);

#define SAVE_LINE2 \
	SAVE_SI128((__m128i*)(rgb_ptr2) + 0, rgb_5); \
	SAVE_SI128((__m128i*)(rgb_ptr2) + 1, rgb_6); \
	SAVE_SI128((__m128i*)(rgb_ptr2) + 2, rgb_7); \
	SAVE_SI128((__m128i*)(rgb_ptr2) + 3, rgb_8);

#elif RGB_FORMAT == RGB_FORMAT_RGB24

/* 3 bytes per pixel: 6 registers per line */
#define SAVE_LINE1 \
	SAVE_SI128((__m128i*)(rgb_ptr1) + 0, rgb_1); \
	SAVE_SI128((__m128i*)(rgb_ptr1) + 1, rgb_2); \
	SAVE_SI128((__m128i*)(rgb_ptr1) + 2, rgb_3); \
	SAVE_SI128((__m128i*)(rgb_ptr1) + 3, rgb_4); \
	SAVE_SI128((__m128i*)(rgb_ptr1) + 4, rgb_5); \
	SAVE_SI128((__m128i*)(rgb_ptr1) + 5, rgb_6);

#define SAVE_LINE2 \
	SAVE_SI128((__m128i*)(rgb_ptr2) + 0, rgb_7); \
	SAVE_SI128((__m128i*)(rgb_ptr2) + 1, rgb_8); \
	SAVE_SI128((__m128i*)(rgb_ptr2) + 2, rgb_9); \
	SAVE_SI128((__m128i*)(rgb_ptr2) + 3, rgb_10); \
	SAVE_SI128((__m128i*)(rgb_ptr2) + 4, rgb_11); \
	SAVE_SI128((__m128i*)(rgb_ptr2) + 5, rgb_12);

#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
	RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR

/* 4 bytes per pixel: 8 registers per line */
#define SAVE_LINE1 \
	SAVE_SI128((__m128i*)(rgb_ptr1) + 0, rgb_1); \
	SAVE_SI128((__m128i*)(rgb_ptr1) + 1, rgb_2); \
	SAVE_SI128((__m128i*)(rgb_ptr1) + 2, rgb_3); \
	SAVE_SI128((__m128i*)(rgb_ptr1) + 3, rgb_4); \
	SAVE_SI128((__m128i*)(rgb_ptr1) + 4, rgb_5); \
	SAVE_SI128((__m128i*)(rgb_ptr1) + 5, rgb_6); \
	SAVE_SI128((__m128i*)(rgb_ptr1) + 6, rgb_7); \
	SAVE_SI128((__m128i*)(rgb_ptr1) + 7, rgb_8);

#define SAVE_LINE2 \
	SAVE_SI128((__m128i*)(rgb_ptr2) + 0, rgb_9); \
	SAVE_SI128((__m128i*)(rgb_ptr2) + 1, rgb_10); \
	SAVE_SI128((__m128i*)(rgb_ptr2) + 2, rgb_11); \
	SAVE_SI128((__m128i*)(rgb_ptr2) + 3, rgb_12); \
	SAVE_SI128((__m128i*)(rgb_ptr2) + 4, rgb_13); \
	SAVE_SI128((__m128i*)(rgb_ptr2) + 5, rgb_14); \
	SAVE_SI128((__m128i*)(rgb_ptr2) + 6, rgb_15); \
	SAVE_SI128((__m128i*)(rgb_ptr2) + 7, rgb_16);

#else
#error SAVE_LINE unimplemented
#endif
249
/*
 * READ_Y(y_ptr) fills the register `y` with 16 luma bytes; READ_UV fills
 * `u` and `v` with 16 chroma bytes each, read from u_ptr and v_ptr.
 * `y`, `u`, `v`, `u_ptr` and `v_ptr` must exist where the macros are
 * expanded.  How far apart the wanted bytes are depends on YUV_FORMAT.
 */
#if YUV_FORMAT == YUV_FORMAT_420

/* samples are contiguous: a single 16-byte load each */
#define READ_Y(y_ptr) \
	y = LOAD_SI128((const __m128i*)(y_ptr));

#define READ_UV \
	u = LOAD_SI128((const __m128i*)(u_ptr)); \
	v = LOAD_SI128((const __m128i*)(v_ptr));

#elif YUV_FORMAT == YUV_FORMAT_422

/* every second byte is a luma sample: keep the low byte of each 16-bit
   lane of two registers, then narrow them into one */
#define READ_Y(y_ptr) \
{ \
	const __m128i low_byte_of_16 = _mm_set1_epi16(0x00FF); \
	const __m128i luma_first = _mm_and_si128(LOAD_SI128((const __m128i*)(y_ptr) + 0), low_byte_of_16); \
	const __m128i luma_second = _mm_and_si128(LOAD_SI128((const __m128i*)(y_ptr) + 1), low_byte_of_16); \
	y = _mm_packus_epi16(luma_first, luma_second); \
}

/* every fourth byte belongs to the same chroma plane: keep the low byte of
   each 32-bit lane of four registers, then narrow twice */
#define READ_UV \
{ \
	const __m128i low_byte_of_32 = _mm_set1_epi32(0xFF); \
	const __m128i cb_a = _mm_and_si128(LOAD_SI128((const __m128i*)(u_ptr) + 0), low_byte_of_32); \
	const __m128i cb_b = _mm_and_si128(LOAD_SI128((const __m128i*)(u_ptr) + 1), low_byte_of_32); \
	const __m128i cb_c = _mm_and_si128(LOAD_SI128((const __m128i*)(u_ptr) + 2), low_byte_of_32); \
	const __m128i cb_d = _mm_and_si128(LOAD_SI128((const __m128i*)(u_ptr) + 3), low_byte_of_32); \
	const __m128i cr_a = _mm_and_si128(LOAD_SI128((const __m128i*)(v_ptr) + 0), low_byte_of_32); \
	const __m128i cr_b = _mm_and_si128(LOAD_SI128((const __m128i*)(v_ptr) + 1), low_byte_of_32); \
	const __m128i cr_c = _mm_and_si128(LOAD_SI128((const __m128i*)(v_ptr) + 2), low_byte_of_32); \
	const __m128i cr_d = _mm_and_si128(LOAD_SI128((const __m128i*)(v_ptr) + 3), low_byte_of_32); \
	u = _mm_packus_epi16(_mm_packs_epi32(cb_a, cb_b), _mm_packs_epi32(cb_c, cb_d)); \
	v = _mm_packus_epi16(_mm_packs_epi32(cr_a, cr_b), _mm_packs_epi32(cr_c, cr_d)); \
}

#elif YUV_FORMAT == YUV_FORMAT_NV12

/* luma is contiguous */
#define READ_Y(y_ptr) \
	y = LOAD_SI128((const __m128i*)(y_ptr));

/* every second byte belongs to the same chroma plane: keep the low byte of
   each 16-bit lane of two registers, then narrow them into one */
#define READ_UV \
{ \
	const __m128i low_byte_of_16 = _mm_set1_epi16(0x00FF); \
	const __m128i cb_a = _mm_and_si128(LOAD_SI128((const __m128i*)(u_ptr) + 0), low_byte_of_16); \
	const __m128i cb_b = _mm_and_si128(LOAD_SI128((const __m128i*)(u_ptr) + 1), low_byte_of_16); \
	const __m128i cr_a = _mm_and_si128(LOAD_SI128((const __m128i*)(v_ptr) + 0), low_byte_of_16); \
	const __m128i cr_b = _mm_and_si128(LOAD_SI128((const __m128i*)(v_ptr) + 1), low_byte_of_16); \
	u = _mm_packus_epi16(cb_a, cb_b); \
	v = _mm_packus_epi16(cr_a, cr_b); \
}

#else
#error READ_UV unimplemented
#endif
303
/*
 * YUV2RGB_32: convert a tile of 32 pixels on two lines to 8-bit R, G and B
 * planes.
 *
 * Reads chroma once with READ_UV and luma four times with READ_Y (y_ptr1 and
 * y_ptr2, each at offset 0 and at 16*y_pixel_stride).  The chroma terms of
 * each half are computed once and reused for both lines.  Results are left
 * in r_8_LH / g_8_LH / b_8_LH, where L is the line (1 or 2) and H the half
 * of the tile (1 = first 16 pixels, 2 = last 16 pixels), saturated to
 * unsigned bytes.
 *
 * The macro begins with declarations, so it must be expanded at the start of
 * a block in which param, y_ptr1, y_ptr2, u_ptr, v_ptr and y_pixel_stride
 * are visible.
 */
#define YUV2RGB_32 \
	__m128i r_tmp, g_tmp, b_tmp; \
	__m128i r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2; \
	__m128i r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2; \
	__m128i y_16_1, y_16_2; \
	__m128i y, u, v, u_16, v_16; \
	__m128i r_8_11, g_8_11, b_8_11, r_8_21, g_8_21, b_8_21; \
	__m128i r_8_12, g_8_12, b_8_12, r_8_22, g_8_22, b_8_22; \
	\
	READ_UV \
	\
	/* chroma terms for the first 16 pixels (low 8 chroma samples), kept in r/g/b_uv_16_* */ \
	u_16 = _mm_add_epi16(_mm_unpacklo_epi8(u, _mm_setzero_si128()), _mm_set1_epi16(-128)); \
	v_16 = _mm_add_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_set1_epi16(-128)); \
	UV2RGB_16(u_16, v_16, r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2) \
	\
	/* first line, first 16 pixels */ \
	READ_Y(y_ptr1) \
	y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
	y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
	r_16_1 = r_uv_16_1; g_16_1 = g_uv_16_1; b_16_1 = b_uv_16_1; \
	r_16_2 = r_uv_16_2; g_16_2 = g_uv_16_2; b_16_2 = b_uv_16_2; \
	ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
	r_8_11 = _mm_packus_epi16(r_16_1, r_16_2); \
	g_8_11 = _mm_packus_epi16(g_16_1, g_16_2); \
	b_8_11 = _mm_packus_epi16(b_16_1, b_16_2); \
	\
	/* second line, first 16 pixels: same chroma terms, other luma row */ \
	READ_Y(y_ptr2) \
	y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
	y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
	r_16_1 = r_uv_16_1; g_16_1 = g_uv_16_1; b_16_1 = b_uv_16_1; \
	r_16_2 = r_uv_16_2; g_16_2 = g_uv_16_2; b_16_2 = b_uv_16_2; \
	ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
	r_8_21 = _mm_packus_epi16(r_16_1, r_16_2); \
	g_8_21 = _mm_packus_epi16(g_16_1, g_16_2); \
	b_8_21 = _mm_packus_epi16(b_16_1, b_16_2); \
	\
	/* chroma terms for the last 16 pixels (high 8 chroma samples) */ \
	u_16 = _mm_add_epi16(_mm_unpackhi_epi8(u, _mm_setzero_si128()), _mm_set1_epi16(-128)); \
	v_16 = _mm_add_epi16(_mm_unpackhi_epi8(v, _mm_setzero_si128()), _mm_set1_epi16(-128)); \
	UV2RGB_16(u_16, v_16, r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2) \
	\
	/* first line, last 16 pixels */ \
	READ_Y(y_ptr1+16*y_pixel_stride) \
	y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
	y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
	r_16_1 = r_uv_16_1; g_16_1 = g_uv_16_1; b_16_1 = b_uv_16_1; \
	r_16_2 = r_uv_16_2; g_16_2 = g_uv_16_2; b_16_2 = b_uv_16_2; \
	ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
	r_8_12 = _mm_packus_epi16(r_16_1, r_16_2); \
	g_8_12 = _mm_packus_epi16(g_16_1, g_16_2); \
	b_8_12 = _mm_packus_epi16(b_16_1, b_16_2); \
	\
	/* second line, last 16 pixels */ \
	READ_Y(y_ptr2+16*y_pixel_stride) \
	y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
	y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
	r_16_1 = r_uv_16_1; g_16_1 = g_uv_16_1; b_16_1 = b_uv_16_1; \
	r_16_2 = r_uv_16_2; g_16_2 = g_uv_16_2; b_16_2 = b_uv_16_2; \
	ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
	r_8_22 = _mm_packus_epi16(r_16_1, r_16_2); \
	g_8_22 = _mm_packus_epi16(g_16_1, g_16_2); \
	b_8_22 = _mm_packus_epi16(b_16_1, b_16_2);
383
384
385void SSE_FUNCTION_NAME(uint32_t width, uint32_t height,
386 const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
387 uint8_t *RGB, uint32_t RGB_stride,
388 YCbCrType yuv_type)
389{
390 const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
391#if YUV_FORMAT == YUV_FORMAT_420
392 const int y_pixel_stride = 1;
393 const int uv_pixel_stride = 1;
394 const int uv_x_sample_interval = 2;
395 const int uv_y_sample_interval = 2;
396#elif YUV_FORMAT == YUV_FORMAT_422
397 const int y_pixel_stride = 2;
398 const int uv_pixel_stride = 4;
399 const int uv_x_sample_interval = 2;
400 const int uv_y_sample_interval = 1;
401#elif YUV_FORMAT == YUV_FORMAT_NV12
402 const int y_pixel_stride = 1;
403 const int uv_pixel_stride = 2;
404 const int uv_x_sample_interval = 2;
405 const int uv_y_sample_interval = 2;
406#endif
407#if RGB_FORMAT == RGB_FORMAT_RGB565
408 const int rgb_pixel_stride = 2;
409#elif RGB_FORMAT == RGB_FORMAT_RGB24
410 const int rgb_pixel_stride = 3;
411#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
412 RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR
413 const int rgb_pixel_stride = 4;
414#else
415#error Unknown RGB pixel size
416#endif
417
418 if (width >= 32) {
419 uint32_t xpos, ypos;
420 for(ypos=0; ypos<(height-(uv_y_sample_interval-1)); ypos+=uv_y_sample_interval)
421 {
422 const uint8_t *y_ptr1=Y+ypos*Y_stride,
423 *y_ptr2=Y+(ypos+1)*Y_stride,
424 *u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride,
425 *v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride;
426
427 uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
428 *rgb_ptr2=RGB+(ypos+1)*RGB_stride;
429
430 for(xpos=0; xpos<(width-31); xpos+=32)
431 {
432 YUV2RGB_32
433 {
434 PACK_PIXEL
435 SAVE_LINE1
436 if (uv_y_sample_interval > 1)
437 {
438 SAVE_LINE2
439 }
440 }
441
442 y_ptr1+=32*y_pixel_stride;
443 y_ptr2+=32*y_pixel_stride;
444 u_ptr+=32*uv_pixel_stride/uv_x_sample_interval;
445 v_ptr+=32*uv_pixel_stride/uv_x_sample_interval;
446 rgb_ptr1+=32*rgb_pixel_stride;
447 rgb_ptr2+=32*rgb_pixel_stride;
448 }
449 }
450
451 /* Catch the last line, if needed */
452 if (uv_y_sample_interval == 2 && ypos == (height-1))
453 {
454 const uint8_t *y_ptr=Y+ypos*Y_stride,
455 *u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride,
456 *v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride;
457
458 uint8_t *rgb_ptr=RGB+ypos*RGB_stride;
459
460 STD_FUNCTION_NAME(width, 1, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
461 }
462 }
463
464 /* Catch the right column, if needed */
465 {
466 int converted = (width & ~31);
467 if (converted != width)
468 {
469 const uint8_t *y_ptr=Y+converted*y_pixel_stride,
470 *u_ptr=U+converted*uv_pixel_stride/uv_x_sample_interval,
471 *v_ptr=V+converted*uv_pixel_stride/uv_x_sample_interval;
472
473 uint8_t *rgb_ptr=RGB+converted*rgb_pixel_stride;
474
475 STD_FUNCTION_NAME(width-converted, height, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
476 }
477 }
478}
479
/* Undefine every macro this file consumed or defined, so that it can be
   included again with another function name / format combination. */
#undef SSE_FUNCTION_NAME
#undef STD_FUNCTION_NAME
#undef YUV_FORMAT
#undef RGB_FORMAT
#undef SSE_ALIGNED
#undef LOAD_SI128
#undef SAVE_SI128
#undef UV2RGB_16
#undef ADD_Y2RGB_16
#undef PACK_RGB565_32
#undef PACK_RGB24_32_STEP1
#undef PACK_RGB24_32_STEP2
#undef PACK_RGB24_32
#undef PACK_RGBA_32
#undef PACK_PIXEL
#undef SAVE_LINE1
#undef SAVE_LINE2
#undef READ_Y
#undef READ_UV
#undef YUV2RGB_32
499