1 | // Copyright 2016 Adrien Descamps |
2 | // Distributed under BSD 3-Clause License |
3 | #include "SDL_internal.h" |
4 | |
5 | #ifdef SDL_HAVE_YUV |
6 | #include "yuv_rgb_internal.h" |
7 | |
8 | #ifdef SDL_SSE2_INTRINSICS |
9 | |
10 | /* SDL doesn't use these aligned (SSE_ALIGNED) variants at the moment, and compiling them adds seconds to the build. --ryan. |
11 | #define SSE_FUNCTION_NAME yuv420_rgb565_sse |
12 | #define STD_FUNCTION_NAME yuv420_rgb565_std |
13 | #define YUV_FORMAT YUV_FORMAT_420 |
14 | #define RGB_FORMAT RGB_FORMAT_RGB565 |
15 | #define SSE_ALIGNED |
16 | #include "yuv_rgb_sse_func.h" |
17 | |
18 | #define SSE_FUNCTION_NAME yuv420_rgb24_sse |
19 | #define STD_FUNCTION_NAME yuv420_rgb24_std |
20 | #define YUV_FORMAT YUV_FORMAT_420 |
21 | #define RGB_FORMAT RGB_FORMAT_RGB24 |
22 | #define SSE_ALIGNED |
23 | #include "yuv_rgb_sse_func.h" |
24 | |
25 | #define SSE_FUNCTION_NAME yuv420_rgba_sse |
26 | #define STD_FUNCTION_NAME yuv420_rgba_std |
27 | #define YUV_FORMAT YUV_FORMAT_420 |
28 | #define RGB_FORMAT RGB_FORMAT_RGBA |
29 | #define SSE_ALIGNED |
30 | #include "yuv_rgb_sse_func.h" |
31 | |
32 | #define SSE_FUNCTION_NAME yuv420_bgra_sse |
33 | #define STD_FUNCTION_NAME yuv420_bgra_std |
34 | #define YUV_FORMAT YUV_FORMAT_420 |
35 | #define RGB_FORMAT RGB_FORMAT_BGRA |
36 | #define SSE_ALIGNED |
37 | #include "yuv_rgb_sse_func.h" |
38 | |
39 | #define SSE_FUNCTION_NAME yuv420_argb_sse |
40 | #define STD_FUNCTION_NAME yuv420_argb_std |
41 | #define YUV_FORMAT YUV_FORMAT_420 |
42 | #define RGB_FORMAT RGB_FORMAT_ARGB |
43 | #define SSE_ALIGNED |
44 | #include "yuv_rgb_sse_func.h" |
45 | |
46 | #define SSE_FUNCTION_NAME yuv420_abgr_sse |
47 | #define STD_FUNCTION_NAME yuv420_abgr_std |
48 | #define YUV_FORMAT YUV_FORMAT_420 |
49 | #define RGB_FORMAT RGB_FORMAT_ABGR |
50 | #define SSE_ALIGNED |
51 | #include "yuv_rgb_sse_func.h" |
52 | |
53 | #define SSE_FUNCTION_NAME yuv422_rgb565_sse |
54 | #define STD_FUNCTION_NAME yuv422_rgb565_std |
55 | #define YUV_FORMAT YUV_FORMAT_422 |
56 | #define RGB_FORMAT RGB_FORMAT_RGB565 |
57 | #define SSE_ALIGNED |
58 | #include "yuv_rgb_sse_func.h" |
59 | |
60 | #define SSE_FUNCTION_NAME yuv422_rgb24_sse |
61 | #define STD_FUNCTION_NAME yuv422_rgb24_std |
62 | #define YUV_FORMAT YUV_FORMAT_422 |
63 | #define RGB_FORMAT RGB_FORMAT_RGB24 |
64 | #define SSE_ALIGNED |
65 | #include "yuv_rgb_sse_func.h" |
66 | |
67 | #define SSE_FUNCTION_NAME yuv422_rgba_sse |
68 | #define STD_FUNCTION_NAME yuv422_rgba_std |
69 | #define YUV_FORMAT YUV_FORMAT_422 |
70 | #define RGB_FORMAT RGB_FORMAT_RGBA |
71 | #define SSE_ALIGNED |
72 | #include "yuv_rgb_sse_func.h" |
73 | |
74 | #define SSE_FUNCTION_NAME yuv422_bgra_sse |
75 | #define STD_FUNCTION_NAME yuv422_bgra_std |
76 | #define YUV_FORMAT YUV_FORMAT_422 |
77 | #define RGB_FORMAT RGB_FORMAT_BGRA |
78 | #define SSE_ALIGNED |
79 | #include "yuv_rgb_sse_func.h" |
80 | |
81 | #define SSE_FUNCTION_NAME yuv422_argb_sse |
82 | #define STD_FUNCTION_NAME yuv422_argb_std |
83 | #define YUV_FORMAT YUV_FORMAT_422 |
84 | #define RGB_FORMAT RGB_FORMAT_ARGB |
85 | #define SSE_ALIGNED |
86 | #include "yuv_rgb_sse_func.h" |
87 | |
88 | #define SSE_FUNCTION_NAME yuv422_abgr_sse |
89 | #define STD_FUNCTION_NAME yuv422_abgr_std |
90 | #define YUV_FORMAT YUV_FORMAT_422 |
91 | #define RGB_FORMAT RGB_FORMAT_ABGR |
92 | #define SSE_ALIGNED |
93 | #include "yuv_rgb_sse_func.h" |
94 | |
95 | #define SSE_FUNCTION_NAME yuvnv12_rgb565_sse |
96 | #define STD_FUNCTION_NAME yuvnv12_rgb565_std |
97 | #define YUV_FORMAT YUV_FORMAT_NV12 |
98 | #define RGB_FORMAT RGB_FORMAT_RGB565 |
99 | #define SSE_ALIGNED |
100 | #include "yuv_rgb_sse_func.h" |
101 | |
102 | #define SSE_FUNCTION_NAME yuvnv12_rgb24_sse |
103 | #define STD_FUNCTION_NAME yuvnv12_rgb24_std |
104 | #define YUV_FORMAT YUV_FORMAT_NV12 |
105 | #define RGB_FORMAT RGB_FORMAT_RGB24 |
106 | #define SSE_ALIGNED |
107 | #include "yuv_rgb_sse_func.h" |
108 | |
109 | #define SSE_FUNCTION_NAME yuvnv12_rgba_sse |
110 | #define STD_FUNCTION_NAME yuvnv12_rgba_std |
111 | #define YUV_FORMAT YUV_FORMAT_NV12 |
112 | #define RGB_FORMAT RGB_FORMAT_RGBA |
113 | #define SSE_ALIGNED |
114 | #include "yuv_rgb_sse_func.h" |
115 | |
116 | #define SSE_FUNCTION_NAME yuvnv12_bgra_sse |
117 | #define STD_FUNCTION_NAME yuvnv12_bgra_std |
118 | #define YUV_FORMAT YUV_FORMAT_NV12 |
119 | #define RGB_FORMAT RGB_FORMAT_BGRA |
120 | #define SSE_ALIGNED |
121 | #include "yuv_rgb_sse_func.h" |
122 | |
123 | #define SSE_FUNCTION_NAME yuvnv12_argb_sse |
124 | #define STD_FUNCTION_NAME yuvnv12_argb_std |
125 | #define YUV_FORMAT YUV_FORMAT_NV12 |
126 | #define RGB_FORMAT RGB_FORMAT_ARGB |
127 | #define SSE_ALIGNED |
128 | #include "yuv_rgb_sse_func.h" |
129 | |
130 | #define SSE_FUNCTION_NAME yuvnv12_abgr_sse |
131 | #define STD_FUNCTION_NAME yuvnv12_abgr_std |
132 | #define YUV_FORMAT YUV_FORMAT_NV12 |
133 | #define RGB_FORMAT RGB_FORMAT_ABGR |
134 | #define SSE_ALIGNED |
135 | #include "yuv_rgb_sse_func.h" |
136 | */ |
137 | |
/*
 * YUV_FORMAT_420 instantiations of the "yuv_rgb_sse_func.h" template, one
 * stanza per RGB_FORMAT (RGB565, RGB24, RGBA, BGRA, ARGB, ABGR).
 *
 * Each stanza sets four parameter macros and then includes the template:
 *   SSE_FUNCTION_NAME - identifier given to the template for the SSE routine
 *   STD_FUNCTION_NAME - identifier of the corresponding _std routine
 *   YUV_FORMAT        - source format selector
 *   RGB_FORMAT        - destination format selector
 *
 * SSE_ALIGNED is not defined for these stanzas, unlike the commented-out
 * _sse stanzas earlier in this file. The _sseu suffix presumably denotes the
 * unaligned load/store flavour - TODO confirm in yuv_rgb_sse_func.h.
 *
 * NOTE(review): the parameter macros are defined again in every stanza with
 * no #undef in this file, so the template header is expected to #undef them
 * before it ends - confirm in yuv_rgb_sse_func.h.
 */
138 | #define SSE_FUNCTION_NAME yuv420_rgb565_sseu |
139 | #define STD_FUNCTION_NAME yuv420_rgb565_std |
140 | #define YUV_FORMAT YUV_FORMAT_420 |
141 | #define RGB_FORMAT RGB_FORMAT_RGB565 |
142 | #include "yuv_rgb_sse_func.h" |
143 | |
144 | #define SSE_FUNCTION_NAME yuv420_rgb24_sseu |
145 | #define STD_FUNCTION_NAME yuv420_rgb24_std |
146 | #define YUV_FORMAT YUV_FORMAT_420 |
147 | #define RGB_FORMAT RGB_FORMAT_RGB24 |
148 | #include "yuv_rgb_sse_func.h" |
149 | |
150 | #define SSE_FUNCTION_NAME yuv420_rgba_sseu |
151 | #define STD_FUNCTION_NAME yuv420_rgba_std |
152 | #define YUV_FORMAT YUV_FORMAT_420 |
153 | #define RGB_FORMAT RGB_FORMAT_RGBA |
154 | #include "yuv_rgb_sse_func.h" |
155 | |
156 | #define SSE_FUNCTION_NAME yuv420_bgra_sseu |
157 | #define STD_FUNCTION_NAME yuv420_bgra_std |
158 | #define YUV_FORMAT YUV_FORMAT_420 |
159 | #define RGB_FORMAT RGB_FORMAT_BGRA |
160 | #include "yuv_rgb_sse_func.h" |
161 | |
162 | #define SSE_FUNCTION_NAME yuv420_argb_sseu |
163 | #define STD_FUNCTION_NAME yuv420_argb_std |
164 | #define YUV_FORMAT YUV_FORMAT_420 |
165 | #define RGB_FORMAT RGB_FORMAT_ARGB |
166 | #include "yuv_rgb_sse_func.h" |
167 | |
168 | #define SSE_FUNCTION_NAME yuv420_abgr_sseu |
169 | #define STD_FUNCTION_NAME yuv420_abgr_std |
170 | #define YUV_FORMAT YUV_FORMAT_420 |
171 | #define RGB_FORMAT RGB_FORMAT_ABGR |
172 | #include "yuv_rgb_sse_func.h" |
173 | |
/*
 * YUV_FORMAT_422 instantiations of the "yuv_rgb_sse_func.h" template, one
 * stanza per RGB_FORMAT (RGB565, RGB24, RGBA, BGRA, ARGB, ABGR).
 *
 * Every stanza defines SSE_FUNCTION_NAME, STD_FUNCTION_NAME, YUV_FORMAT and
 * RGB_FORMAT and then includes the template header. SSE_ALIGNED is left
 * undefined; the _sseu suffix presumably marks the unaligned load/store
 * flavour - TODO confirm in yuv_rgb_sse_func.h.
 *
 * NOTE(review): no #undef appears between stanzas in this file, so the
 * header is expected to #undef its parameter macros - confirm.
 */
174 | #define SSE_FUNCTION_NAME yuv422_rgb565_sseu |
175 | #define STD_FUNCTION_NAME yuv422_rgb565_std |
176 | #define YUV_FORMAT YUV_FORMAT_422 |
177 | #define RGB_FORMAT RGB_FORMAT_RGB565 |
178 | #include "yuv_rgb_sse_func.h" |
179 | |
180 | #define SSE_FUNCTION_NAME yuv422_rgb24_sseu |
181 | #define STD_FUNCTION_NAME yuv422_rgb24_std |
182 | #define YUV_FORMAT YUV_FORMAT_422 |
183 | #define RGB_FORMAT RGB_FORMAT_RGB24 |
184 | #include "yuv_rgb_sse_func.h" |
185 | |
186 | #define SSE_FUNCTION_NAME yuv422_rgba_sseu |
187 | #define STD_FUNCTION_NAME yuv422_rgba_std |
188 | #define YUV_FORMAT YUV_FORMAT_422 |
189 | #define RGB_FORMAT RGB_FORMAT_RGBA |
190 | #include "yuv_rgb_sse_func.h" |
191 | |
192 | #define SSE_FUNCTION_NAME yuv422_bgra_sseu |
193 | #define STD_FUNCTION_NAME yuv422_bgra_std |
194 | #define YUV_FORMAT YUV_FORMAT_422 |
195 | #define RGB_FORMAT RGB_FORMAT_BGRA |
196 | #include "yuv_rgb_sse_func.h" |
197 | |
198 | #define SSE_FUNCTION_NAME yuv422_argb_sseu |
199 | #define STD_FUNCTION_NAME yuv422_argb_std |
200 | #define YUV_FORMAT YUV_FORMAT_422 |
201 | #define RGB_FORMAT RGB_FORMAT_ARGB |
202 | #include "yuv_rgb_sse_func.h" |
203 | |
204 | #define SSE_FUNCTION_NAME yuv422_abgr_sseu |
205 | #define STD_FUNCTION_NAME yuv422_abgr_std |
206 | #define YUV_FORMAT YUV_FORMAT_422 |
207 | #define RGB_FORMAT RGB_FORMAT_ABGR |
208 | #include "yuv_rgb_sse_func.h" |
209 | |
/*
 * YUV_FORMAT_NV12 instantiations of the "yuv_rgb_sse_func.h" template, one
 * stanza per RGB_FORMAT (RGB565, RGB24, RGBA, BGRA, ARGB, ABGR).
 *
 * Every stanza defines SSE_FUNCTION_NAME, STD_FUNCTION_NAME, YUV_FORMAT and
 * RGB_FORMAT and then includes the template header. SSE_ALIGNED is left
 * undefined; the _sseu suffix presumably marks the unaligned load/store
 * flavour - TODO confirm in yuv_rgb_sse_func.h.
 *
 * NOTE(review): no #undef appears between stanzas in this file, so the
 * header is expected to #undef its parameter macros - confirm.
 */
210 | #define SSE_FUNCTION_NAME yuvnv12_rgb565_sseu |
211 | #define STD_FUNCTION_NAME yuvnv12_rgb565_std |
212 | #define YUV_FORMAT YUV_FORMAT_NV12 |
213 | #define RGB_FORMAT RGB_FORMAT_RGB565 |
214 | #include "yuv_rgb_sse_func.h" |
215 | |
216 | #define SSE_FUNCTION_NAME yuvnv12_rgb24_sseu |
217 | #define STD_FUNCTION_NAME yuvnv12_rgb24_std |
218 | #define YUV_FORMAT YUV_FORMAT_NV12 |
219 | #define RGB_FORMAT RGB_FORMAT_RGB24 |
220 | #include "yuv_rgb_sse_func.h" |
221 | |
222 | #define SSE_FUNCTION_NAME yuvnv12_rgba_sseu |
223 | #define STD_FUNCTION_NAME yuvnv12_rgba_std |
224 | #define YUV_FORMAT YUV_FORMAT_NV12 |
225 | #define RGB_FORMAT RGB_FORMAT_RGBA |
226 | #include "yuv_rgb_sse_func.h" |
227 | |
228 | #define SSE_FUNCTION_NAME yuvnv12_bgra_sseu |
229 | #define STD_FUNCTION_NAME yuvnv12_bgra_std |
230 | #define YUV_FORMAT YUV_FORMAT_NV12 |
231 | #define RGB_FORMAT RGB_FORMAT_BGRA |
232 | #include "yuv_rgb_sse_func.h" |
233 | |
234 | #define SSE_FUNCTION_NAME yuvnv12_argb_sseu |
235 | #define STD_FUNCTION_NAME yuvnv12_argb_std |
236 | #define YUV_FORMAT YUV_FORMAT_NV12 |
237 | #define RGB_FORMAT RGB_FORMAT_ARGB |
238 | #include "yuv_rgb_sse_func.h" |
239 | |
240 | #define SSE_FUNCTION_NAME yuvnv12_abgr_sseu |
241 | #define STD_FUNCTION_NAME yuvnv12_abgr_std |
242 | #define YUV_FORMAT YUV_FORMAT_NV12 |
243 | #define RGB_FORMAT RGB_FORMAT_ABGR |
244 | #include "yuv_rgb_sse_func.h" |
245 | |
246 | |
247 | /* SDL doesn't use these RGB-to-YUV helper macros at the moment, and compiling them adds seconds to the build. --ryan. |
248 | #define UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \ |
249 | R1 = _mm_unpacklo_epi8(RGB1, RGB4); \ |
250 | R2 = _mm_unpackhi_epi8(RGB1, RGB4); \ |
251 | G1 = _mm_unpacklo_epi8(RGB2, RGB5); \ |
252 | G2 = _mm_unpackhi_epi8(RGB2, RGB5); \ |
253 | B1 = _mm_unpacklo_epi8(RGB3, RGB6); \ |
254 | B2 = _mm_unpackhi_epi8(RGB3, RGB6); |
255 | |
256 | #define UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \ |
257 | RGB1 = _mm_unpacklo_epi8(R1, G2); \ |
258 | RGB2 = _mm_unpackhi_epi8(R1, G2); \ |
259 | RGB3 = _mm_unpacklo_epi8(R2, B1); \ |
260 | RGB4 = _mm_unpackhi_epi8(R2, B1); \ |
261 | RGB5 = _mm_unpacklo_epi8(G1, B2); \ |
262 | RGB6 = _mm_unpackhi_epi8(G1, B2); \ |
263 | |
264 | #define UNPACK_RGB24_32(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \ |
265 | UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \ |
266 | UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \ |
267 | UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \ |
268 | UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \ |
269 | UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \ |
270 | |
271 | #define RGB2YUV_16(R, G, B, Y, U, V) \ |
272 | Y = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[0][0])), \ |
273 | _mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[0][1]))); \ |
274 | Y = _mm_add_epi16(Y, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[0][2]))); \ |
275 | Y = _mm_add_epi16(Y, _mm_set1_epi16((param->y_shift)<<PRECISION)); \ |
276 | Y = _mm_srai_epi16(Y, PRECISION); \ |
277 | U = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[1][0])), \ |
278 | _mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[1][1]))); \ |
279 | U = _mm_add_epi16(U, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[1][2]))); \ |
280 | U = _mm_add_epi16(U, _mm_set1_epi16(128<<PRECISION)); \ |
281 | U = _mm_srai_epi16(U, PRECISION); \ |
282 | V = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[2][0])), \ |
283 | _mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[2][1]))); \ |
284 | V = _mm_add_epi16(V, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[2][2]))); \ |
285 | V = _mm_add_epi16(V, _mm_set1_epi16(128<<PRECISION)); \ |
286 | V = _mm_srai_epi16(V, PRECISION); |
287 | */ |
288 | |
289 | #if 0 // SDL doesn't use this macro at the moment, and compiling it adds seconds to the build. --ryan. |
/*
 * RGB2YUV_32: statement macro that converts a tile of 32 pixels by 2 rows of
 * packed 24-bit RGB into separate Y, U and V outputs.
 *
 * The macro takes no arguments; the expansion site must provide:
 *   rgb_ptr1, rgb_ptr2     - byte pointers to the two source rows; 96 bytes
 *                            are read from each (six 16-byte loads, offsets
 *                            0 through 80)
 *   y_ptr1, y_ptr2         - luma destinations; 32 bytes are stored to each
 *   u_ptr, v_ptr           - chroma destinations; 16 bytes are stored to each
 *   param                  - pointer whose matrix[][] and y_shift members are
 *                            read inside RGB2YUV_16
 *   LOAD_SI128, SAVE_SI128 - 128-bit load and store primitives
 *   PRECISION              - shift amount used inside RGB2YUV_16
 *
 * It also expands UNPACK_RGB24_32 and RGB2YUV_16, whose definitions currently
 * sit inside the block comment just above this section, so enabling this
 * macro requires enabling those as well.
 *
 * The expansion starts with variable declarations (r1, rgb1, ...), so it can
 * appear at most once in any given scope.
 *
 * Processing order, done twice (pixels 0-15, then pixels 16-31):
 *   1. load three vectors per row and separate the R, G and B bytes;
 *   2. widen the low and high 8 bytes of each channel to 16-bit lanes and run
 *      RGB2YUV_16 on them, then pack the results back to bytes;
 *   3. store the 16 Y bytes of each row;
 *   4. average the U and V bytes of the two rows (_mm_avg_epu8).
 * Finally, neighbouring chroma bytes are averaged in pairs: the high byte of
 * every 16-bit lane is extracted with a right shift by 8, the low byte with a
 * 0xFF mask, and the two packed vectors are averaged before being stored.
 *
 * This section is disabled with a preprocessor conditional rather than a
 * block comment; the macro body contains block comments of its own, which an
 * enclosing block comment could not wrap.
 */
290 | #define RGB2YUV_32 \ |
291 | __m128i r1, r2, b1, b2, g1, g2; \ |
292 | __m128i r_16, g_16, b_16; \ |
293 | __m128i y1_16, y2_16, u1_16, u2_16, v1_16, v2_16, y, u1, u2, v1, v2, u1_tmp, u2_tmp, v1_tmp, v2_tmp; \ |
294 | __m128i rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1)), \ |
295 | rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+16)), \ |
296 | rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+32)), \ |
297 | rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2)), \ |
298 | rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+16)), \ |
299 | rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+32)); \ |
300 | /* unpack rgb24 data to r, g and b data in separate channels*/ \ |
301 | UNPACK_RGB24_32(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, r1, r2, g1, g2, b1, b2) \ |
302 | /* process pixels of first line */ \ |
303 | r_16 = _mm_unpacklo_epi8(r1, _mm_setzero_si128()); \ |
304 | g_16 = _mm_unpacklo_epi8(g1, _mm_setzero_si128()); \ |
305 | b_16 = _mm_unpacklo_epi8(b1, _mm_setzero_si128()); \ |
306 | RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \ |
307 | r_16 = _mm_unpackhi_epi8(r1, _mm_setzero_si128()); \ |
308 | g_16 = _mm_unpackhi_epi8(g1, _mm_setzero_si128()); \ |
309 | b_16 = _mm_unpackhi_epi8(b1, _mm_setzero_si128()); \ |
310 | RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \ |
311 | y = _mm_packus_epi16(y1_16, y2_16); \ |
312 | u1 = _mm_packus_epi16(u1_16, u2_16); \ |
313 | v1 = _mm_packus_epi16(v1_16, v2_16); \ |
314 | /* save Y values */ \ |
315 | SAVE_SI128((__m128i*)(y_ptr1), y); \ |
316 | /* process pixels of second line */ \ |
317 | r_16 = _mm_unpacklo_epi8(r2, _mm_setzero_si128()); \ |
318 | g_16 = _mm_unpacklo_epi8(g2, _mm_setzero_si128()); \ |
319 | b_16 = _mm_unpacklo_epi8(b2, _mm_setzero_si128()); \ |
320 | RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \ |
321 | r_16 = _mm_unpackhi_epi8(r2, _mm_setzero_si128()); \ |
322 | g_16 = _mm_unpackhi_epi8(g2, _mm_setzero_si128()); \ |
323 | b_16 = _mm_unpackhi_epi8(b2, _mm_setzero_si128()); \ |
324 | RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \ |
325 | y = _mm_packus_epi16(y1_16, y2_16); \ |
326 | u2 = _mm_packus_epi16(u1_16, u2_16); \ |
327 | v2 = _mm_packus_epi16(v1_16, v2_16); \ |
328 | /* save Y values */ \ |
329 | SAVE_SI128((__m128i*)(y_ptr2), y); \ |
330 | /* vertical subsampling of u/v values */ \ |
331 | u1_tmp = _mm_avg_epu8(u1, u2); \ |
332 | v1_tmp = _mm_avg_epu8(v1, v2); \ |
333 | /* do the same again with next data */ \ |
334 | rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1+48)); \ |
335 | rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+64)); \ |
336 | rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+80)); \ |
337 | rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2+48)); \ |
338 | rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+64)); \ |
339 | rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+80)); \ |
340 | /* unpack rgb24 data to r, g and b data in separate channels*/ \ |
341 | UNPACK_RGB24_32(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, r1, r2, g1, g2, b1, b2) \ |
342 | /* process pixels of first line */ \ |
343 | r_16 = _mm_unpacklo_epi8(r1, _mm_setzero_si128()); \ |
344 | g_16 = _mm_unpacklo_epi8(g1, _mm_setzero_si128()); \ |
345 | b_16 = _mm_unpacklo_epi8(b1, _mm_setzero_si128()); \ |
346 | RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \ |
347 | r_16 = _mm_unpackhi_epi8(r1, _mm_setzero_si128()); \ |
348 | g_16 = _mm_unpackhi_epi8(g1, _mm_setzero_si128()); \ |
349 | b_16 = _mm_unpackhi_epi8(b1, _mm_setzero_si128()); \ |
350 | RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \ |
351 | y = _mm_packus_epi16(y1_16, y2_16); \ |
352 | u1 = _mm_packus_epi16(u1_16, u2_16); \ |
353 | v1 = _mm_packus_epi16(v1_16, v2_16); \ |
354 | /* save Y values */ \ |
355 | SAVE_SI128((__m128i*)(y_ptr1+16), y); \ |
356 | /* process pixels of second line */ \ |
357 | r_16 = _mm_unpacklo_epi8(r2, _mm_setzero_si128()); \ |
358 | g_16 = _mm_unpacklo_epi8(g2, _mm_setzero_si128()); \ |
359 | b_16 = _mm_unpacklo_epi8(b2, _mm_setzero_si128()); \ |
360 | RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \ |
361 | r_16 = _mm_unpackhi_epi8(r2, _mm_setzero_si128()); \ |
362 | g_16 = _mm_unpackhi_epi8(g2, _mm_setzero_si128()); \ |
363 | b_16 = _mm_unpackhi_epi8(b2, _mm_setzero_si128()); \ |
364 | RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \ |
365 | y = _mm_packus_epi16(y1_16, y2_16); \ |
366 | u2 = _mm_packus_epi16(u1_16, u2_16); \ |
367 | v2 = _mm_packus_epi16(v1_16, v2_16); \ |
368 | /* save Y values */ \ |
369 | SAVE_SI128((__m128i*)(y_ptr2+16), y); \ |
370 | /* vertical subsampling of u/v values */ \ |
371 | u2_tmp = _mm_avg_epu8(u1, u2); \ |
372 | v2_tmp = _mm_avg_epu8(v1, v2); \ |
373 | /* horizontal subsampling of u/v values */ \ |
374 | u1 = _mm_packus_epi16(_mm_srl_epi16(u1_tmp, _mm_cvtsi32_si128(8)), _mm_srl_epi16(u2_tmp, _mm_cvtsi32_si128(8))); \ |
375 | v1 = _mm_packus_epi16(_mm_srl_epi16(v1_tmp, _mm_cvtsi32_si128(8)), _mm_srl_epi16(v2_tmp, _mm_cvtsi32_si128(8))); \ |
376 | u2 = _mm_packus_epi16(_mm_and_si128(u1_tmp, _mm_set1_epi16(0xFF)), _mm_and_si128(u2_tmp, _mm_set1_epi16(0xFF))); \ |
377 | v2 = _mm_packus_epi16(_mm_and_si128(v1_tmp, _mm_set1_epi16(0xFF)), _mm_and_si128(v2_tmp, _mm_set1_epi16(0xFF))); \ |
378 | u1 = _mm_avg_epu8(u1, u2); \ |
379 | v1 = _mm_avg_epu8(v1, v2); \ |
380 | SAVE_SI128((__m128i*)(u_ptr), u1); \ |
381 | SAVE_SI128((__m128i*)(v_ptr), v1); |
382 | #endif |
383 | |
384 | /* SDL doesn't use these RGB24-to-YUV420 converters (rgb24_yuv420_sse / rgb24_yuv420_sseu) at the moment, and compiling them adds seconds to the build. --ryan. |
385 | void SDL_TARGETING("sse2") rgb24_yuv420_sse(uint32_t width, uint32_t height, |
386 | const uint8_t *RGB, uint32_t RGB_stride, |
387 | uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, |
388 | YCbCrType yuv_type) |
389 | { |
390 | #define LOAD_SI128 _mm_load_si128 |
391 | #define SAVE_SI128 _mm_stream_si128 |
392 | const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]); |
393 | |
394 | uint32_t xpos, ypos; |
395 | for(ypos=0; ypos<(height-1); ypos+=2) |
396 | { |
397 | const uint8_t *rgb_ptr1=RGB+ypos*RGB_stride, |
398 | *rgb_ptr2=RGB+(ypos+1)*RGB_stride; |
399 | |
400 | uint8_t *y_ptr1=Y+ypos*Y_stride, |
401 | *y_ptr2=Y+(ypos+1)*Y_stride, |
402 | *u_ptr=U+(ypos/2)*UV_stride, |
403 | *v_ptr=V+(ypos/2)*UV_stride; |
404 | |
405 | for(xpos=0; xpos<(width-31); xpos+=32) |
406 | { |
407 | RGB2YUV_32 |
408 | |
409 | rgb_ptr1+=96; |
410 | rgb_ptr2+=96; |
411 | y_ptr1+=32; |
412 | y_ptr2+=32; |
413 | u_ptr+=16; |
414 | v_ptr+=16; |
415 | } |
416 | } |
417 | #undef LOAD_SI128 |
418 | #undef SAVE_SI128 |
419 | } |
420 | |
421 | void SDL_TARGETING("sse2") rgb24_yuv420_sseu(uint32_t width, uint32_t height, |
422 | const uint8_t *RGB, uint32_t RGB_stride, |
423 | uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, |
424 | YCbCrType yuv_type) |
425 | { |
426 | #define LOAD_SI128 _mm_loadu_si128 |
427 | #define SAVE_SI128 _mm_storeu_si128 |
428 | const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]); |
429 | |
430 | uint32_t xpos, ypos; |
431 | for(ypos=0; ypos<(height-1); ypos+=2) |
432 | { |
433 | const uint8_t *rgb_ptr1=RGB+ypos*RGB_stride, |
434 | *rgb_ptr2=RGB+(ypos+1)*RGB_stride; |
435 | |
436 | uint8_t *y_ptr1=Y+ypos*Y_stride, |
437 | *y_ptr2=Y+(ypos+1)*Y_stride, |
438 | *u_ptr=U+(ypos/2)*UV_stride, |
439 | *v_ptr=V+(ypos/2)*UV_stride; |
440 | |
441 | for(xpos=0; xpos<(width-31); xpos+=32) |
442 | { |
443 | RGB2YUV_32 |
444 | |
445 | rgb_ptr1+=96; |
446 | rgb_ptr2+=96; |
447 | y_ptr1+=32; |
448 | y_ptr2+=32; |
449 | u_ptr+=16; |
450 | v_ptr+=16; |
451 | } |
452 | } |
453 | #undef LOAD_SI128 |
454 | #undef SAVE_SI128 |
455 | } |
456 | */ |
457 | |
458 | #endif // SDL_SSE2_INTRINSICS |
459 | |
460 | #endif // SDL_HAVE_YUV |
461 | |