1// Copyright 2016 Adrien Descamps
2// Distributed under BSD 3-Clause License
3#include "SDL_internal.h"
4
5#ifdef SDL_HAVE_YUV
6#include "yuv_rgb_internal.h"
7
8#ifdef SDL_SSE2_INTRINSICS
9
10/* SDL doesn't use these atm and compiling them adds seconds onto the build. --ryan.
11#define SSE_FUNCTION_NAME yuv420_rgb565_sse
12#define STD_FUNCTION_NAME yuv420_rgb565_std
13#define YUV_FORMAT YUV_FORMAT_420
14#define RGB_FORMAT RGB_FORMAT_RGB565
15#define SSE_ALIGNED
16#include "yuv_rgb_sse_func.h"
17
18#define SSE_FUNCTION_NAME yuv420_rgb24_sse
19#define STD_FUNCTION_NAME yuv420_rgb24_std
20#define YUV_FORMAT YUV_FORMAT_420
21#define RGB_FORMAT RGB_FORMAT_RGB24
22#define SSE_ALIGNED
23#include "yuv_rgb_sse_func.h"
24
25#define SSE_FUNCTION_NAME yuv420_rgba_sse
26#define STD_FUNCTION_NAME yuv420_rgba_std
27#define YUV_FORMAT YUV_FORMAT_420
28#define RGB_FORMAT RGB_FORMAT_RGBA
29#define SSE_ALIGNED
30#include "yuv_rgb_sse_func.h"
31
32#define SSE_FUNCTION_NAME yuv420_bgra_sse
33#define STD_FUNCTION_NAME yuv420_bgra_std
34#define YUV_FORMAT YUV_FORMAT_420
35#define RGB_FORMAT RGB_FORMAT_BGRA
36#define SSE_ALIGNED
37#include "yuv_rgb_sse_func.h"
38
39#define SSE_FUNCTION_NAME yuv420_argb_sse
40#define STD_FUNCTION_NAME yuv420_argb_std
41#define YUV_FORMAT YUV_FORMAT_420
42#define RGB_FORMAT RGB_FORMAT_ARGB
43#define SSE_ALIGNED
44#include "yuv_rgb_sse_func.h"
45
46#define SSE_FUNCTION_NAME yuv420_abgr_sse
47#define STD_FUNCTION_NAME yuv420_abgr_std
48#define YUV_FORMAT YUV_FORMAT_420
49#define RGB_FORMAT RGB_FORMAT_ABGR
50#define SSE_ALIGNED
51#include "yuv_rgb_sse_func.h"
52
53#define SSE_FUNCTION_NAME yuv422_rgb565_sse
54#define STD_FUNCTION_NAME yuv422_rgb565_std
55#define YUV_FORMAT YUV_FORMAT_422
56#define RGB_FORMAT RGB_FORMAT_RGB565
57#define SSE_ALIGNED
58#include "yuv_rgb_sse_func.h"
59
60#define SSE_FUNCTION_NAME yuv422_rgb24_sse
61#define STD_FUNCTION_NAME yuv422_rgb24_std
62#define YUV_FORMAT YUV_FORMAT_422
63#define RGB_FORMAT RGB_FORMAT_RGB24
64#define SSE_ALIGNED
65#include "yuv_rgb_sse_func.h"
66
67#define SSE_FUNCTION_NAME yuv422_rgba_sse
68#define STD_FUNCTION_NAME yuv422_rgba_std
69#define YUV_FORMAT YUV_FORMAT_422
70#define RGB_FORMAT RGB_FORMAT_RGBA
71#define SSE_ALIGNED
72#include "yuv_rgb_sse_func.h"
73
74#define SSE_FUNCTION_NAME yuv422_bgra_sse
75#define STD_FUNCTION_NAME yuv422_bgra_std
76#define YUV_FORMAT YUV_FORMAT_422
77#define RGB_FORMAT RGB_FORMAT_BGRA
78#define SSE_ALIGNED
79#include "yuv_rgb_sse_func.h"
80
81#define SSE_FUNCTION_NAME yuv422_argb_sse
82#define STD_FUNCTION_NAME yuv422_argb_std
83#define YUV_FORMAT YUV_FORMAT_422
84#define RGB_FORMAT RGB_FORMAT_ARGB
85#define SSE_ALIGNED
86#include "yuv_rgb_sse_func.h"
87
88#define SSE_FUNCTION_NAME yuv422_abgr_sse
89#define STD_FUNCTION_NAME yuv422_abgr_std
90#define YUV_FORMAT YUV_FORMAT_422
91#define RGB_FORMAT RGB_FORMAT_ABGR
92#define SSE_ALIGNED
93#include "yuv_rgb_sse_func.h"
94
95#define SSE_FUNCTION_NAME yuvnv12_rgb565_sse
96#define STD_FUNCTION_NAME yuvnv12_rgb565_std
97#define YUV_FORMAT YUV_FORMAT_NV12
98#define RGB_FORMAT RGB_FORMAT_RGB565
99#define SSE_ALIGNED
100#include "yuv_rgb_sse_func.h"
101
102#define SSE_FUNCTION_NAME yuvnv12_rgb24_sse
103#define STD_FUNCTION_NAME yuvnv12_rgb24_std
104#define YUV_FORMAT YUV_FORMAT_NV12
105#define RGB_FORMAT RGB_FORMAT_RGB24
106#define SSE_ALIGNED
107#include "yuv_rgb_sse_func.h"
108
109#define SSE_FUNCTION_NAME yuvnv12_rgba_sse
110#define STD_FUNCTION_NAME yuvnv12_rgba_std
111#define YUV_FORMAT YUV_FORMAT_NV12
112#define RGB_FORMAT RGB_FORMAT_RGBA
113#define SSE_ALIGNED
114#include "yuv_rgb_sse_func.h"
115
116#define SSE_FUNCTION_NAME yuvnv12_bgra_sse
117#define STD_FUNCTION_NAME yuvnv12_bgra_std
118#define YUV_FORMAT YUV_FORMAT_NV12
119#define RGB_FORMAT RGB_FORMAT_BGRA
120#define SSE_ALIGNED
121#include "yuv_rgb_sse_func.h"
122
123#define SSE_FUNCTION_NAME yuvnv12_argb_sse
124#define STD_FUNCTION_NAME yuvnv12_argb_std
125#define YUV_FORMAT YUV_FORMAT_NV12
126#define RGB_FORMAT RGB_FORMAT_ARGB
127#define SSE_ALIGNED
128#include "yuv_rgb_sse_func.h"
129
130#define SSE_FUNCTION_NAME yuvnv12_abgr_sse
131#define STD_FUNCTION_NAME yuvnv12_abgr_std
132#define YUV_FORMAT YUV_FORMAT_NV12
133#define RGB_FORMAT RGB_FORMAT_ABGR
134#define SSE_ALIGNED
135#include "yuv_rgb_sse_func.h"
136*/
137
/*
 * "_sseu" converter instantiations for YUV_FORMAT_420 input.
 *
 * Each group below sets the parameter macros and then re-includes
 * yuv_rgb_sse_func.h, once per RGB output format (RGB565, RGB24, RGBA,
 * BGRA, ARGB, ABGR):
 *   SSE_FUNCTION_NAME - name given to this instantiation
 *   STD_FUNCTION_NAME - name of the matching "_std" routine
 *                       (presumably a non-SSE fallback used by the
 *                       generated code - confirm in the header)
 *   YUV_FORMAT        - source layout selector
 *   RGB_FORMAT        - destination layout selector
 *
 * SSE_ALIGNED is not defined for these groups; only the disabled "_sse"
 * groups earlier in this file define it. The disabled rgb24_yuv420_sseu
 * at the bottom of this file uses _mm_loadu_si128/_mm_storeu_si128 under
 * the same suffix, so "u" presumably means unaligned loads/stores here
 * too - confirm in yuv_rgb_sse_func.h.
 *
 * NOTE(review): the macros are redefined for every group without an
 * #undef in this file, so the header presumably #undefs them itself -
 * confirm before reordering or adding groups.
 */
138#define SSE_FUNCTION_NAME yuv420_rgb565_sseu
139#define STD_FUNCTION_NAME yuv420_rgb565_std
140#define YUV_FORMAT YUV_FORMAT_420
141#define RGB_FORMAT RGB_FORMAT_RGB565
142#include "yuv_rgb_sse_func.h"
143
144#define SSE_FUNCTION_NAME yuv420_rgb24_sseu
145#define STD_FUNCTION_NAME yuv420_rgb24_std
146#define YUV_FORMAT YUV_FORMAT_420
147#define RGB_FORMAT RGB_FORMAT_RGB24
148#include "yuv_rgb_sse_func.h"
149
150#define SSE_FUNCTION_NAME yuv420_rgba_sseu
151#define STD_FUNCTION_NAME yuv420_rgba_std
152#define YUV_FORMAT YUV_FORMAT_420
153#define RGB_FORMAT RGB_FORMAT_RGBA
154#include "yuv_rgb_sse_func.h"
155
156#define SSE_FUNCTION_NAME yuv420_bgra_sseu
157#define STD_FUNCTION_NAME yuv420_bgra_std
158#define YUV_FORMAT YUV_FORMAT_420
159#define RGB_FORMAT RGB_FORMAT_BGRA
160#include "yuv_rgb_sse_func.h"
161
162#define SSE_FUNCTION_NAME yuv420_argb_sseu
163#define STD_FUNCTION_NAME yuv420_argb_std
164#define YUV_FORMAT YUV_FORMAT_420
165#define RGB_FORMAT RGB_FORMAT_ARGB
166#include "yuv_rgb_sse_func.h"
167
168#define SSE_FUNCTION_NAME yuv420_abgr_sseu
169#define STD_FUNCTION_NAME yuv420_abgr_std
170#define YUV_FORMAT YUV_FORMAT_420
171#define RGB_FORMAT RGB_FORMAT_ABGR
172#include "yuv_rgb_sse_func.h"
173
/*
 * "_sseu" converter instantiations for YUV_FORMAT_422 input, one per RGB
 * output format (RGB565, RGB24, RGBA, BGRA, ARGB, ABGR).
 *
 * Every group defines SSE_FUNCTION_NAME, STD_FUNCTION_NAME, YUV_FORMAT and
 * RGB_FORMAT and then re-includes yuv_rgb_sse_func.h. SSE_ALIGNED is not
 * defined for any of them.
 */
174#define SSE_FUNCTION_NAME yuv422_rgb565_sseu
175#define STD_FUNCTION_NAME yuv422_rgb565_std
176#define YUV_FORMAT YUV_FORMAT_422
177#define RGB_FORMAT RGB_FORMAT_RGB565
178#include "yuv_rgb_sse_func.h"
179
180#define SSE_FUNCTION_NAME yuv422_rgb24_sseu
181#define STD_FUNCTION_NAME yuv422_rgb24_std
182#define YUV_FORMAT YUV_FORMAT_422
183#define RGB_FORMAT RGB_FORMAT_RGB24
184#include "yuv_rgb_sse_func.h"
185
186#define SSE_FUNCTION_NAME yuv422_rgba_sseu
187#define STD_FUNCTION_NAME yuv422_rgba_std
188#define YUV_FORMAT YUV_FORMAT_422
189#define RGB_FORMAT RGB_FORMAT_RGBA
190#include "yuv_rgb_sse_func.h"
191
192#define SSE_FUNCTION_NAME yuv422_bgra_sseu
193#define STD_FUNCTION_NAME yuv422_bgra_std
194#define YUV_FORMAT YUV_FORMAT_422
195#define RGB_FORMAT RGB_FORMAT_BGRA
196#include "yuv_rgb_sse_func.h"
197
198#define SSE_FUNCTION_NAME yuv422_argb_sseu
199#define STD_FUNCTION_NAME yuv422_argb_std
200#define YUV_FORMAT YUV_FORMAT_422
201#define RGB_FORMAT RGB_FORMAT_ARGB
202#include "yuv_rgb_sse_func.h"
203
204#define SSE_FUNCTION_NAME yuv422_abgr_sseu
205#define STD_FUNCTION_NAME yuv422_abgr_std
206#define YUV_FORMAT YUV_FORMAT_422
207#define RGB_FORMAT RGB_FORMAT_ABGR
208#include "yuv_rgb_sse_func.h"
209
/*
 * "_sseu" converter instantiations for YUV_FORMAT_NV12 input, one per RGB
 * output format (RGB565, RGB24, RGBA, BGRA, ARGB, ABGR).
 *
 * Every group defines SSE_FUNCTION_NAME, STD_FUNCTION_NAME, YUV_FORMAT and
 * RGB_FORMAT and then re-includes yuv_rgb_sse_func.h. SSE_ALIGNED is not
 * defined for any of them.
 */
210#define SSE_FUNCTION_NAME yuvnv12_rgb565_sseu
211#define STD_FUNCTION_NAME yuvnv12_rgb565_std
212#define YUV_FORMAT YUV_FORMAT_NV12
213#define RGB_FORMAT RGB_FORMAT_RGB565
214#include "yuv_rgb_sse_func.h"
215
216#define SSE_FUNCTION_NAME yuvnv12_rgb24_sseu
217#define STD_FUNCTION_NAME yuvnv12_rgb24_std
218#define YUV_FORMAT YUV_FORMAT_NV12
219#define RGB_FORMAT RGB_FORMAT_RGB24
220#include "yuv_rgb_sse_func.h"
221
222#define SSE_FUNCTION_NAME yuvnv12_rgba_sseu
223#define STD_FUNCTION_NAME yuvnv12_rgba_std
224#define YUV_FORMAT YUV_FORMAT_NV12
225#define RGB_FORMAT RGB_FORMAT_RGBA
226#include "yuv_rgb_sse_func.h"
227
228#define SSE_FUNCTION_NAME yuvnv12_bgra_sseu
229#define STD_FUNCTION_NAME yuvnv12_bgra_std
230#define YUV_FORMAT YUV_FORMAT_NV12
231#define RGB_FORMAT RGB_FORMAT_BGRA
232#include "yuv_rgb_sse_func.h"
233
234#define SSE_FUNCTION_NAME yuvnv12_argb_sseu
235#define STD_FUNCTION_NAME yuvnv12_argb_std
236#define YUV_FORMAT YUV_FORMAT_NV12
237#define RGB_FORMAT RGB_FORMAT_ARGB
238#include "yuv_rgb_sse_func.h"
239
240#define SSE_FUNCTION_NAME yuvnv12_abgr_sseu
241#define STD_FUNCTION_NAME yuvnv12_abgr_std
242#define YUV_FORMAT YUV_FORMAT_NV12
243#define RGB_FORMAT RGB_FORMAT_ABGR
244#include "yuv_rgb_sse_func.h"
245
246
247/* SDL doesn't use these atm and compiling them adds seconds onto the build. --ryan.
248#define UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
249R1 = _mm_unpacklo_epi8(RGB1, RGB4); \
250R2 = _mm_unpackhi_epi8(RGB1, RGB4); \
251G1 = _mm_unpacklo_epi8(RGB2, RGB5); \
252G2 = _mm_unpackhi_epi8(RGB2, RGB5); \
253B1 = _mm_unpacklo_epi8(RGB3, RGB6); \
254B2 = _mm_unpackhi_epi8(RGB3, RGB6);
255
256#define UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
257RGB1 = _mm_unpacklo_epi8(R1, G2); \
258RGB2 = _mm_unpackhi_epi8(R1, G2); \
259RGB3 = _mm_unpacklo_epi8(R2, B1); \
260RGB4 = _mm_unpackhi_epi8(R2, B1); \
261RGB5 = _mm_unpacklo_epi8(G1, B2); \
262RGB6 = _mm_unpackhi_epi8(G1, B2); \
263
264#define UNPACK_RGB24_32(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
265UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
266UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
267UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
268UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
269UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
270
271#define RGB2YUV_16(R, G, B, Y, U, V) \
272Y = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[0][0])), \
273 _mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[0][1]))); \
274Y = _mm_add_epi16(Y, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[0][2]))); \
275Y = _mm_add_epi16(Y, _mm_set1_epi16((param->y_shift)<<PRECISION)); \
276Y = _mm_srai_epi16(Y, PRECISION); \
277U = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[1][0])), \
278 _mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[1][1]))); \
279U = _mm_add_epi16(U, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[1][2]))); \
280U = _mm_add_epi16(U, _mm_set1_epi16(128<<PRECISION)); \
281U = _mm_srai_epi16(U, PRECISION); \
282V = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[2][0])), \
283 _mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[2][1]))); \
284V = _mm_add_epi16(V, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[2][2]))); \
285V = _mm_add_epi16(V, _mm_set1_epi16(128<<PRECISION)); \
286V = _mm_srai_epi16(V, PRECISION);
287*/
288
289#if 0 // SDL doesn't use these atm and compiling them adds seconds onto the build. --ryan.
/*
 * RGB2YUV_32: statement macro that converts one tile of two rows into
 * Y, U and V output.
 *
 * It is disabled with a preprocessor conditional. The body carries its own
 * block comments, and block comments do not nest, so it could not simply be
 * wrapped in a block comment the way the neighbouring disabled code is.
 *
 * Names the expansion site must provide:
 *   rgb_ptr1, rgb_ptr2     - the two input rows; 96 bytes are loaded from
 *                            each (32 RGB24 pixels, per the inline comments)
 *   y_ptr1, y_ptr2         - the two Y output rows; 32 bytes stored to each
 *   u_ptr, v_ptr           - chroma outputs; 16 bytes stored to each
 *   LOAD_SI128, SAVE_SI128 - 128-bit load and store primitives
 *   UNPACK_RGB24_32        - helper macro that separates the channels
 *   RGB2YUV_16             - helper macro for the 16-bit matrix multiply;
 *                            it additionally reads `param`
 * In this file both helper macros currently exist only inside the block
 * comment just above, so re-enabling this conditional alone does not make
 * the macro usable.
 *
 * Data flow (performed twice, for byte offsets 0 and 48 of the input rows):
 *   1. load three vectors from each row and unpack them into r/g/b vectors
 *      (r1/g1/b1 for the first row, r2/g2/b2 for the second);
 *   2. widen the low and high halves of each channel to 16 bits, run
 *      RGB2YUV_16 on each half, then pack back to bytes with unsigned
 *      saturation;
 *   3. store the 16 Y bytes of each row;
 *   4. average the u and v vectors of the two rows with _mm_avg_epu8.
 * Final step: the logical shift by 8 keeps the upper byte of every 16-bit
 * lane and the 0xFF mask keeps the lower byte; packing both and averaging
 * them merges each pair of neighbouring chroma bytes, giving the 16 u and
 * 16 v bytes that are stored.
 *
 * The expansion declares locals (r1, rgb1, y, ...), so it can appear at most
 * once per scope.
 */
290#define RGB2YUV_32 \
291 __m128i r1, r2, b1, b2, g1, g2; \
292 __m128i r_16, g_16, b_16; \
293 __m128i y1_16, y2_16, u1_16, u2_16, v1_16, v2_16, y, u1, u2, v1, v2, u1_tmp, u2_tmp, v1_tmp, v2_tmp; \
294 __m128i rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1)), \
295 rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+16)), \
296 rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+32)), \
297 rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2)), \
298 rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+16)), \
299 rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+32)); \
300 /* unpack rgb24 data to r, g and b data in separate channels*/ \
301 UNPACK_RGB24_32(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, r1, r2, g1, g2, b1, b2) \
302 /* process pixels of first line */ \
303 r_16 = _mm_unpacklo_epi8(r1, _mm_setzero_si128()); \
304 g_16 = _mm_unpacklo_epi8(g1, _mm_setzero_si128()); \
305 b_16 = _mm_unpacklo_epi8(b1, _mm_setzero_si128()); \
306 RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
307 r_16 = _mm_unpackhi_epi8(r1, _mm_setzero_si128()); \
308 g_16 = _mm_unpackhi_epi8(g1, _mm_setzero_si128()); \
309 b_16 = _mm_unpackhi_epi8(b1, _mm_setzero_si128()); \
310 RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
311 y = _mm_packus_epi16(y1_16, y2_16); \
312 u1 = _mm_packus_epi16(u1_16, u2_16); \
313 v1 = _mm_packus_epi16(v1_16, v2_16); \
314 /* save Y values */ \
315 SAVE_SI128((__m128i*)(y_ptr1), y); \
316 /* process pixels of second line */ \
317 r_16 = _mm_unpacklo_epi8(r2, _mm_setzero_si128()); \
318 g_16 = _mm_unpacklo_epi8(g2, _mm_setzero_si128()); \
319 b_16 = _mm_unpacklo_epi8(b2, _mm_setzero_si128()); \
320 RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
321 r_16 = _mm_unpackhi_epi8(r2, _mm_setzero_si128()); \
322 g_16 = _mm_unpackhi_epi8(g2, _mm_setzero_si128()); \
323 b_16 = _mm_unpackhi_epi8(b2, _mm_setzero_si128()); \
324 RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
325 y = _mm_packus_epi16(y1_16, y2_16); \
326 u2 = _mm_packus_epi16(u1_16, u2_16); \
327 v2 = _mm_packus_epi16(v1_16, v2_16); \
328 /* save Y values */ \
329 SAVE_SI128((__m128i*)(y_ptr2), y); \
330 /* vertical subsampling of u/v values */ \
331 u1_tmp = _mm_avg_epu8(u1, u2); \
332 v1_tmp = _mm_avg_epu8(v1, v2); \
333 /* do the same again with next data */ \
334 rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1+48)); \
335 rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+64)); \
336 rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+80)); \
337 rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2+48)); \
338 rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+64)); \
339 rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+80)); \
340 /* unpack rgb24 data to r, g and b data in separate channels*/ \
341 UNPACK_RGB24_32(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, r1, r2, g1, g2, b1, b2) \
342 /* process pixels of first line */ \
343 r_16 = _mm_unpacklo_epi8(r1, _mm_setzero_si128()); \
344 g_16 = _mm_unpacklo_epi8(g1, _mm_setzero_si128()); \
345 b_16 = _mm_unpacklo_epi8(b1, _mm_setzero_si128()); \
346 RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
347 r_16 = _mm_unpackhi_epi8(r1, _mm_setzero_si128()); \
348 g_16 = _mm_unpackhi_epi8(g1, _mm_setzero_si128()); \
349 b_16 = _mm_unpackhi_epi8(b1, _mm_setzero_si128()); \
350 RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
351 y = _mm_packus_epi16(y1_16, y2_16); \
352 u1 = _mm_packus_epi16(u1_16, u2_16); \
353 v1 = _mm_packus_epi16(v1_16, v2_16); \
354 /* save Y values */ \
355 SAVE_SI128((__m128i*)(y_ptr1+16), y); \
356 /* process pixels of second line */ \
357 r_16 = _mm_unpacklo_epi8(r2, _mm_setzero_si128()); \
358 g_16 = _mm_unpacklo_epi8(g2, _mm_setzero_si128()); \
359 b_16 = _mm_unpacklo_epi8(b2, _mm_setzero_si128()); \
360 RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
361 r_16 = _mm_unpackhi_epi8(r2, _mm_setzero_si128()); \
362 g_16 = _mm_unpackhi_epi8(g2, _mm_setzero_si128()); \
363 b_16 = _mm_unpackhi_epi8(b2, _mm_setzero_si128()); \
364 RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
365 y = _mm_packus_epi16(y1_16, y2_16); \
366 u2 = _mm_packus_epi16(u1_16, u2_16); \
367 v2 = _mm_packus_epi16(v1_16, v2_16); \
368 /* save Y values */ \
369 SAVE_SI128((__m128i*)(y_ptr2+16), y); \
370 /* vertical subsampling of u/v values */ \
371 u2_tmp = _mm_avg_epu8(u1, u2); \
372 v2_tmp = _mm_avg_epu8(v1, v2); \
373 /* horizontal subsampling of u/v values */ \
374 u1 = _mm_packus_epi16(_mm_srl_epi16(u1_tmp, _mm_cvtsi32_si128(8)), _mm_srl_epi16(u2_tmp, _mm_cvtsi32_si128(8))); \
375 v1 = _mm_packus_epi16(_mm_srl_epi16(v1_tmp, _mm_cvtsi32_si128(8)), _mm_srl_epi16(v2_tmp, _mm_cvtsi32_si128(8))); \
376 u2 = _mm_packus_epi16(_mm_and_si128(u1_tmp, _mm_set1_epi16(0xFF)), _mm_and_si128(u2_tmp, _mm_set1_epi16(0xFF))); \
377 v2 = _mm_packus_epi16(_mm_and_si128(v1_tmp, _mm_set1_epi16(0xFF)), _mm_and_si128(v2_tmp, _mm_set1_epi16(0xFF))); \
378 u1 = _mm_avg_epu8(u1, u2); \
379 v1 = _mm_avg_epu8(v1, v2); \
380 SAVE_SI128((__m128i*)(u_ptr), u1); \
381 SAVE_SI128((__m128i*)(v_ptr), v1);
382#endif
383
384/* SDL doesn't use these atm and compiling them adds seconds onto the build. --ryan.
385void SDL_TARGETING("sse2") rgb24_yuv420_sse(uint32_t width, uint32_t height,
386 const uint8_t *RGB, uint32_t RGB_stride,
387 uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
388 YCbCrType yuv_type)
389{
390 #define LOAD_SI128 _mm_load_si128
391 #define SAVE_SI128 _mm_stream_si128
392 const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
393
394 uint32_t xpos, ypos;
395 for(ypos=0; ypos<(height-1); ypos+=2)
396 {
397 const uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
398 *rgb_ptr2=RGB+(ypos+1)*RGB_stride;
399
400 uint8_t *y_ptr1=Y+ypos*Y_stride,
401 *y_ptr2=Y+(ypos+1)*Y_stride,
402 *u_ptr=U+(ypos/2)*UV_stride,
403 *v_ptr=V+(ypos/2)*UV_stride;
404
405 for(xpos=0; xpos<(width-31); xpos+=32)
406 {
407 RGB2YUV_32
408
409 rgb_ptr1+=96;
410 rgb_ptr2+=96;
411 y_ptr1+=32;
412 y_ptr2+=32;
413 u_ptr+=16;
414 v_ptr+=16;
415 }
416 }
417 #undef LOAD_SI128
418 #undef SAVE_SI128
419}
420
421void SDL_TARGETING("sse2") rgb24_yuv420_sseu(uint32_t width, uint32_t height,
422 const uint8_t *RGB, uint32_t RGB_stride,
423 uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
424 YCbCrType yuv_type)
425{
426 #define LOAD_SI128 _mm_loadu_si128
427 #define SAVE_SI128 _mm_storeu_si128
428 const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
429
430 uint32_t xpos, ypos;
431 for(ypos=0; ypos<(height-1); ypos+=2)
432 {
433 const uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
434 *rgb_ptr2=RGB+(ypos+1)*RGB_stride;
435
436 uint8_t *y_ptr1=Y+ypos*Y_stride,
437 *y_ptr2=Y+(ypos+1)*Y_stride,
438 *u_ptr=U+(ypos/2)*UV_stride,
439 *v_ptr=V+(ypos/2)*UV_stride;
440
441 for(xpos=0; xpos<(width-31); xpos+=32)
442 {
443 RGB2YUV_32
444
445 rgb_ptr1+=96;
446 rgb_ptr2+=96;
447 y_ptr1+=32;
448 y_ptr2+=32;
449 u_ptr+=16;
450 v_ptr+=16;
451 }
452 }
453 #undef LOAD_SI128
454 #undef SAVE_SI128
455}
456*/
457
458#endif // SDL_SSE2_INTRINSICS
459
460#endif // SDL_HAVE_YUV
461