1// Copyright 2016 Adrien Descamps
2// Distributed under BSD 3-Clause License
3#include "../../SDL_internal.h"
4
5#if SDL_HAVE_YUV
6
7#include "yuv_rgb.h"
8
9#include "SDL_cpuinfo.h"
10/*#include <x86intrin.h>*/
11
// Fixed-point arithmetic: conversion coefficients are stored as integers scaled
// by PRECISION_FACTOR (2^PRECISION).
#define PRECISION 6
#define PRECISION_FACTOR (1<<PRECISION)

// Parameters of an RGB -> YUV conversion (matrix entries are scaled by PRECISION_FACTOR):
// |Y|   |y_shift|                        |matrix[0][0] matrix[0][1] matrix[0][2]|   |R|
// |U| = |  128  | + 1/PRECISION_FACTOR * |matrix[1][0] matrix[1][1] matrix[1][2]| * |G|
// |V|   |  128  |                        |matrix[2][0] matrix[2][1] matrix[2][2]|   |B|
typedef struct
{
	uint8_t y_shift;
	int16_t matrix[3][3];
} RGB2YUVParam;

// Parameters of a YUV -> RGB conversion (factors are scaled by PRECISION_FACTOR):
// |R|                        |y_factor      0       v_r_factor|   |Y-y_shift|
// |G| = 1/PRECISION_FACTOR * |y_factor  u_g_factor  v_g_factor| * |  U-128  |
// |B|                        |y_factor  u_b_factor      0     |   |  V-128  |
typedef struct
{
	uint8_t y_shift;
	int16_t y_factor;
	int16_t v_r_factor;
	int16_t u_g_factor;
	int16_t v_g_factor;
	int16_t u_b_factor;
} YUV2RGBParam;

// Converts a real coefficient to its fixed-point int16_t representation.
// The +0.5 followed by truncation rounds to nearest only for non-negative
// values, so pass the magnitude and negate the result (-V(0.3441)), as the
// tables in this file do.
// Both the argument and the whole expansion are parenthesized so that
// expressions such as V(a+b) or x/V(c) evaluate as expected.
#define V(value) ((int16_t)(((value)*PRECISION_FACTOR)+0.5))
38
39// for ITU-T T.871, values can be found in section 7
40// for ITU-R BT.601-7 values are derived from equations in sections 2.5.1-2.5.3, assuming RGB is encoded using full range ([0-1]<->[0-255])
41// for ITU-R BT.709-6 values are derived from equations in sections 3.2-3.4, assuming RGB is encoded using full range ([0-1]<->[0-255])
42// all values are rounded to the fourth decimal
43
44static const YUV2RGBParam YUV2RGB[3] = {
45 // ITU-T T.871 (JPEG)
46 {/*.y_shift=*/ 0, /*.y_factor=*/ V(1.0), /*.v_r_factor=*/ V(1.402), /*.u_g_factor=*/ -V(0.3441), /*.v_g_factor=*/ -V(0.7141), /*.u_b_factor=*/ V(1.772)},
47 // ITU-R BT.601-7
48 {/*.y_shift=*/ 16, /*.y_factor=*/ V(1.1644), /*.v_r_factor=*/ V(1.596), /*.u_g_factor=*/ -V(0.3918), /*.v_g_factor=*/ -V(0.813), /*.u_b_factor=*/ V(2.0172)},
49 // ITU-R BT.709-6
50 {/*.y_shift=*/ 16, /*.y_factor=*/ V(1.1644), /*.v_r_factor=*/ V(1.7927), /*.u_g_factor=*/ -V(0.2132), /*.v_g_factor=*/ -V(0.5329), /*.u_b_factor=*/ V(2.1124)}
51};
52
53static const RGB2YUVParam RGB2YUV[3] = {
54 // ITU-T T.871 (JPEG)
55 {/*.y_shift=*/ 0, /*.matrix=*/ {{V(0.299), V(0.587), V(0.114)}, {-V(0.1687), -V(0.3313), V(0.5)}, {V(0.5), -V(0.4187), -V(0.0813)}}},
56 // ITU-R BT.601-7
57 {/*.y_shift=*/ 16, /*.matrix=*/ {{V(0.2568), V(0.5041), V(0.0979)}, {-V(0.1482), -V(0.291), V(0.4392)}, {V(0.4392), -V(0.3678), -V(0.0714)}}},
58 // ITU-R BT.709-6
59 {/*.y_shift=*/ 16, /*.matrix=*/ {{V(0.1826), V(0.6142), V(0.062)}, {-V(0.1006), -V(0.3386), V(0.4392)}, {V(0.4392), -V(0.3989), -V(0.0403)}}}
60};
61
/* The various layouts of YUV data we support */
/* One of these is assigned to YUV_FORMAT before each inclusion of
   yuv_rgb_std_func.h / yuv_rgb_sse_func.h further down in this file.
   NOTE(review): presumably they must stay preprocessor macros (not an enum)
   so the included headers can test them with #if - confirm in those headers. */
#define YUV_FORMAT_420 1
#define YUV_FORMAT_422 2
#define YUV_FORMAT_NV12 3

/* The various formats of RGB pixel that we support */
/* One of these is assigned to RGB_FORMAT before each inclusion of the
   conversion headers, in the same way as YUV_FORMAT above. */
#define RGB_FORMAT_RGB565 1
#define RGB_FORMAT_RGB24 2
#define RGB_FORMAT_RGBA 3
#define RGB_FORMAT_BGRA 4
#define RGB_FORMAT_ARGB 5
#define RGB_FORMAT_ABGR 6
74
// Divides a fixed-point value by PRECISION_FACTOR (rounding down) and
// saturates the result to the [0:255] interval.
//
// v: value scaled by PRECISION_FACTOR (2^PRECISION).
// Returns 0 when v is zero or negative, 255 when v/PRECISION_FACTOR exceeds
// 255, and v/PRECISION_FACTOR otherwise.
//
// The result is computed arithmetically instead of through a fixed-size
// lookup table, so every int32_t input saturates safely; a table indexed
// from the input can only cover a limited input range and reads out of
// bounds beyond it.
static uint8_t clampU8(int32_t v)
{
	if(v<=0)
	{
		return 0;
	}

	// v is strictly positive here, so the right shift is well defined
	v >>= PRECISION;
	return (uint8_t)((v>255) ? 255 : v);
}
96
97
// Template instantiation by inclusion: each group of three #defines followed
// by #include "yuv_rgb_std_func.h" selects a function name (STD_FUNCTION_NAME),
// a YUV layout (YUV_FORMAT) and an RGB pixel format (RGB_FORMAT) for one
// inclusion of the header.
// NOTE(review): the header presumably defines the function called
// STD_FUNCTION_NAME and #undefs these three macros (they are redefined for
// every group below without an #undef in this file) - confirm in
// yuv_rgb_std_func.h.
#define STD_FUNCTION_NAME yuv420_rgb565_std
#define YUV_FORMAT YUV_FORMAT_420
#define RGB_FORMAT RGB_FORMAT_RGB565
#include "yuv_rgb_std_func.h"
102
103#define STD_FUNCTION_NAME yuv420_rgb24_std
104#define YUV_FORMAT YUV_FORMAT_420
105#define RGB_FORMAT RGB_FORMAT_RGB24
106#include "yuv_rgb_std_func.h"
107
108#define STD_FUNCTION_NAME yuv420_rgba_std
109#define YUV_FORMAT YUV_FORMAT_420
110#define RGB_FORMAT RGB_FORMAT_RGBA
111#include "yuv_rgb_std_func.h"
112
113#define STD_FUNCTION_NAME yuv420_bgra_std
114#define YUV_FORMAT YUV_FORMAT_420
115#define RGB_FORMAT RGB_FORMAT_BGRA
116#include "yuv_rgb_std_func.h"
117
118#define STD_FUNCTION_NAME yuv420_argb_std
119#define YUV_FORMAT YUV_FORMAT_420
120#define RGB_FORMAT RGB_FORMAT_ARGB
121#include "yuv_rgb_std_func.h"
122
123#define STD_FUNCTION_NAME yuv420_abgr_std
124#define YUV_FORMAT YUV_FORMAT_420
125#define RGB_FORMAT RGB_FORMAT_ABGR
126#include "yuv_rgb_std_func.h"
127
128#define STD_FUNCTION_NAME yuv422_rgb565_std
129#define YUV_FORMAT YUV_FORMAT_422
130#define RGB_FORMAT RGB_FORMAT_RGB565
131#include "yuv_rgb_std_func.h"
132
133#define STD_FUNCTION_NAME yuv422_rgb24_std
134#define YUV_FORMAT YUV_FORMAT_422
135#define RGB_FORMAT RGB_FORMAT_RGB24
136#include "yuv_rgb_std_func.h"
137
138#define STD_FUNCTION_NAME yuv422_rgba_std
139#define YUV_FORMAT YUV_FORMAT_422
140#define RGB_FORMAT RGB_FORMAT_RGBA
141#include "yuv_rgb_std_func.h"
142
143#define STD_FUNCTION_NAME yuv422_bgra_std
144#define YUV_FORMAT YUV_FORMAT_422
145#define RGB_FORMAT RGB_FORMAT_BGRA
146#include "yuv_rgb_std_func.h"
147
148#define STD_FUNCTION_NAME yuv422_argb_std
149#define YUV_FORMAT YUV_FORMAT_422
150#define RGB_FORMAT RGB_FORMAT_ARGB
151#include "yuv_rgb_std_func.h"
152
153#define STD_FUNCTION_NAME yuv422_abgr_std
154#define YUV_FORMAT YUV_FORMAT_422
155#define RGB_FORMAT RGB_FORMAT_ABGR
156#include "yuv_rgb_std_func.h"
157
158#define STD_FUNCTION_NAME yuvnv12_rgb565_std
159#define YUV_FORMAT YUV_FORMAT_NV12
160#define RGB_FORMAT RGB_FORMAT_RGB565
161#include "yuv_rgb_std_func.h"
162
163#define STD_FUNCTION_NAME yuvnv12_rgb24_std
164#define YUV_FORMAT YUV_FORMAT_NV12
165#define RGB_FORMAT RGB_FORMAT_RGB24
166#include "yuv_rgb_std_func.h"
167
168#define STD_FUNCTION_NAME yuvnv12_rgba_std
169#define YUV_FORMAT YUV_FORMAT_NV12
170#define RGB_FORMAT RGB_FORMAT_RGBA
171#include "yuv_rgb_std_func.h"
172
173#define STD_FUNCTION_NAME yuvnv12_bgra_std
174#define YUV_FORMAT YUV_FORMAT_NV12
175#define RGB_FORMAT RGB_FORMAT_BGRA
176#include "yuv_rgb_std_func.h"
177
178#define STD_FUNCTION_NAME yuvnv12_argb_std
179#define YUV_FORMAT YUV_FORMAT_NV12
180#define RGB_FORMAT RGB_FORMAT_ARGB
181#include "yuv_rgb_std_func.h"
182
183#define STD_FUNCTION_NAME yuvnv12_abgr_std
184#define YUV_FORMAT YUV_FORMAT_NV12
185#define RGB_FORMAT RGB_FORMAT_ABGR
186#include "yuv_rgb_std_func.h"
187
188void rgb24_yuv420_std(
189 uint32_t width, uint32_t height,
190 const uint8_t *RGB, uint32_t RGB_stride,
191 uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
192 YCbCrType yuv_type)
193{
194 const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
195
196 uint32_t x, y;
197 for(y=0; y<(height-1); y+=2)
198 {
199 const uint8_t *rgb_ptr1=RGB+y*RGB_stride,
200 *rgb_ptr2=RGB+(y+1)*RGB_stride;
201
202 uint8_t *y_ptr1=Y+y*Y_stride,
203 *y_ptr2=Y+(y+1)*Y_stride,
204 *u_ptr=U+(y/2)*UV_stride,
205 *v_ptr=V+(y/2)*UV_stride;
206
207 for(x=0; x<(width-1); x+=2)
208 {
209 // compute yuv for the four pixels, u and v values are summed
210 int32_t y_tmp, u_tmp, v_tmp;
211
212 y_tmp = param->matrix[0][0]*rgb_ptr1[0] + param->matrix[0][1]*rgb_ptr1[1] + param->matrix[0][2]*rgb_ptr1[2];
213 u_tmp = param->matrix[1][0]*rgb_ptr1[0] + param->matrix[1][1]*rgb_ptr1[1] + param->matrix[1][2]*rgb_ptr1[2];
214 v_tmp = param->matrix[2][0]*rgb_ptr1[0] + param->matrix[2][1]*rgb_ptr1[1] + param->matrix[2][2]*rgb_ptr1[2];
215 y_ptr1[0]=clampU8(y_tmp+((param->y_shift)<<PRECISION));
216
217 y_tmp = param->matrix[0][0]*rgb_ptr1[3] + param->matrix[0][1]*rgb_ptr1[4] + param->matrix[0][2]*rgb_ptr1[5];
218 u_tmp += param->matrix[1][0]*rgb_ptr1[3] + param->matrix[1][1]*rgb_ptr1[4] + param->matrix[1][2]*rgb_ptr1[5];
219 v_tmp += param->matrix[2][0]*rgb_ptr1[3] + param->matrix[2][1]*rgb_ptr1[4] + param->matrix[2][2]*rgb_ptr1[5];
220 y_ptr1[1]=clampU8(y_tmp+((param->y_shift)<<PRECISION));
221
222 y_tmp = param->matrix[0][0]*rgb_ptr2[0] + param->matrix[0][1]*rgb_ptr2[1] + param->matrix[0][2]*rgb_ptr2[2];
223 u_tmp += param->matrix[1][0]*rgb_ptr2[0] + param->matrix[1][1]*rgb_ptr2[1] + param->matrix[1][2]*rgb_ptr2[2];
224 v_tmp += param->matrix[2][0]*rgb_ptr2[0] + param->matrix[2][1]*rgb_ptr2[1] + param->matrix[2][2]*rgb_ptr2[2];
225 y_ptr2[0]=clampU8(y_tmp+((param->y_shift)<<PRECISION));
226
227 y_tmp = param->matrix[0][0]*rgb_ptr2[3] + param->matrix[0][1]*rgb_ptr2[4] + param->matrix[0][2]*rgb_ptr2[5];
228 u_tmp += param->matrix[1][0]*rgb_ptr2[3] + param->matrix[1][1]*rgb_ptr2[4] + param->matrix[1][2]*rgb_ptr2[5];
229 v_tmp += param->matrix[2][0]*rgb_ptr2[3] + param->matrix[2][1]*rgb_ptr2[4] + param->matrix[2][2]*rgb_ptr2[5];
230 y_ptr2[1]=clampU8(y_tmp+((param->y_shift)<<PRECISION));
231
232 u_ptr[0] = clampU8(u_tmp/4+(128<<PRECISION));
233 v_ptr[0] = clampU8(v_tmp/4+(128<<PRECISION));
234
235 rgb_ptr1 += 6;
236 rgb_ptr2 += 6;
237 y_ptr1 += 2;
238 y_ptr2 += 2;
239 u_ptr += 1;
240 v_ptr += 1;
241 }
242 }
243}
244
245#ifdef __SSE2__
246
// SSE2 template instantiation by inclusion, following the same pattern as
// the yuv_rgb_std_func.h groups above: every group sets SSE_FUNCTION_NAME,
// STD_FUNCTION_NAME (the scalar function with the same formats), YUV_FORMAT
// and RGB_FORMAT, then includes yuv_rgb_sse_func.h. The "_sse" groups also
// define SSE_ALIGNED; the "_sseu" groups do not.
// NOTE(review): SSE_ALIGNED presumably selects aligned loads/stores, and the
// header presumably #undefs all of these macros after use - confirm in
// yuv_rgb_sse_func.h.
#define SSE_FUNCTION_NAME yuv420_rgb565_sse
#define STD_FUNCTION_NAME yuv420_rgb565_std
#define YUV_FORMAT YUV_FORMAT_420
#define RGB_FORMAT RGB_FORMAT_RGB565
#define SSE_ALIGNED
#include "yuv_rgb_sse_func.h"
253
254#define SSE_FUNCTION_NAME yuv420_rgb565_sseu
255#define STD_FUNCTION_NAME yuv420_rgb565_std
256#define YUV_FORMAT YUV_FORMAT_420
257#define RGB_FORMAT RGB_FORMAT_RGB565
258#include "yuv_rgb_sse_func.h"
259
260#define SSE_FUNCTION_NAME yuv420_rgb24_sse
261#define STD_FUNCTION_NAME yuv420_rgb24_std
262#define YUV_FORMAT YUV_FORMAT_420
263#define RGB_FORMAT RGB_FORMAT_RGB24
264#define SSE_ALIGNED
265#include "yuv_rgb_sse_func.h"
266
267#define SSE_FUNCTION_NAME yuv420_rgb24_sseu
268#define STD_FUNCTION_NAME yuv420_rgb24_std
269#define YUV_FORMAT YUV_FORMAT_420
270#define RGB_FORMAT RGB_FORMAT_RGB24
271#include "yuv_rgb_sse_func.h"
272
273#define SSE_FUNCTION_NAME yuv420_rgba_sse
274#define STD_FUNCTION_NAME yuv420_rgba_std
275#define YUV_FORMAT YUV_FORMAT_420
276#define RGB_FORMAT RGB_FORMAT_RGBA
277#define SSE_ALIGNED
278#include "yuv_rgb_sse_func.h"
279
280#define SSE_FUNCTION_NAME yuv420_rgba_sseu
281#define STD_FUNCTION_NAME yuv420_rgba_std
282#define YUV_FORMAT YUV_FORMAT_420
283#define RGB_FORMAT RGB_FORMAT_RGBA
284#include "yuv_rgb_sse_func.h"
285
286#define SSE_FUNCTION_NAME yuv420_bgra_sse
287#define STD_FUNCTION_NAME yuv420_bgra_std
288#define YUV_FORMAT YUV_FORMAT_420
289#define RGB_FORMAT RGB_FORMAT_BGRA
290#define SSE_ALIGNED
291#include "yuv_rgb_sse_func.h"
292
293#define SSE_FUNCTION_NAME yuv420_bgra_sseu
294#define STD_FUNCTION_NAME yuv420_bgra_std
295#define YUV_FORMAT YUV_FORMAT_420
296#define RGB_FORMAT RGB_FORMAT_BGRA
297#include "yuv_rgb_sse_func.h"
298
299#define SSE_FUNCTION_NAME yuv420_argb_sse
300#define STD_FUNCTION_NAME yuv420_argb_std
301#define YUV_FORMAT YUV_FORMAT_420
302#define RGB_FORMAT RGB_FORMAT_ARGB
303#define SSE_ALIGNED
304#include "yuv_rgb_sse_func.h"
305
306#define SSE_FUNCTION_NAME yuv420_argb_sseu
307#define STD_FUNCTION_NAME yuv420_argb_std
308#define YUV_FORMAT YUV_FORMAT_420
309#define RGB_FORMAT RGB_FORMAT_ARGB
310#include "yuv_rgb_sse_func.h"
311
312#define SSE_FUNCTION_NAME yuv420_abgr_sse
313#define STD_FUNCTION_NAME yuv420_abgr_std
314#define YUV_FORMAT YUV_FORMAT_420
315#define RGB_FORMAT RGB_FORMAT_ABGR
316#define SSE_ALIGNED
317#include "yuv_rgb_sse_func.h"
318
319#define SSE_FUNCTION_NAME yuv420_abgr_sseu
320#define STD_FUNCTION_NAME yuv420_abgr_std
321#define YUV_FORMAT YUV_FORMAT_420
322#define RGB_FORMAT RGB_FORMAT_ABGR
323#include "yuv_rgb_sse_func.h"
324
325#define SSE_FUNCTION_NAME yuv422_rgb565_sse
326#define STD_FUNCTION_NAME yuv422_rgb565_std
327#define YUV_FORMAT YUV_FORMAT_422
328#define RGB_FORMAT RGB_FORMAT_RGB565
329#define SSE_ALIGNED
330#include "yuv_rgb_sse_func.h"
331
332#define SSE_FUNCTION_NAME yuv422_rgb565_sseu
333#define STD_FUNCTION_NAME yuv422_rgb565_std
334#define YUV_FORMAT YUV_FORMAT_422
335#define RGB_FORMAT RGB_FORMAT_RGB565
336#include "yuv_rgb_sse_func.h"
337
338#define SSE_FUNCTION_NAME yuv422_rgb24_sse
339#define STD_FUNCTION_NAME yuv422_rgb24_std
340#define YUV_FORMAT YUV_FORMAT_422
341#define RGB_FORMAT RGB_FORMAT_RGB24
342#define SSE_ALIGNED
343#include "yuv_rgb_sse_func.h"
344
345#define SSE_FUNCTION_NAME yuv422_rgb24_sseu
346#define STD_FUNCTION_NAME yuv422_rgb24_std
347#define YUV_FORMAT YUV_FORMAT_422
348#define RGB_FORMAT RGB_FORMAT_RGB24
349#include "yuv_rgb_sse_func.h"
350
351#define SSE_FUNCTION_NAME yuv422_rgba_sse
352#define STD_FUNCTION_NAME yuv422_rgba_std
353#define YUV_FORMAT YUV_FORMAT_422
354#define RGB_FORMAT RGB_FORMAT_RGBA
355#define SSE_ALIGNED
356#include "yuv_rgb_sse_func.h"
357
358#define SSE_FUNCTION_NAME yuv422_rgba_sseu
359#define STD_FUNCTION_NAME yuv422_rgba_std
360#define YUV_FORMAT YUV_FORMAT_422
361#define RGB_FORMAT RGB_FORMAT_RGBA
362#include "yuv_rgb_sse_func.h"
363
364#define SSE_FUNCTION_NAME yuv422_bgra_sse
365#define STD_FUNCTION_NAME yuv422_bgra_std
366#define YUV_FORMAT YUV_FORMAT_422
367#define RGB_FORMAT RGB_FORMAT_BGRA
368#define SSE_ALIGNED
369#include "yuv_rgb_sse_func.h"
370
371#define SSE_FUNCTION_NAME yuv422_bgra_sseu
372#define STD_FUNCTION_NAME yuv422_bgra_std
373#define YUV_FORMAT YUV_FORMAT_422
374#define RGB_FORMAT RGB_FORMAT_BGRA
375#include "yuv_rgb_sse_func.h"
376
377#define SSE_FUNCTION_NAME yuv422_argb_sse
378#define STD_FUNCTION_NAME yuv422_argb_std
379#define YUV_FORMAT YUV_FORMAT_422
380#define RGB_FORMAT RGB_FORMAT_ARGB
381#define SSE_ALIGNED
382#include "yuv_rgb_sse_func.h"
383
384#define SSE_FUNCTION_NAME yuv422_argb_sseu
385#define STD_FUNCTION_NAME yuv422_argb_std
386#define YUV_FORMAT YUV_FORMAT_422
387#define RGB_FORMAT RGB_FORMAT_ARGB
388#include "yuv_rgb_sse_func.h"
389
390#define SSE_FUNCTION_NAME yuv422_abgr_sse
391#define STD_FUNCTION_NAME yuv422_abgr_std
392#define YUV_FORMAT YUV_FORMAT_422
393#define RGB_FORMAT RGB_FORMAT_ABGR
394#define SSE_ALIGNED
395#include "yuv_rgb_sse_func.h"
396
397#define SSE_FUNCTION_NAME yuv422_abgr_sseu
398#define STD_FUNCTION_NAME yuv422_abgr_std
399#define YUV_FORMAT YUV_FORMAT_422
400#define RGB_FORMAT RGB_FORMAT_ABGR
401#include "yuv_rgb_sse_func.h"
402
403#define SSE_FUNCTION_NAME yuvnv12_rgb565_sse
404#define STD_FUNCTION_NAME yuvnv12_rgb565_std
405#define YUV_FORMAT YUV_FORMAT_NV12
406#define RGB_FORMAT RGB_FORMAT_RGB565
407#define SSE_ALIGNED
408#include "yuv_rgb_sse_func.h"
409
410#define SSE_FUNCTION_NAME yuvnv12_rgb565_sseu
411#define STD_FUNCTION_NAME yuvnv12_rgb565_std
412#define YUV_FORMAT YUV_FORMAT_NV12
413#define RGB_FORMAT RGB_FORMAT_RGB565
414#include "yuv_rgb_sse_func.h"
415
416#define SSE_FUNCTION_NAME yuvnv12_rgb24_sse
417#define STD_FUNCTION_NAME yuvnv12_rgb24_std
418#define YUV_FORMAT YUV_FORMAT_NV12
419#define RGB_FORMAT RGB_FORMAT_RGB24
420#define SSE_ALIGNED
421#include "yuv_rgb_sse_func.h"
422
423#define SSE_FUNCTION_NAME yuvnv12_rgb24_sseu
424#define STD_FUNCTION_NAME yuvnv12_rgb24_std
425#define YUV_FORMAT YUV_FORMAT_NV12
426#define RGB_FORMAT RGB_FORMAT_RGB24
427#include "yuv_rgb_sse_func.h"
428
429#define SSE_FUNCTION_NAME yuvnv12_rgba_sse
430#define STD_FUNCTION_NAME yuvnv12_rgba_std
431#define YUV_FORMAT YUV_FORMAT_NV12
432#define RGB_FORMAT RGB_FORMAT_RGBA
433#define SSE_ALIGNED
434#include "yuv_rgb_sse_func.h"
435
436#define SSE_FUNCTION_NAME yuvnv12_rgba_sseu
437#define STD_FUNCTION_NAME yuvnv12_rgba_std
438#define YUV_FORMAT YUV_FORMAT_NV12
439#define RGB_FORMAT RGB_FORMAT_RGBA
440#include "yuv_rgb_sse_func.h"
441
442#define SSE_FUNCTION_NAME yuvnv12_bgra_sse
443#define STD_FUNCTION_NAME yuvnv12_bgra_std
444#define YUV_FORMAT YUV_FORMAT_NV12
445#define RGB_FORMAT RGB_FORMAT_BGRA
446#define SSE_ALIGNED
447#include "yuv_rgb_sse_func.h"
448
449#define SSE_FUNCTION_NAME yuvnv12_bgra_sseu
450#define STD_FUNCTION_NAME yuvnv12_bgra_std
451#define YUV_FORMAT YUV_FORMAT_NV12
452#define RGB_FORMAT RGB_FORMAT_BGRA
453#include "yuv_rgb_sse_func.h"
454
455#define SSE_FUNCTION_NAME yuvnv12_argb_sse
456#define STD_FUNCTION_NAME yuvnv12_argb_std
457#define YUV_FORMAT YUV_FORMAT_NV12
458#define RGB_FORMAT RGB_FORMAT_ARGB
459#define SSE_ALIGNED
460#include "yuv_rgb_sse_func.h"
461
462#define SSE_FUNCTION_NAME yuvnv12_argb_sseu
463#define STD_FUNCTION_NAME yuvnv12_argb_std
464#define YUV_FORMAT YUV_FORMAT_NV12
465#define RGB_FORMAT RGB_FORMAT_ARGB
466#include "yuv_rgb_sse_func.h"
467
468#define SSE_FUNCTION_NAME yuvnv12_abgr_sse
469#define STD_FUNCTION_NAME yuvnv12_abgr_std
470#define YUV_FORMAT YUV_FORMAT_NV12
471#define RGB_FORMAT RGB_FORMAT_ABGR
472#define SSE_ALIGNED
473#include "yuv_rgb_sse_func.h"
474
475#define SSE_FUNCTION_NAME yuvnv12_abgr_sseu
476#define STD_FUNCTION_NAME yuvnv12_abgr_std
477#define YUV_FORMAT YUV_FORMAT_NV12
478#define RGB_FORMAT RGB_FORMAT_ABGR
479#include "yuv_rgb_sse_func.h"
480
481
// De-interleaving of packed RGB24 data with byte interleave instructions.
//
// Seen as one 96-byte sequence (six 16-byte registers), each STEP performs a
// perfect interleave of the first 48 bytes with the last 48 bytes:
// STEP1 reads RGB1..RGB6 and writes R1, R2, G1, G2, B1, B2 (in that order),
// STEP2 reads those six registers and writes RGB1..RGB6 back.
// The byte at index i moves to index 2*i mod 95 at every step, so after the
// five steps of UNPACK_RGB24_32 it sits at 32*i mod 95; since 32*3 = 96 = 1
// (mod 95), the bytes of every third index end up contiguous.
//
// UNPACK_RGB24_32 therefore expects RGB1..RGB3 to hold 16 packed pixels of a
// first row and RGB4..RGB6 16 packed pixels of a second row, and leaves:
//   R1, G1, B1 = the 16 first/second/third channel bytes of the first row,
//   R2, G2, B2 = the same for the second row.
// RGB1..RGB6 are used as scratch and are overwritten.
//
// None of the macros ends with a line continuation, so they do not depend
// on being followed by a blank line.
#define UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
	R1 = _mm_unpacklo_epi8(RGB1, RGB4); \
	R2 = _mm_unpackhi_epi8(RGB1, RGB4); \
	G1 = _mm_unpacklo_epi8(RGB2, RGB5); \
	G2 = _mm_unpackhi_epi8(RGB2, RGB5); \
	B1 = _mm_unpacklo_epi8(RGB3, RGB6); \
	B2 = _mm_unpackhi_epi8(RGB3, RGB6);

#define UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
	RGB1 = _mm_unpacklo_epi8(R1, G2); \
	RGB2 = _mm_unpackhi_epi8(R1, G2); \
	RGB3 = _mm_unpacklo_epi8(R2, B1); \
	RGB4 = _mm_unpackhi_epi8(R2, B1); \
	RGB5 = _mm_unpacklo_epi8(G1, B2); \
	RGB6 = _mm_unpackhi_epi8(G1, B2);

#define UNPACK_RGB24_32(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
	UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
	UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
	UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
	UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
	UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2)
// Computes one output channel for eight pixels held as 16-bit lanes:
// DST = (R*matrix[ROW][0] + G*matrix[ROW][1] + B*matrix[ROW][2] + OFFSET) >> PRECISION
// A pointer named "param" with a "matrix" member must be in scope where the
// macro is expanded. All arithmetic is done in 16-bit lanes.
#define RGB2YUV_16_CHANNEL(R, G, B, DST, ROW, OFFSET) \
	DST = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[ROW][0])), \
		_mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[ROW][1]))); \
	DST = _mm_add_epi16(DST, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[ROW][2]))); \
	DST = _mm_add_epi16(DST, _mm_set1_epi16(OFFSET)); \
	DST = _mm_srai_epi16(DST, PRECISION);

// Converts eight pixels, given as 16-bit R, G and B lanes, to 16-bit Y, U and
// V lanes using the coefficients of "param": Y is offset by param->y_shift,
// U and V by 128. The results are not clamped to 8 bits here.
#define RGB2YUV_16(R, G, B, Y, U, V) \
	RGB2YUV_16_CHANNEL(R, G, B, Y, 0, (param->y_shift)<<PRECISION) \
	RGB2YUV_16_CHANNEL(R, G, B, U, 1, 128<<PRECISION) \
	RGB2YUV_16_CHANNEL(R, G, B, V, 2, 128<<PRECISION)
521
// Converts a block of 32x2 packed RGB24 pixels to planar YUV 4:2:0.
//
// The macro expands to declarations followed by statements, so it can be
// expanded only once per scope. The expanding scope must provide:
//   param                the RGB2YUVParam coefficients (used by RGB2YUV_16),
//   rgb_ptr1, rgb_ptr2   two source rows; 96 bytes are read from each,
//   y_ptr1, y_ptr2       two luma rows; 32 bytes are written to each,
//   u_ptr, v_ptr         chroma rows; 16 bytes are written to each,
//   LOAD_SI128, SAVE_SI128  the 128-bit load and store operations to use.
// The pointers are not advanced by the macro.
//
// The block is processed as two halves of 16 pixel columns. For each half
// the per-pixel U and V values are saturated to 8 bits and the two rows are
// averaged with _mm_avg_epu8; finally horizontally adjacent values (the low
// and high byte of every 16-bit lane) are averaged the same way.
// _mm_avg_epu8 rounds halves up, so chroma results can differ slightly from
// rgb24_yuv420_std, which divides the sum of the four unclamped values by 4.
#define RGB2YUV_32 \
 __m128i r1, r2, b1, b2, g1, g2; \
 __m128i r_16, g_16, b_16; \
 __m128i y1_16, y2_16, u1_16, u2_16, v1_16, v2_16, y, u1, u2, v1, v2, u1_tmp, u2_tmp, v1_tmp, v2_tmp; \
 __m128i rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1)), \
 rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+16)), \
 rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+32)), \
 rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2)), \
 rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+16)), \
 rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+32)); \
 /* unpack rgb24 data to r, g and b data in separate channels*/ \
 UNPACK_RGB24_32(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, r1, r2, g1, g2, b1, b2) \
 /* process pixels of first line */ \
 r_16 = _mm_unpacklo_epi8(r1, _mm_setzero_si128()); \
 g_16 = _mm_unpacklo_epi8(g1, _mm_setzero_si128()); \
 b_16 = _mm_unpacklo_epi8(b1, _mm_setzero_si128()); \
 RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
 r_16 = _mm_unpackhi_epi8(r1, _mm_setzero_si128()); \
 g_16 = _mm_unpackhi_epi8(g1, _mm_setzero_si128()); \
 b_16 = _mm_unpackhi_epi8(b1, _mm_setzero_si128()); \
 RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
 y = _mm_packus_epi16(y1_16, y2_16); \
 u1 = _mm_packus_epi16(u1_16, u2_16); \
 v1 = _mm_packus_epi16(v1_16, v2_16); \
 /* save Y values */ \
 SAVE_SI128((__m128i*)(y_ptr1), y); \
 /* process pixels of second line */ \
 r_16 = _mm_unpacklo_epi8(r2, _mm_setzero_si128()); \
 g_16 = _mm_unpacklo_epi8(g2, _mm_setzero_si128()); \
 b_16 = _mm_unpacklo_epi8(b2, _mm_setzero_si128()); \
 RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
 r_16 = _mm_unpackhi_epi8(r2, _mm_setzero_si128()); \
 g_16 = _mm_unpackhi_epi8(g2, _mm_setzero_si128()); \
 b_16 = _mm_unpackhi_epi8(b2, _mm_setzero_si128()); \
 RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
 y = _mm_packus_epi16(y1_16, y2_16); \
 u2 = _mm_packus_epi16(u1_16, u2_16); \
 v2 = _mm_packus_epi16(v1_16, v2_16); \
 /* save Y values */ \
 SAVE_SI128((__m128i*)(y_ptr2), y); \
 /* vertical subsampling of u/v values */ \
 u1_tmp = _mm_avg_epu8(u1, u2); \
 v1_tmp = _mm_avg_epu8(v1, v2); \
 /* do the same again with next data */ \
 rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1+48)); \
 rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+64)); \
 rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+80)); \
 rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2+48)); \
 rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+64)); \
 rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+80)); \
 /* unpack rgb24 data to r, g and b data in separate channels*/ \
 UNPACK_RGB24_32(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, r1, r2, g1, g2, b1, b2) \
 /* process pixels of first line */ \
 r_16 = _mm_unpacklo_epi8(r1, _mm_setzero_si128()); \
 g_16 = _mm_unpacklo_epi8(g1, _mm_setzero_si128()); \
 b_16 = _mm_unpacklo_epi8(b1, _mm_setzero_si128()); \
 RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
 r_16 = _mm_unpackhi_epi8(r1, _mm_setzero_si128()); \
 g_16 = _mm_unpackhi_epi8(g1, _mm_setzero_si128()); \
 b_16 = _mm_unpackhi_epi8(b1, _mm_setzero_si128()); \
 RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
 y = _mm_packus_epi16(y1_16, y2_16); \
 u1 = _mm_packus_epi16(u1_16, u2_16); \
 v1 = _mm_packus_epi16(v1_16, v2_16); \
 /* save Y values */ \
 SAVE_SI128((__m128i*)(y_ptr1+16), y); \
 /* process pixels of second line */ \
 r_16 = _mm_unpacklo_epi8(r2, _mm_setzero_si128()); \
 g_16 = _mm_unpacklo_epi8(g2, _mm_setzero_si128()); \
 b_16 = _mm_unpacklo_epi8(b2, _mm_setzero_si128()); \
 RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
 r_16 = _mm_unpackhi_epi8(r2, _mm_setzero_si128()); \
 g_16 = _mm_unpackhi_epi8(g2, _mm_setzero_si128()); \
 b_16 = _mm_unpackhi_epi8(b2, _mm_setzero_si128()); \
 RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
 y = _mm_packus_epi16(y1_16, y2_16); \
 u2 = _mm_packus_epi16(u1_16, u2_16); \
 v2 = _mm_packus_epi16(v1_16, v2_16); \
 /* save Y values */ \
 SAVE_SI128((__m128i*)(y_ptr2+16), y); \
 /* vertical subsampling of u/v values */ \
 u2_tmp = _mm_avg_epu8(u1, u2); \
 v2_tmp = _mm_avg_epu8(v1, v2); \
 /* horizontal subsampling of u/v values */ \
 u1 = _mm_packus_epi16(_mm_srl_epi16(u1_tmp, _mm_cvtsi32_si128(8)), _mm_srl_epi16(u2_tmp, _mm_cvtsi32_si128(8))); \
 v1 = _mm_packus_epi16(_mm_srl_epi16(v1_tmp, _mm_cvtsi32_si128(8)), _mm_srl_epi16(v2_tmp, _mm_cvtsi32_si128(8))); \
 u2 = _mm_packus_epi16(_mm_and_si128(u1_tmp, _mm_set1_epi16(0xFF)), _mm_and_si128(u2_tmp, _mm_set1_epi16(0xFF))); \
 v2 = _mm_packus_epi16(_mm_and_si128(v1_tmp, _mm_set1_epi16(0xFF)), _mm_and_si128(v2_tmp, _mm_set1_epi16(0xFF))); \
 u1 = _mm_avg_epu8(u1, u2); \
 v1 = _mm_avg_epu8(v1, v2); \
 SAVE_SI128((__m128i*)(u_ptr), u1); \
 SAVE_SI128((__m128i*)(v_ptr), v1);
614
615void rgb24_yuv420_sse(uint32_t width, uint32_t height,
616 const uint8_t *RGB, uint32_t RGB_stride,
617 uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
618 YCbCrType yuv_type)
619{
620 #define LOAD_SI128 _mm_load_si128
621 #define SAVE_SI128 _mm_stream_si128
622 const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
623
624 uint32_t xpos, ypos;
625 for(ypos=0; ypos<(height-1); ypos+=2)
626 {
627 const uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
628 *rgb_ptr2=RGB+(ypos+1)*RGB_stride;
629
630 uint8_t *y_ptr1=Y+ypos*Y_stride,
631 *y_ptr2=Y+(ypos+1)*Y_stride,
632 *u_ptr=U+(ypos/2)*UV_stride,
633 *v_ptr=V+(ypos/2)*UV_stride;
634
635 for(xpos=0; xpos<(width-31); xpos+=32)
636 {
637 RGB2YUV_32
638
639 rgb_ptr1+=96;
640 rgb_ptr2+=96;
641 y_ptr1+=32;
642 y_ptr2+=32;
643 u_ptr+=16;
644 v_ptr+=16;
645 }
646 }
647 #undef LOAD_SI128
648 #undef SAVE_SI128
649}
650
651void rgb24_yuv420_sseu(uint32_t width, uint32_t height,
652 const uint8_t *RGB, uint32_t RGB_stride,
653 uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
654 YCbCrType yuv_type)
655{
656 #define LOAD_SI128 _mm_loadu_si128
657 #define SAVE_SI128 _mm_storeu_si128
658 const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
659
660 uint32_t xpos, ypos;
661 for(ypos=0; ypos<(height-1); ypos+=2)
662 {
663 const uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
664 *rgb_ptr2=RGB+(ypos+1)*RGB_stride;
665
666 uint8_t *y_ptr1=Y+ypos*Y_stride,
667 *y_ptr2=Y+(ypos+1)*Y_stride,
668 *u_ptr=U+(ypos/2)*UV_stride,
669 *v_ptr=V+(ypos/2)*UV_stride;
670
671 for(xpos=0; xpos<(width-31); xpos+=32)
672 {
673 RGB2YUV_32
674
675 rgb_ptr1+=96;
676 rgb_ptr2+=96;
677 y_ptr1+=32;
678 y_ptr2+=32;
679 u_ptr+=16;
680 v_ptr+=16;
681 }
682 }
683 #undef LOAD_SI128
684 #undef SAVE_SI128
685}
686
687
688#endif //__SSE2__
689
690#endif /* SDL_HAVE_YUV */
691