/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkSwizzler_opts_DEFINED
#define SkSwizzler_opts_DEFINED

#include "include/private/SkColorData.h"

#include <utility>

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>
#endif

namespace SK_OPTS_NS {

static void RGBA_to_rgbA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
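        // Premultiply with a rounded divide: (x*a + 127) / 255 is the
        // nearest-integer result of x*a / 255.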
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)b << 16
               | (uint32_t)g <<  8
               | (uint32_t)r <<  0;
    }
}

static void RGBA_to_bgrA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

static void RGBA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

static void RGB_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)b   << 16
               | (uint32_t)g   <<  8
               | (uint32_t)r   <<  0;
    }
}

static void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)r   << 16
               | (uint32_t)g   <<  8
               | (uint32_t)b   <<  0;
    }
}

static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        dst[i] = (uint32_t)0xFF   << 24
               | (uint32_t)src[i] << 16
               | (uint32_t)src[i] <<  8
               | (uint32_t)src[i] <<  0;
    }
}

static void grayA_to_RGBA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}

static void grayA_to_rgbA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        g = (g*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}

static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t k = (src[i] >> 24) & 0xFF,
                y = (src[i] >> 16) & 0xFF,
                m = (src[i] >>  8) & 0xFF,
                c = (src[i] >>  0) & 0xFF;
        // See comments in SkSwizzler.cpp for details on the conversion formula.
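        // In brief: the bytes arrive already inverted (v = 255 - V), so
        // R = 255*(1-C)*(1-K) reduces to r = c*k/255, and likewise for g and b.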
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t) b   << 16
               | (uint32_t) g   <<  8
               | (uint32_t) r   <<  0;
    }
}

static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t k = (src[i] >> 24) & 0xFF,
                y = (src[i] >> 16) & 0xFF,
                m = (src[i] >>  8) & 0xFF,
                c = (src[i] >>  0) & 0xFF;
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t) r   << 16
               | (uint32_t) g   <<  8
               | (uint32_t) b   <<  0;
    }
}

#if defined(SK_ARM_HAS_NEON)

// Rounded divide by 255, (x + 127) / 255
static uint8x8_t div255_round(uint16x8_t x) {
    // result = (x + 127) / 255
    // result = (x + 127) / 256 + error1
    //
    // error1 = (x + 127) / (255 * 256)
    // error1 = (x + 127) / (256 * 256) + error2
    //
    // error2 = (x + 127) / (255 * 256 * 256)
    //
    // The maximum value of error2 is too small to matter.  Thus:
    // result = (x + 127) / 256 + (x + 127) / (256 * 256)
    // result = ((x + 127) / 256 + x + 127) / 256
    // result = ((x + 127) >> 8 + x + 127) >> 8
    //
    // Use >>> to represent "rounded right shift" which, conveniently,
    // NEON supports in one instruction.
    // result = ((x >>> 8) + x) >>> 8
    //
    // Note that the second right shift is actually performed as an
    // "add, round, and narrow back to 8-bits" instruction.
    return vraddhn_u16(x, vrshrq_n_u16(x, 8));
}

// Scale a byte by another, (x * y + 127) / 255
static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    return div255_round(vmull_u8(x, y));
}

template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {
    while (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        uint8x8_t a = rgba.val[3],
                  b = rgba.val[2],
                  g = rgba.val[1],
                  r = rgba.val[0];

        // Premultiply.
        b = scale(b, a);
        g = scale(g, a);
        r = scale(r, a);

        // Store 8 premultiplied pixels.
        if (kSwapRB) {
            rgba.val[2] = r;
            rgba.val[1] = g;
            rgba.val[0] = b;
        } else {
            rgba.val[2] = b;
            rgba.val[1] = g;
            rgba.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    using std::swap;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);

        // Swap r and b.
        swap(rgba.val[0], rgba.val[2]);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        // Swap r and b.
        swap(rgba.val[0], rgba.val[2]);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x3_t rgb = vld3q_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x16x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*3;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x3_t rgb = vld3_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x8x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*3;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}

/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}

/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16_t gray = vld1q_u8(src);

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8_t gray = vld1_u8(src);

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    gray_to_RGB1_portable(dst, src, count);
}

template <bool kPremul>
static void expand_grayA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x2_t ga = vld2q_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            ga.val[0] = vcombine_u8(scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
                                    scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
        }

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*2;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x2_t ga = vld2_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            ga.val[0] = scale(ga.val[0], ga.val[1]);
        }

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*2;
        dst += 8;
        count -= 8;
    }

    auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    expand_grayA<false>(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    expand_grayA<true>(dst, src, count);
}

enum Format { kRGB1, kBGR1 };
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
    while (count >= 8) {
        // Load 8 cmyk pixels.
        uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);

        uint8x8_t k = pixels.val[3],
                  y = pixels.val[2],
                  m = pixels.val[1],
                  c = pixels.val[0];

        // Scale to r, g, b.
        uint8x8_t b = scale(y, k);
        uint8x8_t g = scale(m, k);
        uint8x8_t r = scale(c, k);

        // Store 8 rgba pixels.
        if (kBGR1 == format) {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = r;
            pixels.val[1] = g;
            pixels.val[0] = b;
        } else {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = b;
            pixels.val[1] = g;
            pixels.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, pixels);
        src += 8;
        dst += 8;
        count -= 8;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

// Scale a byte by another.
// Inputs are stored in 16-bit lanes but are no larger than 8 bits.
static __m128i scale(__m128i x, __m128i y) {
    const __m128i _128 = _mm_set1_epi16(128);
    const __m128i _257 = _mm_set1_epi16(257);

    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
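    // Spot check at the endpoints: x = 0 gives (128*257)>>16 = 0, and
    // x = 255*255 = 65025 gives (65153*257)>>16 = 16744321>>16 = 255,
    // both agreeing with (x+127)/255.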
    return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
}

template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {

    auto premul8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kSwapRB) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);            // rrrrgggg bbbbaaaa
        *hi = _mm_shuffle_epi8(*hi, planar);            // RRRRGGGG BBBBAAAA
        __m128i rg = _mm_unpacklo_epi32(*lo, *hi),      // rrrrRRRR ggggGGGG
                ba = _mm_unpackhi_epi32(*lo, *hi);      // bbbbBBBB aaaaAAAA

        // Unpack to 16-bit planar.
        __m128i r = _mm_unpacklo_epi8(rg, zeros),       // r_r_r_r_ R_R_R_R_
                g = _mm_unpackhi_epi8(rg, zeros),       // g_g_g_g_ G_G_G_G_
                b = _mm_unpacklo_epi8(ba, zeros),       // b_b_b_b_ B_B_B_B_
                a = _mm_unpackhi_epi8(ba, zeros);       // a_a_a_a_ A_A_A_A_

        // Premultiply!
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg = _mm_or_si128(r, _mm_slli_epi16(g, 8));     // rgrgrgrg RGRGRGRG
        ba = _mm_or_si128(b, _mm_slli_epi16(a, 8));     // babababa BABABABA
        *lo = _mm_unpacklo_epi16(rg, ba);               // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);               // RGBARGBA RGBARGBA
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
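        // Run the remaining 4 pixels through the same 8-pixel helper with a
        // zeroed high half; only the low four results are stored.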
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);

    while (count >= 4) {
        __m128i rgba = _mm_loadu_si128((const __m128i*) src);
        __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
        _mm_storeu_si128((__m128i*) dst, bgra);

        src += 4;
        dst += 4;
        count -= 4;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
    __m128i expand;
    const uint8_t X = 0xFF; // Used as a placeholder. The value of X is irrelevant.
    if (kSwapRB) {
        expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
    } else {
        expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
    }

    while (count >= 6) {
        // Load a vector.  While this actually contains 5 pixels plus an
        // extra component, we will discard all but the first four pixels on
        // this iteration.
        __m128i rgb = _mm_loadu_si128((const __m128i*) src);

        // Expand the first four pixels to RGBX and then mask to RGB(FF).
        __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);

        // Store 4 pixels.
        _mm_storeu_si128((__m128i*) dst, rgba);

        src += 4*3;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}

/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}

/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
    while (count >= 16) {
        __m128i grays = _mm_loadu_si128((const __m128i*) src);
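        // Two interleave rounds expand 16 gray bytes into 16 G,G,G,FF pixels:
        // pair each gray byte with itself (gg) and with 0xFF (ga), then
        // interleave the 16-bit gg/ga pairs into 32-bit pixels.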

        __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
        __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
        __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
        __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);

        __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
        __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
        __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
        __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);

        _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
        _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
        _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
        _mm_storeu_si128((__m128i*) (dst + 12), ggga3);

        src += 16;
        dst += 16;
        count -= 16;
    }

    gray_to_RGB1_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 8) {
        __m128i ga = _mm_loadu_si128((const __m128i*) src);
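        // ga holds eight G,A byte pairs, i.e. little-endian 0xAAGG lanes.
        // Building gg = 0xGGGG lanes and interleaving with ga yields the
        // G,G,G,A byte order of the destination pixels.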

        __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
                                  _mm_slli_epi16(ga, 8));

        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_RGBA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 8) {
        __m128i grayA = _mm_loadu_si128((const __m128i*) src);

        __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
        __m128i a0 = _mm_srli_epi16(grayA, 8);
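        // g0 and a0 hold the gray and alpha bytes widened to 16-bit lanes,
        // ready for the 16-bit scale() below.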

        // Premultiply
        g0 = scale(g0, a0);

        __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
        __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));

        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_rgbA_portable(dst, src, count);
}

enum Format { kRGB1, kBGR1 };
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
    auto convert8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kBGR1 == format) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);            // ccccmmmm yyyykkkk
        *hi = _mm_shuffle_epi8(*hi, planar);            // CCCCMMMM YYYYKKKK
        __m128i cm = _mm_unpacklo_epi32(*lo, *hi),      // ccccCCCC mmmmMMMM
                yk = _mm_unpackhi_epi32(*lo, *hi);      // yyyyYYYY kkkkKKKK

        // Unpack to 16-bit planar.
        __m128i c = _mm_unpacklo_epi8(cm, zeros),       // c_c_c_c_ C_C_C_C_
                m = _mm_unpackhi_epi8(cm, zeros),       // m_m_m_m_ M_M_M_M_
                y = _mm_unpacklo_epi8(yk, zeros),       // y_y_y_y_ Y_Y_Y_Y_
                k = _mm_unpackhi_epi8(yk, zeros);       // k_k_k_k_ K_K_K_K_

        // Scale to r, g, b.
        __m128i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interlaced pixels.
        __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),              // rgrgrgrg RGRGRGRG
                ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00)); // b1b1b1b1 B1B1B1B1
        *lo = _mm_unpacklo_epi16(rg, ba);                                // rgb1rgb1 rgb1rgb1
        *hi = _mm_unpackhi_epi16(rg, ba);                                // RGB1RGB1 RGB1RGB1
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
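        // As above, reuse the 8-pixel helper with a zeroed high half for the
        // remaining 4 pixels; only the low four results are stored.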
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}

#else

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_rgbA_portable(dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_bgrA_portable(dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_BGRA_portable(dst, src, count);
}

/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    RGB_to_RGB1_portable(dst, src, count);
}

/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    RGB_to_BGR1_portable(dst, src, count);
}

/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    gray_to_RGB1_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    grayA_to_RGBA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    grayA_to_rgbA_portable(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_CMYK_to_RGB1_portable(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_CMYK_to_BGR1_portable(dst, src, count);
}

#endif

}  // namespace SK_OPTS_NS

#endif // SkSwizzler_opts_DEFINED