1/*
2 * Copyright 2016 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#ifndef SkSwizzler_opts_DEFINED
9#define SkSwizzler_opts_DEFINED
10
11#include "include/private/SkColorData.h"
12
13#include <utility>
14
15#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
16 #include <immintrin.h>
17#elif defined(SK_ARM_HAS_NEON)
18 #include <arm_neon.h>
19#endif
20
21namespace SK_OPTS_NS {
22
// Premultiplies `count` 32-bit pixels from src into dst without reordering bytes.
// Byte 3 (bits 24-31) is treated as alpha; each of the three lower bytes is
// replaced by the rounded product (channel * alpha + 127) / 255.
static void RGBA_to_rgbA_portable(uint32_t* dst, const uint32_t* src, int count) {
    int px = 0;
    while (px < count) {
        const uint32_t pixel = src[px];
        const uint32_t alpha = pixel >> 24;

        // Each product is at most 255*255 + 127, so the quotient fits in 8 bits.
        const uint32_t red   = (((pixel >>  0) & 0xFF) * alpha + 127) / 255;
        const uint32_t green = (((pixel >>  8) & 0xFF) * alpha + 127) / 255;
        const uint32_t blue  = (((pixel >> 16) & 0xFF) * alpha + 127) / 255;

        dst[px] = (alpha << 24) | (blue << 16) | (green << 8) | red;
        ++px;
    }
}
38
// Premultiplies `count` 32-bit pixels from src into dst and exchanges bytes 0 and 2.
// Byte 3 (bits 24-31) is treated as alpha; every other byte becomes the rounded
// product (channel * alpha + 127) / 255 before the exchange.
static void RGBA_to_bgrA_portable(uint32_t* dst, const uint32_t* src, int count) {
    int px = 0;
    while (px < count) {
        const uint32_t pixel = src[px];
        const uint32_t alpha = pixel >> 24;

        // Each product is at most 255*255 + 127, so the quotient fits in 8 bits.
        const uint32_t red   = (((pixel >>  0) & 0xFF) * alpha + 127) / 255;
        const uint32_t green = (((pixel >>  8) & 0xFF) * alpha + 127) / 255;
        const uint32_t blue  = (((pixel >> 16) & 0xFF) * alpha + 127) / 255;

        // The source's low byte lands in bits 16-23 and vice versa.
        dst[px] = (alpha << 24) | (red << 16) | (green << 8) | blue;
        ++px;
    }
}
54
// Copies `count` 32-bit pixels from src to dst, exchanging byte 0 with byte 2.
// Bytes 1 and 3 keep their position and no channel value is modified.
static void RGBA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
    int px = 0;
    while (px < count) {
        const uint32_t pixel = src[px];
        const uint32_t keep  = pixel & 0xFF00FF00u;            // bytes 3 and 1 stay put
        const uint32_t low   = pixel & 0x000000FFu;            // byte 0
        const uint32_t high  = (pixel >> 16) & 0x000000FFu;    // byte 2
        dst[px] = keep | (low << 16) | high;
        ++px;
    }
}
67
// Expands `count` two-byte (gray, alpha) pairs from src into 32-bit pixels in dst.
// The gray byte is replicated into the three low bytes; alpha goes into bits 24-31.
static void grayA_to_RGBA_portable(uint32_t dst[], const uint8_t* src, int count) {
    int px = 0;
    while (px < count) {
        const uint32_t gray  = src[0];
        const uint32_t alpha = src[1];
        src += 2;

        // Multiplying by 0x010101 copies the byte into bits 0-7, 8-15 and 16-23.
        dst[px] = (alpha << 24) | (gray * 0x00010101u);
        ++px;
    }
}
79
// Expands `count` two-byte (gray, alpha) pairs from src into premultiplied
// 32-bit pixels in dst. Gray is first scaled to (gray * alpha + 127) / 255, then
// replicated into the three low bytes; alpha goes into bits 24-31 unchanged.
static void grayA_to_rgbA_portable(uint32_t dst[], const uint8_t* src, int count) {
    int px = 0;
    while (px < count) {
        const uint32_t alpha = src[1];
        // At most (255*255 + 127) / 255 == 255, so the result still fits in a byte.
        const uint32_t gray = (src[0] * alpha + 127) / 255;
        src += 2;

        // Multiplying by 0x010101 copies the byte into bits 0-7, 8-15 and 16-23.
        dst[px] = (alpha << 24) | (gray * 0x00010101u);
        ++px;
    }
}
92
// Converts `count` inverted-CMYK pixels (c in byte 0, m in byte 1, y in byte 2,
// k in byte 3) into opaque pixels: each of c, m, y is scaled by k with
// (v * k + 127) / 255 and kept in its own byte, and byte 3 is set to 0xFF.
// See comments in SkSwizzler.cpp for details on the conversion formula.
static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const uint32_t* src, int count) {
    int px = 0;
    while (px < count) {
        const uint32_t pixel = src[px];
        const uint32_t black = pixel >> 24;

        const uint32_t red   = (((pixel >>  0) & 0xFF) * black + 127) / 255;   // from c
        const uint32_t green = (((pixel >>  8) & 0xFF) * black + 127) / 255;   // from m
        const uint32_t blue  = (((pixel >> 16) & 0xFF) * black + 127) / 255;   // from y

        dst[px] = 0xFF000000u | (blue << 16) | (green << 8) | red;
        ++px;
    }
}
109
// Converts `count` inverted-CMYK pixels (c in byte 0, m in byte 1, y in byte 2,
// k in byte 3) into opaque pixels with bytes 0 and 2 exchanged: each of c, m, y
// is scaled by k with (v * k + 127) / 255, the scaled c is written to byte 2,
// the scaled y to byte 0, and byte 3 is set to 0xFF.
static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const uint32_t* src, int count) {
    int px = 0;
    while (px < count) {
        const uint32_t pixel = src[px];
        const uint32_t black = pixel >> 24;

        const uint32_t red   = (((pixel >>  0) & 0xFF) * black + 127) / 255;   // from c
        const uint32_t green = (((pixel >>  8) & 0xFF) * black + 127) / 255;   // from m
        const uint32_t blue  = (((pixel >> 16) & 0xFF) * black + 127) / 255;   // from y

        dst[px] = 0xFF000000u | (red << 16) | (green << 8) | blue;
        ++px;
    }
}
125
126#if defined(SK_ARM_HAS_NEON)
127
// Rounded divide by 255 of eight 16-bit lanes: (x + 127) / 255, narrowed to bytes.
static uint8x8_t div255_round(uint16x8_t x) {
    // Derivation:
    //   (x + 127) / 255 = (x + 127) / 256 + error1
    //   error1          = (x + 127) / (255 * 256)
    //                   = (x + 127) / (256 * 256) + error2
    //   error2          = (x + 127) / (255 * 256 * 256)
    //
    // error2 never grows large enough to matter, so dropping it leaves
    //   result = (x + 127) / 256 + (x + 127) / (256 * 256)
    //          = ((x + 127) / 256 + x + 127) / 256
    //          = (((x + 127) >> 8) + x + 127) >> 8
    //
    // Writing >>> for a "rounded right shift" (a single NEON instruction):
    //   result = ((x >>> 8) + x) >>> 8
    //
    // The inner shift is vrshrq_n_u16; the outer one is done by vraddhn_u16,
    // which adds, rounds, and narrows back to 8 bits in one step.
    const uint16x8_t roundedHigh = vrshrq_n_u16(x, 8);
    return vraddhn_u16(x, roundedHigh);
}
151
// Multiplies two byte vectors lane by lane and divides by 255 with rounding:
// (x * y + 127) / 255.
static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    // Widen to 16 bits so the full 8x8-bit product is kept before dividing.
    const uint16x8_t product = vmull_u8(x, y);
    return div255_round(product);
}
156
// Premultiplies `count` pixels from src into dst, optionally exchanging bytes 0
// and 2 of every pixel (kSwapRB). Groups of 8 pixels go through NEON; whatever is
// left over is handed to the matching portable routine.
static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
    for (; count >= 8; count -= 8, src += 8, dst += 8) {
        // De-interleaving load: val[i] holds byte i of each of the 8 pixels.
        uint8x8x4_t px = vld4_u8((const uint8_t*) src);

        const uint8x8_t alpha = px.val[3];
        const uint8x8_t blue  = scale(px.val[2], alpha);
        const uint8x8_t green = scale(px.val[1], alpha);
        const uint8x8_t red   = scale(px.val[0], alpha);

        // Put the scaled planes back, crossing planes 0 and 2 when swapping.
        // Plane 3 (alpha) is left as loaded.
        px.val[1] = green;
        if (kSwapRB) {
            px.val[0] = blue;
            px.val[2] = red;
        } else {
            px.val[0] = red;
            px.val[2] = blue;
        }

        // Re-interleaving store of the 8 premultiplied pixels.
        vst4_u8((uint8_t*) dst, px);
    }

    // Tail of [0,8) pixels.
    if (kSwapRB) {
        RGBA_to_bgrA_portable(dst, src, count);
    } else {
        RGBA_to_rgbA_portable(dst, src, count);
    }
}
192
193/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
194 premul_should_swapRB(false, dst, src, count);
195}
196
197/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
198 premul_should_swapRB(true, dst, src, count);
199}
200
// Copies `count` pixels from src to dst with bytes 0 and 2 of each pixel
// exchanged. Runs 16 pixels at a time, then at most one group of 8, and leaves
// the remaining [0,8) pixels to the portable routine.
/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    for (; count >= 16; count -= 16, src += 16, dst += 16) {
        // De-interleaving load: val[i] holds byte i of each of the 16 pixels.
        uint8x16x4_t px = vld4q_u8((const uint8_t*) src);

        // Exchange planes 0 and 2.
        const uint8x16_t plane0 = px.val[0];
        px.val[0] = px.val[2];
        px.val[2] = plane0;

        vst4q_u8((uint8_t*) dst, px);
    }

    if (count >= 8) {
        // Same operation on a single group of 8 pixels.
        uint8x8x4_t px = vld4_u8((const uint8_t*) src);

        const uint8x8_t plane0 = px.val[0];
        px.val[0] = px.val[2];
        px.val[2] = plane0;

        vst4_u8((uint8_t*) dst, px);
        src   += 8;
        dst   += 8;
        count -= 8;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}
233
// Expands `count` two-byte (gray, alpha) pairs from src into 32-bit pixels in dst,
// replicating gray into bytes 0-2 and placing alpha in byte 3. When kPremul is
// set, gray is scaled by alpha first. Runs 16 pixels at a time, then at most one
// group of 8, and leaves the remainder to the matching portable routine.
static void expand_grayA(bool kPremul, uint32_t dst[], const uint8_t* src, int count) {
    for (; count >= 16; count -= 16, src += 16*2, dst += 16) {
        // De-interleaving load: val[0] is gray, val[1] is alpha, 16 pixels each.
        const uint8x16x2_t in = vld2q_u8(src);
        uint8x16_t gray = in.val[0];
        const uint8x16_t alpha = in.val[1];

        if (kPremul) {
            // scale() works on 8 lanes, so process each half and recombine.
            const uint8x8_t lower = scale(vget_low_u8(gray),  vget_low_u8(alpha));
            const uint8x8_t upper = scale(vget_high_u8(gray), vget_high_u8(alpha));
            gray = vcombine_u8(lower, upper);
        }

        uint8x16x4_t out;
        out.val[0] = gray;
        out.val[1] = gray;
        out.val[2] = gray;
        out.val[3] = alpha;

        // Interleaving store of 16 pixels.
        vst4q_u8((uint8_t*) dst, out);
    }

    if (count >= 8) {
        // Same operation on a single group of 8 pixels.
        const uint8x8x2_t in = vld2_u8(src);
        uint8x8_t gray = in.val[0];
        const uint8x8_t alpha = in.val[1];

        if (kPremul) {
            gray = scale(gray, alpha);
        }

        uint8x8x4_t out;
        out.val[0] = gray;
        out.val[1] = gray;
        out.val[2] = gray;
        out.val[3] = alpha;

        vst4_u8((uint8_t*) dst, out);
        src   += 8*2;
        dst   += 8;
        count -= 8;
    }

    // Tail of [0,8) pixels.
    if (kPremul) {
        grayA_to_rgbA_portable(dst, src, count);
    } else {
        grayA_to_RGBA_portable(dst, src, count);
    }
}
286
287/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
288 expand_grayA(false, dst, src, count);
289}
290
291/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
292 expand_grayA(true, dst, src, count);
293}
294
// Output byte order selector for inverted_cmyk_to().
enum Format { kRGB1, kBGR1 };

// Converts `count` inverted-CMYK pixels (c, m, y, k in bytes 0..3) from src into
// opaque pixels in dst. c, m and y are each scaled by k; byte 3 becomes 0xFF.
// With kBGR1 the scaled c and y trade places (bytes 0 and 2). Groups of 8 pixels
// go through NEON; the remainder goes to the matching portable routine.
static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
    const bool wantBGR = (kBGR1 == format);

    for (; count >= 8; count -= 8, src += 8, dst += 8) {
        // De-interleaving load: val[i] holds byte i of each of the 8 pixels.
        uint8x8x4_t px = vld4_u8((const uint8_t*) src);

        const uint8x8_t black = px.val[3];
        const uint8x8_t blue  = scale(px.val[2], black);   // from y
        const uint8x8_t green = scale(px.val[1], black);   // from m
        const uint8x8_t red   = scale(px.val[0], black);   // from c

        px.val[3] = vdup_n_u8(0xFF);
        px.val[1] = green;
        if (wantBGR) {
            px.val[2] = red;
            px.val[0] = blue;
        } else {
            px.val[2] = blue;
            px.val[0] = red;
        }

        // Interleaving store of 8 converted pixels.
        vst4_u8((uint8_t*) dst, px);
    }

    // Tail of [0,8) pixels.
    if (wantBGR) {
        inverted_CMYK_to_BGR1_portable(dst, src, count);
    } else {
        inverted_CMYK_to_RGB1_portable(dst, src, count);
    }
}
332
333/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
334 inverted_cmyk_to(kRGB1, dst, src, count);
335}
336
337/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
338 inverted_cmyk_to(kBGR1, dst, src, count);
339}
340
341#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
// Multiplies two byte values per 16-bit lane and divides by 255 with rounding.
// Both inputs occupy 16-bit lanes but must not hold values wider than 8 bits.
static __m512i scale(__m512i x, __m512i y) {
    // With 8-bit inputs the product is at most 255*255 and fits in the lane.
    const __m512i product = _mm512_mullo_epi16(x, y);

    // (p+127)/255 == ((p+128)*257)>>16 for 0 <= p <= 255*255; the >>16 is obtained
    // by keeping only the high half of the 16x16-bit multiply.
    const __m512i biased = _mm512_add_epi16(product, _mm512_set1_epi16(128));
    return _mm512_mulhi_epu16(biased, _mm512_set1_epi16(257));
}
351
352static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
353
354 auto premul8 = [=](__m512i* lo, __m512i* hi) {
355 const __m512i zeros = _mm512_setzero_si512();
356 skvx::Vec<64, uint8_t> mask;
357 if (kSwapRB) {
358 mask = { 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
359 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
360 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
361 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15 };
362 } else {
363 mask = { 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
364 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
365 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
366 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15 };
367 }
368 __m512i planar = skvx::bit_pun<__m512i>(mask);
369
370 // Swizzle the pixels to 8-bit planar.
371 *lo = _mm512_shuffle_epi8(*lo, planar);
372 *hi = _mm512_shuffle_epi8(*hi, planar);
373 __m512i rg = _mm512_unpacklo_epi32(*lo, *hi),
374 ba = _mm512_unpackhi_epi32(*lo, *hi);
375
376 // Unpack to 16-bit planar.
377 __m512i r = _mm512_unpacklo_epi8(rg, zeros),
378 g = _mm512_unpackhi_epi8(rg, zeros),
379 b = _mm512_unpacklo_epi8(ba, zeros),
380 a = _mm512_unpackhi_epi8(ba, zeros);
381
382 // Premultiply!
383 r = scale(r, a);
384 g = scale(g, a);
385 b = scale(b, a);
386
387 // Repack into interlaced pixels.
388 rg = _mm512_or_si512(r, _mm512_slli_epi16(g, 8));
389 ba = _mm512_or_si512(b, _mm512_slli_epi16(a, 8));
390 *lo = _mm512_unpacklo_epi16(rg, ba);
391 *hi = _mm512_unpackhi_epi16(rg, ba);
392 };
393
394 while (count >= 32) {
395 __m512i lo = _mm512_loadu_si512((const __m512i*) (src + 0)),
396 hi = _mm512_loadu_si512((const __m512i*) (src + 16));
397
398 premul8(&lo, &hi);
399
400 _mm512_storeu_si512((__m512i*) (dst + 0), lo);
401 _mm512_storeu_si512((__m512i*) (dst + 16), hi);
402
403 src += 32;
404 dst += 32;
405 count -= 32;
406 }
407
408 if (count >= 16) {
409 __m512i lo = _mm512_loadu_si512((const __m512i*) src),
410 hi = _mm512_setzero_si512();
411
412 premul8(&lo, &hi);
413
414 _mm512_storeu_si512((__m512i*) dst, lo);
415
416 src += 16;
417 dst += 16;
418 count -= 16;
419 }
420
421 // Call portable code to finish up the tail of [0,16) pixels.
422 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
423 proc(dst, src, count);
424}
425
426/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
427 premul_should_swapRB(false, dst, src, count);
428}
429
430/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
431 premul_should_swapRB(true, dst, src, count);
432}
433
434/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
435 const uint8_t mask[64] = { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
436 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
437 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
438 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
439 const __m512i swapRB = _mm512_loadu_si512(mask);
440
441 while (count >= 16) {
442 __m512i rgba = _mm512_loadu_si512((const __m512i*) src);
443 __m512i bgra = _mm512_shuffle_epi8(rgba, swapRB);
444 _mm512_storeu_si512((__m512i*) dst, bgra);
445
446 src += 16;
447 dst += 16;
448 count -= 16;
449 }
450
451 RGBA_to_BGRA_portable(dst, src, count);
452}
453
454// Use SSSE3 impl as AVX2 / AVX-512 impl regresses performance for RGB_to_RGB1 / RGB_to_BGR1.
455
456// Use AVX2 impl as AVX-512 impl regresses performance for gray_to_RGB1.
457
458/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
459 while (count >= 32) {
460 __m512i ga = _mm512_loadu_si512((const __m512i*) src);
461
462 __m512i gg = _mm512_or_si512(_mm512_and_si512(ga, _mm512_set1_epi16(0x00FF)),
463 _mm512_slli_epi16(ga, 8));
464
465 __m512i ggga_lo = _mm512_unpacklo_epi16(gg, ga);
466 __m512i ggga_hi = _mm512_unpackhi_epi16(gg, ga);
467
468 // 1st shuffle for pixel reorder.
469 // Note. 'p' stands for 'ggga'
470 // Before 1st shuffle:
471 // ggga_lo = p0 p1 p2 p3 | p8 p9 p10 p11 | p16 p17 p18 p19 | p24 p25 p26 p27
472 // ggga_hi = p4 p5 p6 p7 | p12 p13 p14 p15 | p20 p21 p22 p23 | p28 p29 p30 p31
473 //
474 // After 1st shuffle:
475 // ggga_lo_shuffle_1 =
476 // p0 p1 p2 p3 | p8 p9 p10 p11 | p4 p5 p6 p7 | p12 p13 p14 p15
477 // ggga_hi_shuffle_1 =
478 // p16 p17 p18 p19 | p24 p25 p26 p27 | p20 p21 p22 p23 | p28 p29 p30 p31
479 __m512i ggga_lo_shuffle_1 = _mm512_shuffle_i32x4(ggga_lo, ggga_hi, 0x44),
480 ggga_hi_shuffle_1 = _mm512_shuffle_i32x4(ggga_lo, ggga_hi, 0xee);
481
482 // 2nd shuffle for pixel reorder.
483 // After the 2nd shuffle:
484 // ggga_lo_shuffle_2 =
485 // p0 p1 p2 p3 | p4 p5 p6 p7 | p8 p9 p10 p11 | p12 p13 p14 p15
486 // ggga_hi_shuffle_2 =
487 // p16 p17 p18 p19 | p20 p21 p22 p23 | p24 p25 p26 p27 | p28 p29 p30 p31
488 __m512i ggga_lo_shuffle_2 = _mm512_shuffle_i32x4(ggga_lo_shuffle_1,
489 ggga_lo_shuffle_1, 0xd8),
490 ggga_hi_shuffle_2 = _mm512_shuffle_i32x4(ggga_hi_shuffle_1,
491 ggga_hi_shuffle_1, 0xd8);
492
493 _mm512_storeu_si512((__m512i*) (dst + 0), ggga_lo_shuffle_2);
494 _mm512_storeu_si512((__m512i*) (dst + 16), ggga_hi_shuffle_2);
495
496 src += 32*2;
497 dst += 32;
498 count -= 32;
499 }
500
501 grayA_to_RGBA_portable(dst, src, count);
502}
503
504/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
505 while (count >= 32) {
506 __m512i grayA = _mm512_loadu_si512((const __m512i*) src);
507
508 __m512i g0 = _mm512_and_si512(grayA, _mm512_set1_epi16(0x00FF));
509 __m512i a0 = _mm512_srli_epi16(grayA, 8);
510
511 // Premultiply
512 g0 = scale(g0, a0);
513
514 __m512i gg = _mm512_or_si512(g0, _mm512_slli_epi16(g0, 8));
515 __m512i ga = _mm512_or_si512(g0, _mm512_slli_epi16(a0, 8));
516
517 __m512i ggga_lo = _mm512_unpacklo_epi16(gg, ga);
518 __m512i ggga_hi = _mm512_unpackhi_epi16(gg, ga);
519
520 // 1st shuffle for pixel reorder, same as grayA_to_RGBA.
521 __m512i ggga_lo_shuffle_1 = _mm512_shuffle_i32x4(ggga_lo, ggga_hi, 0x44),
522 ggga_hi_shuffle_1 = _mm512_shuffle_i32x4(ggga_lo, ggga_hi, 0xee);
523
524 // 2nd shuffle for pixel reorder, same as grayA_to_RGBA.
525 __m512i ggga_lo_shuffle_2 = _mm512_shuffle_i32x4(ggga_lo_shuffle_1,
526 ggga_lo_shuffle_1, 0xd8),
527 ggga_hi_shuffle_2 = _mm512_shuffle_i32x4(ggga_hi_shuffle_1,
528 ggga_hi_shuffle_1, 0xd8);
529
530 _mm512_storeu_si512((__m512i*) (dst + 0), ggga_lo_shuffle_2);
531 _mm512_storeu_si512((__m512i*) (dst + 16), ggga_hi_shuffle_2);
532
533 src += 32*2;
534 dst += 32;
535 count -= 32;
536 }
537
538 grayA_to_rgbA_portable(dst, src, count);
539}
540
541enum Format { kRGB1, kBGR1 };
542static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
543 auto convert8 = [=](__m512i* lo, __m512i* hi) {
544 const __m512i zeros = _mm512_setzero_si512();
545 skvx::Vec<64, uint8_t> mask;
546 if (kBGR1 == format) {
547 mask = { 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
548 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
549 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
550 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15 };
551 } else {
552 mask = { 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
553 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
554 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
555 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15 };
556 }
557 __m512i planar = skvx::bit_pun<__m512i>(mask);
558
559 // Swizzle the pixels to 8-bit planar.
560 *lo = _mm512_shuffle_epi8(*lo, planar);
561 *hi = _mm512_shuffle_epi8(*hi, planar);
562 __m512i cm = _mm512_unpacklo_epi32(*lo, *hi),
563 yk = _mm512_unpackhi_epi32(*lo, *hi);
564
565 // Unpack to 16-bit planar.
566 __m512i c = _mm512_unpacklo_epi8(cm, zeros),
567 m = _mm512_unpackhi_epi8(cm, zeros),
568 y = _mm512_unpacklo_epi8(yk, zeros),
569 k = _mm512_unpackhi_epi8(yk, zeros);
570
571 // Scale to r, g, b.
572 __m512i r = scale(c, k),
573 g = scale(m, k),
574 b = scale(y, k);
575
576 // Repack into interlaced pixels.
577 __m512i rg = _mm512_or_si512(r, _mm512_slli_epi16(g, 8)),
578 ba = _mm512_or_si512(b, _mm512_set1_epi16((uint16_t) 0xFF00));
579 *lo = _mm512_unpacklo_epi16(rg, ba);
580 *hi = _mm512_unpackhi_epi16(rg, ba);
581 };
582
583 while (count >= 32) {
584 __m512i lo = _mm512_loadu_si512((const __m512i*) (src + 0)),
585 hi = _mm512_loadu_si512((const __m512i*) (src + 16));
586
587 convert8(&lo, &hi);
588
589 _mm512_storeu_si512((__m512i*) (dst + 0), lo);
590 _mm512_storeu_si512((__m512i*) (dst + 16), hi);
591
592 src += 32;
593 dst += 32;
594 count -= 32;
595 }
596
597 if (count >= 16) {
598 __m512i lo = _mm512_loadu_si512((const __m512i*) src),
599 hi = _mm512_setzero_si512();
600
601 convert8(&lo, &hi);
602
603 _mm512_storeu_si512((__m512i*) dst, lo);
604
605 src += 16;
606 dst += 16;
607 count -= 16;
608 }
609
610 auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
611 proc(dst, src, count);
612}
613
614/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
615 inverted_cmyk_to(kRGB1, dst, src, count);
616}
617
618/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
619 inverted_cmyk_to(kBGR1, dst, src, count);
620}
621
622#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
623
// Multiplies two byte values per 16-bit lane and divides by 255 with rounding.
// Both inputs occupy 16-bit lanes but must not hold values wider than 8 bits.
static __m256i scale(__m256i x, __m256i y) {
    // With 8-bit inputs the product is at most 255*255 and fits in the lane.
    const __m256i product = _mm256_mullo_epi16(x, y);

    // (p+127)/255 == ((p+128)*257)>>16 for 0 <= p <= 255*255; the >>16 is obtained
    // by keeping only the high half of the 16x16-bit multiply.
    const __m256i biased = _mm256_add_epi16(product, _mm256_set1_epi16(128));
    return _mm256_mulhi_epu16(biased, _mm256_set1_epi16(257));
}
633
634static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
635
636 auto premul8 = [=](__m256i* lo, __m256i* hi) {
637 const __m256i zeros = _mm256_setzero_si256();
638 __m256i planar;
639 if (kSwapRB) {
640 planar = _mm256_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
641 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
642 } else {
643 planar = _mm256_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
644 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
645 }
646
647 // Swizzle the pixels to 8-bit planar.
648 *lo = _mm256_shuffle_epi8(*lo, planar); // rrrrgggg bbbbaaaa rrrrgggg bbbbaaaa
649 *hi = _mm256_shuffle_epi8(*hi, planar); // RRRRGGGG BBBBAAAA RRRRGGGG BBBBAAAA
650 __m256i rg = _mm256_unpacklo_epi32(*lo, *hi), // rrrrRRRR ggggGGGG rrrrRRRR ggggGGGG
651 ba = _mm256_unpackhi_epi32(*lo, *hi); // bbbbBBBB aaaaAAAA bbbbBBBB aaaaAAAA
652
653 // Unpack to 16-bit planar.
654 __m256i r = _mm256_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_R_R_R_ r_r_r_r_ R_R_R_R_
655 g = _mm256_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_G_G_G_ g_g_g_g_ G_G_G_G_
656 b = _mm256_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_ b_b_b_b_ B_B_B_B_
657 a = _mm256_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_ a_a_a_a_ A_A_A_A_
658
659 // Premultiply!
660 r = scale(r, a);
661 g = scale(g, a);
662 b = scale(b, a);
663
664 // Repack into interlaced pixels.
665 rg = _mm256_or_si256(r, _mm256_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG
666 ba = _mm256_or_si256(b, _mm256_slli_epi16(a, 8)); // babababa BABABABA babababa BABABABA
667 *lo = _mm256_unpacklo_epi16(rg, ba); // rgbargba rgbargba rgbargba rgbargba
668 *hi = _mm256_unpackhi_epi16(rg, ba); // RGBARGBA RGBARGBA RGBARGBA RGBARGBA
669 };
670
671 while (count >= 16) {
672 __m256i lo = _mm256_loadu_si256((const __m256i*) (src + 0)),
673 hi = _mm256_loadu_si256((const __m256i*) (src + 8));
674
675 premul8(&lo, &hi);
676
677 _mm256_storeu_si256((__m256i*) (dst + 0), lo);
678 _mm256_storeu_si256((__m256i*) (dst + 8), hi);
679
680 src += 16;
681 dst += 16;
682 count -= 16;
683 }
684
685 if (count >= 8) {
686 __m256i lo = _mm256_loadu_si256((const __m256i*) src),
687 hi = _mm256_setzero_si256();
688
689 premul8(&lo, &hi);
690
691 _mm256_storeu_si256((__m256i*) dst, lo);
692
693 src += 8;
694 dst += 8;
695 count -= 8;
696 }
697
698 // Call portable code to finish up the tail of [0,8) pixels.
699 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
700 proc(dst, src, count);
701}
702
703/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
704 premul_should_swapRB(false, dst, src, count);
705}
706
707/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
708 premul_should_swapRB(true, dst, src, count);
709}
710
711/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
712 const __m256i swapRB = _mm256_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
713 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
714
715 while (count >= 8) {
716 __m256i rgba = _mm256_loadu_si256((const __m256i*) src);
717 __m256i bgra = _mm256_shuffle_epi8(rgba, swapRB);
718 _mm256_storeu_si256((__m256i*) dst, bgra);
719
720 src += 8;
721 dst += 8;
722 count -= 8;
723 }
724
725 RGBA_to_BGRA_portable(dst, src, count);
726}
727
728/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
729 while (count >= 16) {
730 __m256i ga = _mm256_loadu_si256((const __m256i*) src);
731
732 __m256i gg = _mm256_or_si256(_mm256_and_si256(ga, _mm256_set1_epi16(0x00FF)),
733 _mm256_slli_epi16(ga, 8));
734
735 __m256i ggga_lo = _mm256_unpacklo_epi16(gg, ga);
736 __m256i ggga_hi = _mm256_unpackhi_epi16(gg, ga);
737
738 // Shuffle for pixel reorder
739 // Note. 'p' stands for 'ggga'
740 // Before shuffle:
741 // ggga_lo = p0 p1 p2 p3 | p8 p9 p10 p11
742 // ggga_hi = p4 p5 p6 p7 | p12 p13 p14 p15
743 //
744 // After shuffle:
745 // ggga_lo_shuffle = p0 p1 p2 p3 | p4 p5 p6 p7
746 // ggga_hi_shuffle = p8 p9 p10 p11 | p12 p13 p14 p15
747 __m256i ggga_lo_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x20),
748 ggga_hi_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x31);
749
750 _mm256_storeu_si256((__m256i*) (dst + 0), ggga_lo_shuffle);
751 _mm256_storeu_si256((__m256i*) (dst + 8), ggga_hi_shuffle);
752
753 src += 16*2;
754 dst += 16;
755 count -= 16;
756 }
757
758 grayA_to_RGBA_portable(dst, src, count);
759}
760
761/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
762 while (count >= 16) {
763 __m256i grayA = _mm256_loadu_si256((const __m256i*) src);
764
765 __m256i g0 = _mm256_and_si256(grayA, _mm256_set1_epi16(0x00FF));
766 __m256i a0 = _mm256_srli_epi16(grayA, 8);
767
768 // Premultiply
769 g0 = scale(g0, a0);
770
771 __m256i gg = _mm256_or_si256(g0, _mm256_slli_epi16(g0, 8));
772 __m256i ga = _mm256_or_si256(g0, _mm256_slli_epi16(a0, 8));
773
774 __m256i ggga_lo = _mm256_unpacklo_epi16(gg, ga);
775 __m256i ggga_hi = _mm256_unpackhi_epi16(gg, ga);
776
777 // Shuffle for pixel reorder, similar as grayA_to_RGBA
778 __m256i ggga_lo_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x20),
779 ggga_hi_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x31);
780
781 _mm256_storeu_si256((__m256i*) (dst + 0), ggga_lo_shuffle);
782 _mm256_storeu_si256((__m256i*) (dst + 8), ggga_hi_shuffle);
783
784 src += 16*2;
785 dst += 16;
786 count -= 16;
787 }
788
789 grayA_to_rgbA_portable(dst, src, count);
790}
791
792enum Format { kRGB1, kBGR1 };
793static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
794 auto convert8 = [=](__m256i* lo, __m256i* hi) {
795 const __m256i zeros = _mm256_setzero_si256();
796 __m256i planar;
797 if (kBGR1 == format) {
798 planar = _mm256_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
799 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
800 } else {
801 planar = _mm256_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
802 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
803 }
804
805 // Swizzle the pixels to 8-bit planar.
806 *lo = _mm256_shuffle_epi8(*lo, planar); // ccccmmmm yyyykkkk ccccmmmm yyyykkkk
807 *hi = _mm256_shuffle_epi8(*hi, planar); // CCCCMMMM YYYYKKKK CCCCMMMM YYYYKKKK
808 __m256i cm = _mm256_unpacklo_epi32(*lo, *hi), // ccccCCCC mmmmMMMM ccccCCCC mmmmMMMM
809 yk = _mm256_unpackhi_epi32(*lo, *hi); // yyyyYYYY kkkkKKKK yyyyYYYY kkkkKKKK
810
811 // Unpack to 16-bit planar.
812 __m256i c = _mm256_unpacklo_epi8(cm, zeros), // c_c_c_c_ C_C_C_C_ c_c_c_c_ C_C_C_C_
813 m = _mm256_unpackhi_epi8(cm, zeros), // m_m_m_m_ M_M_M_M_ m_m_m_m_ M_M_M_M_
814 y = _mm256_unpacklo_epi8(yk, zeros), // y_y_y_y_ Y_Y_Y_Y_ y_y_y_y_ Y_Y_Y_Y_
815 k = _mm256_unpackhi_epi8(yk, zeros); // k_k_k_k_ K_K_K_K_ k_k_k_k_ K_K_K_K_
816
817 // Scale to r, g, b.
818 __m256i r = scale(c, k),
819 g = scale(m, k),
820 b = scale(y, k);
821
822 // Repack into interlaced pixels:
823 // rg = rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG
824 // ba = b1b1b1b1 B1B1B1B1 b1b1b1b1 B1B1B1B1
825 __m256i rg = _mm256_or_si256(r, _mm256_slli_epi16(g, 8)),
826 ba = _mm256_or_si256(b, _mm256_set1_epi16((uint16_t) 0xFF00));
827 *lo = _mm256_unpacklo_epi16(rg, ba); // rgb1rgb1 rgb1rgb1 rgb1rgb1 rgb1rgb1
828 *hi = _mm256_unpackhi_epi16(rg, ba); // RGB1RGB1 RGB1RGB1 RGB1RGB1 RGB1RGB1
829 };
830
831 while (count >= 16) {
832 __m256i lo = _mm256_loadu_si256((const __m256i*) (src + 0)),
833 hi = _mm256_loadu_si256((const __m256i*) (src + 8));
834
835 convert8(&lo, &hi);
836
837 _mm256_storeu_si256((__m256i*) (dst + 0), lo);
838 _mm256_storeu_si256((__m256i*) (dst + 8), hi);
839
840 src += 16;
841 dst += 16;
842 count -= 16;
843 }
844
845 if (count >= 8) {
846 __m256i lo = _mm256_loadu_si256((const __m256i*) src),
847 hi = _mm256_setzero_si256();
848
849 convert8(&lo, &hi);
850
851 _mm256_storeu_si256((__m256i*) dst, lo);
852
853 src += 8;
854 dst += 8;
855 count -= 8;
856 }
857
858 auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
859 proc(dst, src, count);
860}
861
862/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
863 inverted_cmyk_to(kRGB1, dst, src, count);
864}
865
866/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
867 inverted_cmyk_to(kBGR1, dst, src, count);
868}
869
870#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
871
// Multiplies two byte values per 16-bit lane and divides by 255 with rounding.
// Both inputs occupy 16-bit lanes but must not hold values wider than 8 bits.
static __m128i scale(__m128i x, __m128i y) {
    // With 8-bit inputs the product is at most 255*255 and fits in the lane.
    const __m128i product = _mm_mullo_epi16(x, y);

    // (p+127)/255 == ((p+128)*257)>>16 for 0 <= p <= 255*255; the >>16 is obtained
    // by keeping only the high half of the 16x16-bit multiply.
    const __m128i biased = _mm_add_epi16(product, _mm_set1_epi16(128));
    return _mm_mulhi_epu16(biased, _mm_set1_epi16(257));
}
881
882static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
883
884 auto premul8 = [=](__m128i* lo, __m128i* hi) {
885 const __m128i zeros = _mm_setzero_si128();
886 __m128i planar;
887 if (kSwapRB) {
888 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
889 } else {
890 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
891 }
892
893 // Swizzle the pixels to 8-bit planar.
894 *lo = _mm_shuffle_epi8(*lo, planar); // rrrrgggg bbbbaaaa
895 *hi = _mm_shuffle_epi8(*hi, planar); // RRRRGGGG BBBBAAAA
896 __m128i rg = _mm_unpacklo_epi32(*lo, *hi), // rrrrRRRR ggggGGGG
897 ba = _mm_unpackhi_epi32(*lo, *hi); // bbbbBBBB aaaaAAAA
898
899 // Unpack to 16-bit planar.
900 __m128i r = _mm_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_R_R_R_
901 g = _mm_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_G_G_G_
902 b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_
903 a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_
904
905 // Premultiply!
906 r = scale(r, a);
907 g = scale(g, a);
908 b = scale(b, a);
909
910 // Repack into interlaced pixels.
911 rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG
912 ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BABABABA
913 *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba
914 *hi = _mm_unpackhi_epi16(rg, ba); // RGBARGBA RGBARGBA
915 };
916
917 while (count >= 8) {
918 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
919 hi = _mm_loadu_si128((const __m128i*) (src + 4));
920
921 premul8(&lo, &hi);
922
923 _mm_storeu_si128((__m128i*) (dst + 0), lo);
924 _mm_storeu_si128((__m128i*) (dst + 4), hi);
925
926 src += 8;
927 dst += 8;
928 count -= 8;
929 }
930
931 if (count >= 4) {
932 __m128i lo = _mm_loadu_si128((const __m128i*) src),
933 hi = _mm_setzero_si128();
934
935 premul8(&lo, &hi);
936
937 _mm_storeu_si128((__m128i*) dst, lo);
938
939 src += 4;
940 dst += 4;
941 count -= 4;
942 }
943
944 // Call portable code to finish up the tail of [0,4) pixels.
945 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
946 proc(dst, src, count);
947}
948
949/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
950 premul_should_swapRB(false, dst, src, count);
951}
952
953/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
954 premul_should_swapRB(true, dst, src, count);
955}
956
957/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
958 const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
959
960 while (count >= 4) {
961 __m128i rgba = _mm_loadu_si128((const __m128i*) src);
962 __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
963 _mm_storeu_si128((__m128i*) dst, bgra);
964
965 src += 4;
966 dst += 4;
967 count -= 4;
968 }
969
970 RGBA_to_BGRA_portable(dst, src, count);
971}
972
973/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
974 while (count >= 8) {
975 __m128i ga = _mm_loadu_si128((const __m128i*) src);
976
977 __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
978 _mm_slli_epi16(ga, 8));
979
980 __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
981 __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
982
983 _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
984 _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);
985
986 src += 8*2;
987 dst += 8;
988 count -= 8;
989 }
990
991 grayA_to_RGBA_portable(dst, src, count);
992}
993
994/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
995 while (count >= 8) {
996 __m128i grayA = _mm_loadu_si128((const __m128i*) src);
997
998 __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
999 __m128i a0 = _mm_srli_epi16(grayA, 8);
1000
1001 // Premultiply
1002 g0 = scale(g0, a0);
1003
1004 __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
1005 __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));
1006
1007
1008 __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
1009 __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
1010
1011 _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
1012 _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);
1013
1014 src += 8*2;
1015 dst += 8;
1016 count -= 8;
1017 }
1018
1019 grayA_to_rgbA_portable(dst, src, count);
1020}
1021
// Output channel order requested from inverted_cmyk_to().
enum Format {
    kRGB1,  // color bytes written in R, G, B order, followed by 0xFF
    kBGR1   // color bytes written in B, G, R order, followed by 0xFF
};
1023static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
1024 auto convert8 = [=](__m128i* lo, __m128i* hi) {
1025 const __m128i zeros = _mm_setzero_si128();
1026 __m128i planar;
1027 if (kBGR1 == format) {
1028 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
1029 } else {
1030 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
1031 }
1032
1033 // Swizzle the pixels to 8-bit planar.
1034 *lo = _mm_shuffle_epi8(*lo, planar); // ccccmmmm yyyykkkk
1035 *hi = _mm_shuffle_epi8(*hi, planar); // CCCCMMMM YYYYKKKK
1036 __m128i cm = _mm_unpacklo_epi32(*lo, *hi), // ccccCCCC mmmmMMMM
1037 yk = _mm_unpackhi_epi32(*lo, *hi); // yyyyYYYY kkkkKKKK
1038
1039 // Unpack to 16-bit planar.
1040 __m128i c = _mm_unpacklo_epi8(cm, zeros), // c_c_c_c_ C_C_C_C_
1041 m = _mm_unpackhi_epi8(cm, zeros), // m_m_m_m_ M_M_M_M_
1042 y = _mm_unpacklo_epi8(yk, zeros), // y_y_y_y_ Y_Y_Y_Y_
1043 k = _mm_unpackhi_epi8(yk, zeros); // k_k_k_k_ K_K_K_K_
1044
1045 // Scale to r, g, b.
1046 __m128i r = scale(c, k),
1047 g = scale(m, k),
1048 b = scale(y, k);
1049
1050 // Repack into interlaced pixels.
1051 __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)), // rgrgrgrg RGRGRGRG
1052 ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00)); // b1b1b1b1 B1B1B1B1
1053 *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba
1054 *hi = _mm_unpackhi_epi16(rg, ba); // RGB1RGB1 RGB1RGB1
1055 };
1056
1057 while (count >= 8) {
1058 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
1059 hi = _mm_loadu_si128((const __m128i*) (src + 4));
1060
1061 convert8(&lo, &hi);
1062
1063 _mm_storeu_si128((__m128i*) (dst + 0), lo);
1064 _mm_storeu_si128((__m128i*) (dst + 4), hi);
1065
1066 src += 8;
1067 dst += 8;
1068 count -= 8;
1069 }
1070
1071 if (count >= 4) {
1072 __m128i lo = _mm_loadu_si128((const __m128i*) src),
1073 hi = _mm_setzero_si128();
1074
1075 convert8(&lo, &hi);
1076
1077 _mm_storeu_si128((__m128i*) dst, lo);
1078
1079 src += 4;
1080 dst += 4;
1081 count -= 4;
1082 }
1083
1084 auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
1085 proc(dst, src, count);
1086}
1087
1088/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
1089 inverted_cmyk_to(kRGB1, dst, src, count);
1090}
1091
1092/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
1093 inverted_cmyk_to(kBGR1, dst, src, count);
1094}
1095
1096#else
1097
1098/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
1099 RGBA_to_rgbA_portable(dst, src, count);
1100}
1101
1102/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
1103 RGBA_to_bgrA_portable(dst, src, count);
1104}
1105
1106/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
1107 RGBA_to_BGRA_portable(dst, src, count);
1108}
1109
1110/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
1111 grayA_to_RGBA_portable(dst, src, count);
1112}
1113
1114/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
1115 grayA_to_rgbA_portable(dst, src, count);
1116}
1117
1118/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
1119 inverted_CMYK_to_RGB1_portable(dst, src, count);
1120}
1121
1122/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
1123 inverted_CMYK_to_BGR1_portable(dst, src, count);
1124}
1125
1126#endif
1127
1128// Basically as above, but we found no benefit from AVX-512 for gray_to_RGB1.
// Scalar conversion of `count` 8-bit gray samples into 32-bit pixels.  Each
// output word is 0xFF in the top byte with the gray value repeated in the
// three lower bytes.
static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; ++i) {
        const uint32_t gray = src[i];
        // Multiplying by 0x010101 replicates the byte into bits 0-7, 8-15 and 16-23.
        dst[i] = 0xFF000000u | (gray * 0x00010101u);
    }
}
1137#if defined(SK_ARM_HAS_NEON)
// NEON gray -> opaque pixel expansion.  Handles 16 pixels per iteration, then
// at most one group of 8, and leaves the remaining [0,8) pixels to
// gray_to_RGB1_portable().
/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    for (; count >= 16; count -= 16, src += 16, dst += 16) {
        const uint8x16_t gray = vld1q_u8(src);

        // Three identical color planes plus a constant 0xFF plane.
        uint8x16x4_t planes;
        planes.val[3] = vdupq_n_u8(0xFF);
        planes.val[2] = gray;
        planes.val[1] = gray;
        planes.val[0] = gray;

        // The interleaving store writes the four planes out as 16 packed pixels.
        vst4q_u8((uint8_t*) dst, planes);
    }

    if (count >= 8) {
        const uint8x8_t gray = vld1_u8(src);

        uint8x8x4_t planes;
        planes.val[3] = vdup_n_u8(0xFF);
        planes.val[2] = gray;
        planes.val[1] = gray;
        planes.val[0] = gray;

        vst4_u8((uint8_t*) dst, planes);

        count -= 8;
        src   += 8;
        dst   += 8;
    }

    gray_to_RGB1_portable(dst, src, count);
}
1175#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
1176 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1177 const __m256i alphas = _mm256_set1_epi8((uint8_t) 0xFF);
1178 while (count >= 32) {
1179 __m256i grays = _mm256_loadu_si256((const __m256i*) src);
1180
1181 __m256i gg_lo = _mm256_unpacklo_epi8(grays, grays);
1182 __m256i gg_hi = _mm256_unpackhi_epi8(grays, grays);
1183 __m256i ga_lo = _mm256_unpacklo_epi8(grays, alphas);
1184 __m256i ga_hi = _mm256_unpackhi_epi8(grays, alphas);
1185
1186 __m256i ggga0 = _mm256_unpacklo_epi16(gg_lo, ga_lo);
1187 __m256i ggga1 = _mm256_unpackhi_epi16(gg_lo, ga_lo);
1188 __m256i ggga2 = _mm256_unpacklo_epi16(gg_hi, ga_hi);
1189 __m256i ggga3 = _mm256_unpackhi_epi16(gg_hi, ga_hi);
1190
1191 // Shuffle for pixel reorder.
1192 // Note. 'p' stands for 'ggga'
1193 // Before shuffle:
1194 // ggga0 = p0 p1 p2 p3 | p16 p17 p18 p19
1195 // ggga1 = p4 p5 p6 p7 | p20 p21 p22 p23
1196 // ggga2 = p8 p9 p10 p11 | p24 p25 p26 p27
1197 // ggga3 = p12 p13 p14 p15 | p28 p29 p30 p31
1198 //
1199 // After shuffle:
1200 // ggga0_shuffle = p0 p1 p2 p3 | p4 p5 p6 p7
1201 // ggga1_shuffle = p8 p9 p10 p11 | p12 p13 p14 p15
1202 // ggga2_shuffle = p16 p17 p18 p19 | p20 p21 p22 p23
1203 // ggga3_shuffle = p24 p25 p26 p27 | p28 p29 p30 p31
1204 __m256i ggga0_shuffle = _mm256_permute2x128_si256(ggga0, ggga1, 0x20),
1205 ggga1_shuffle = _mm256_permute2x128_si256(ggga2, ggga3, 0x20),
1206 ggga2_shuffle = _mm256_permute2x128_si256(ggga0, ggga1, 0x31),
1207 ggga3_shuffle = _mm256_permute2x128_si256(ggga2, ggga3, 0x31);
1208
1209 _mm256_storeu_si256((__m256i*) (dst + 0), ggga0_shuffle);
1210 _mm256_storeu_si256((__m256i*) (dst + 8), ggga1_shuffle);
1211 _mm256_storeu_si256((__m256i*) (dst + 16), ggga2_shuffle);
1212 _mm256_storeu_si256((__m256i*) (dst + 24), ggga3_shuffle);
1213
1214 src += 32;
1215 dst += 32;
1216 count -= 32;
1217 }
1218 gray_to_RGB1_portable(dst, src, count);
1219 }
1220#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 // TODO: just check >= SSE2?
1221 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1222 const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
1223 while (count >= 16) {
1224 __m128i grays = _mm_loadu_si128((const __m128i*) src);
1225
1226 __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
1227 __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
1228 __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
1229 __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);
1230
1231 __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
1232 __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
1233 __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
1234 __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);
1235
1236 _mm_storeu_si128((__m128i*) (dst + 0), ggga0);
1237 _mm_storeu_si128((__m128i*) (dst + 4), ggga1);
1238 _mm_storeu_si128((__m128i*) (dst + 8), ggga2);
1239 _mm_storeu_si128((__m128i*) (dst + 12), ggga3);
1240
1241 src += 16;
1242 dst += 16;
1243 count -= 16;
1244 }
1245 gray_to_RGB1_portable(dst, src, count);
1246 }
1247#else
1248 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1249 gray_to_RGB1_portable(dst, src, count);
1250 }
1251#endif
1252
1253// Again as above, this time not even finding benefit from AVX2 for RGB_to_{RGB,BGR}1.
// Scalar conversion of `count` packed 3-byte pixels into 32-bit pixels.  The
// first, second and third source bytes land in bits 0-7, 8-15 and 16-23 of the
// output word, and bits 24-31 are set to 0xFF.
static void RGB_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; ++i, src += 3) {
        const uint32_t red   = src[0];
        const uint32_t green = src[1];
        const uint32_t blue  = src[2];
        dst[i] = 0xFF000000u | (blue << 16) | (green << 8) | red;
    }
}
// Scalar conversion of `count` packed 3-byte pixels into 32-bit pixels with the
// first and third bytes exchanged.  The third, second and first source bytes
// land in bits 0-7, 8-15 and 16-23 of the output word, and bits 24-31 are set
// to 0xFF.
static void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; ++i, src += 3) {
        const uint32_t red   = src[0];
        const uint32_t green = src[1];
        const uint32_t blue  = src[2];
        dst[i] = 0xFF000000u | (red << 16) | (green << 8) | blue;
    }
}
1278#if defined(SK_ARM_HAS_NEON)
// NEON conversion of packed 3-byte pixels to 4-byte pixels with a constant
// 0xFF fourth byte, optionally exchanging the first and third channels.
// Handles 16 pixels per iteration, then at most one group of 8, and leaves the
// remaining [0,8) pixels to the matching portable routine.
static void insert_alpha_should_swaprb(bool kSwapRB,
                                       uint32_t dst[], const uint8_t* src, int count) {
    // Which de-interleaved source plane feeds output byte 0 and output byte 2.
    const int first = kSwapRB ? 2 : 0;
    const int third = 2 - first;

    for (; count >= 16; count -= 16, src += 16*3, dst += 16) {
        // De-interleaving load: 16 pixels split into three planes.
        const uint8x16x3_t in = vld3q_u8(src);

        uint8x16x4_t out;
        out.val[0] = in.val[first];
        out.val[1] = in.val[1];
        out.val[2] = in.val[third];
        out.val[3] = vdupq_n_u8(0xFF);

        // Interleaving store: 16 packed 4-byte pixels.
        vst4q_u8((uint8_t*) dst, out);
    }

    if (count >= 8) {
        const uint8x8x3_t in = vld3_u8(src);

        uint8x8x4_t out;
        out.val[0] = in.val[first];
        out.val[1] = in.val[1];
        out.val[2] = in.val[third];
        out.val[3] = vdup_n_u8(0xFF);

        vst4_u8((uint8_t*) dst, out);

        count -= 8;
        src   += 8*3;
        dst   += 8;
    }

    // Finish the remaining [0,8) pixels with scalar code.
    auto finish = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    finish(dst, src, count);
}
1331
1332 /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1333 insert_alpha_should_swaprb(false, dst, src, count);
1334 }
1335 /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
1336 insert_alpha_should_swaprb(true, dst, src, count);
1337 }
1338#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
1339 static void insert_alpha_should_swaprb(bool kSwapRB,
1340 uint32_t dst[], const uint8_t* src, int count) {
1341 const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
1342 __m128i expand;
1343 const uint8_t X = 0xFF; // Used a placeholder. The value of X is irrelevant.
1344 if (kSwapRB) {
1345 expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
1346 } else {
1347 expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
1348 }
1349
1350 while (count >= 6) {
1351 // Load a vector. While this actually contains 5 pixels plus an
1352 // extra component, we will discard all but the first four pixels on
1353 // this iteration.
1354 __m128i rgb = _mm_loadu_si128((const __m128i*) src);
1355
1356 // Expand the first four pixels to RGBX and then mask to RGB(FF).
1357 __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);
1358
1359 // Store 4 pixels.
1360 _mm_storeu_si128((__m128i*) dst, rgba);
1361
1362 src += 4*3;
1363 dst += 4;
1364 count -= 4;
1365 }
1366
1367 // Call portable code to finish up the tail of [0,4) pixels.
1368 auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
1369 proc(dst, src, count);
1370 }
1371
1372 /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1373 insert_alpha_should_swaprb(false, dst, src, count);
1374 }
1375 /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
1376 insert_alpha_should_swaprb(true, dst, src, count);
1377 }
1378#else
1379 /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
1380 RGB_to_RGB1_portable(dst, src, count);
1381 }
1382 /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
1383 RGB_to_BGR1_portable(dst, src, count);
1384 }
1385#endif
1386
1387} // namespace SK_OPTS_NS
1388
1389#endif // SkSwizzler_opts_DEFINED
1390