/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkSwizzler_opts_DEFINED
#define SkSwizzler_opts_DEFINED

#include "include/private/SkColorData.h"
#include "include/private/SkVx.h"

#include <utility>

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>
#endif

namespace SK_OPTS_NS {
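
// Naming convention used throughout (as read from the code below): uppercase
// channel letters are unpremultiplied, lowercase letters are premultiplied by
// alpha, and a trailing '1' means the alpha lane is filled with opaque 0xFF.
// Pixels travel as packed uint32_t values; e.g. an RGBA pixel is assembled as
//
//     uint32_t px = (uint32_t)a << 24 | (uint32_t)b << 16
//                 | (uint32_t)g <<  8 | (uint32_t)r <<  0;
//
// so on a little-endian machine the bytes land in memory as R,G,B,A, which is
// the layout the vector loads below assume.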

static void RGBA_to_rgbA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)b << 16
               | (uint32_t)g <<  8
               | (uint32_t)r <<  0;
    }
}

static void RGBA_to_bgrA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

static void RGBA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

static void grayA_to_RGBA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}

static void grayA_to_rgbA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        g = (g*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}

static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t k = (src[i] >> 24) & 0xFF,
                y = (src[i] >> 16) & 0xFF,
                m = (src[i] >>  8) & 0xFF,
                c = (src[i] >>  0) & 0xFF;
        // See comments in SkSwizzler.cpp for details on the conversion formula.
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)   b << 16
               | (uint32_t)   g <<  8
               | (uint32_t)   r <<  0;
    }
}

static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t k = (src[i] >> 24) & 0xFF,
                y = (src[i] >> 16) & 0xFF,
                m = (src[i] >>  8) & 0xFF,
                c = (src[i] >>  0) & 0xFF;
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)   r << 16
               | (uint32_t)   g <<  8
               | (uint32_t)   b <<  0;
    }
}
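
// A note on the "inverted CMYK" above (hedged reading; SkSwizzler.cpp has the
// full derivation): Adobe-style JPEGs store the complement of each channel,
// i.e. c = 255-C, m = 255-M, y = 255-Y, k = 255-K.  With that inversion the
// usual conversion collapses to one rounded scale per channel:
//
//     r = (c*k + 127) / 255,  g = (m*k + 127) / 255,  b = (y*k + 127) / 255
//
// which is exactly what the loops above compute on the stored bytes.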

#if defined(SK_ARM_HAS_NEON)

// Rounded divide by 255, (x + 127) / 255
static uint8x8_t div255_round(uint16x8_t x) {
    // result = (x + 127) / 255
    // result = (x + 127) / 256 + error1
    //
    // error1 = (x + 127) / (255 * 256)
    // error1 = (x + 127) / (256 * 256) + error2
    //
    // error2 = (x + 127) / (255 * 256 * 256)
    //
    // The maximum value of error2 is too small to matter.  Thus:
    // result = (x + 127) / 256 + (x + 127) / (256 * 256)
    // result = ((x + 127) / 256 + x + 127) / 256
    // result = (((x + 127) >> 8) + x + 127) >> 8
    //
    // Use >>> to represent "rounded right shift" which, conveniently,
    // NEON supports in one instruction.
    // result = ((x >>> 8) + x) >>> 8
    //
    // Note that the second right shift is actually performed as an
    // "add, round, and narrow back to 8 bits" instruction.
    return vraddhn_u16(x, vrshrq_n_u16(x, 8));
}
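
// A scalar sketch of the same math, for checking the derivation above
// (illustration only; the helper name is hypothetical and nothing below is
// part of this header's API).  A rounded right shift v >>> 8 is
// (v + 128) >> 8 on scalars, so the NEON sequence computes:
//
//     uint8_t div255_round_scalar(uint32_t x) {  // 0 <= x <= 255*255
//         uint32_t t = ((x + 128) >> 8) + x;     // (x >>> 8) + x
//         return (uint8_t)((t + 128) >> 8);      // >>> 8, narrowed to 8 bits
//     }
//
// which agrees with (x + 127) / 255 for every x in [0, 255*255].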

// Scale a byte by another, (x * y + 127) / 255
static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    return div255_round(vmull_u8(x, y));
}
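
// E.g. scale(0x80, 0x80) = (128*128 + 127) / 255 = 64: premultiplying a
// 50%-gray channel by 50% alpha halves it, rounding to nearest.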

static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
    while (count >= 8) {
        // Load 8 pixels; vld4 de-interleaves them into 4 planar registers.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        uint8x8_t a = rgba.val[3],
                  b = rgba.val[2],
                  g = rgba.val[1],
                  r = rgba.val[0];

        // Premultiply.
        b = scale(b, a);
        g = scale(g, a);
        r = scale(r, a);

        // Store 8 premultiplied pixels.
        if (kSwapRB) {
            rgba.val[2] = r;
            rgba.val[1] = g;
            rgba.val[0] = b;
        } else {
            rgba.val[2] = b;
            rgba.val[1] = g;
            rgba.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(false, dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(true, dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    using std::swap;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);

        // Swap r and b.
        swap(rgba.val[0], rgba.val[2]);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        // Swap r and b.
        swap(rgba.val[0], rgba.val[2]);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

static void expand_grayA(bool kPremul, uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x2_t ga = vld2q_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            ga.val[0] = vcombine_u8(
                    scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
                    scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
        }

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*2;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x2_t ga = vld2_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            ga.val[0] = scale(ga.val[0], ga.val[1]);
        }

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*2;
        dst += 8;
        count -= 8;
    }

    auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    expand_grayA(false, dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    expand_grayA(true, dst, src, count);
}

enum Format { kRGB1, kBGR1 };
static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
    while (count >= 8) {
        // Load 8 cmyk pixels.
        uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);

        uint8x8_t k = pixels.val[3],
                  y = pixels.val[2],
                  m = pixels.val[1],
                  c = pixels.val[0];

        // Scale to r, g, b.
        uint8x8_t b = scale(y, k);
        uint8x8_t g = scale(m, k);
        uint8x8_t r = scale(c, k);

        // Store 8 rgba pixels.
        if (kBGR1 == format) {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = r;
            pixels.val[1] = g;
            pixels.val[0] = b;
        } else {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = b;
            pixels.val[1] = g;
            pixels.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, pixels);
        src += 8;
        dst += 8;
        count -= 8;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kRGB1, dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kBGR1, dst, src, count);
}

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
// Scale a byte by another.
// Inputs are stored in 16-bit lanes, but are no larger than 8 bits.
static __m512i scale(__m512i x, __m512i y) {
    const __m512i _128 = _mm512_set1_epi16(128);
    const __m512i _257 = _mm512_set1_epi16(257);

    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
    return _mm512_mulhi_epu16(_mm512_add_epi16(_mm512_mullo_epi16(x, y), _128), _257);
}
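
// A quick scalar check of that identity (illustration only):
// ((x+128)*257) >> 16 expands to (257*x + 32896) >> 16, and e.g.
//
//     x = 127:      (255*257) >> 16 = 65535    >> 16 = 0   == (127+127)/255
//     x = 128:      (256*257) >> 16 = 65792    >> 16 = 1   == (128+127)/255
//     x = 255*255:                   16744321  >> 16 = 255 == (65025+127)/255
//
// Products of two bytes fit in the 16-bit lanes (255*255 = 65025 < 65536),
// so the mullo/add/mulhi sequence computes the rounded divide exactly.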

static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {

    auto premul8 = [=](__m512i* lo, __m512i* hi) {
        const __m512i zeros = _mm512_setzero_si512();
        skvx::Vec<64, uint8_t> mask;
        if (kSwapRB) {
            mask = { 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                     2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                     2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                     2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15 };
        } else {
            mask = { 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                     0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                     0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                     0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15 };
        }
        __m512i planar = skvx::bit_pun<__m512i>(mask);

        // Swizzle the pixels to 8-bit planar.  (The 512-bit shuffles and
        // unpacks used here operate within each 128-bit lane, so every
        // commented pattern repeats four times across the register.)
        *lo = _mm512_shuffle_epi8(*lo, planar);        // rrrrgggg bbbbaaaa x4
        *hi = _mm512_shuffle_epi8(*hi, planar);        // RRRRGGGG BBBBAAAA x4
        __m512i rg = _mm512_unpacklo_epi32(*lo, *hi),  // rrrrRRRR ggggGGGG x4
                ba = _mm512_unpackhi_epi32(*lo, *hi);  // bbbbBBBB aaaaAAAA x4

        // Unpack to 16-bit planar.
        __m512i r = _mm512_unpacklo_epi8(rg, zeros),   // r_r_r_r_ R_R_R_R_ x4
                g = _mm512_unpackhi_epi8(rg, zeros),   // g_g_g_g_ G_G_G_G_ x4
                b = _mm512_unpacklo_epi8(ba, zeros),   // b_b_b_b_ B_B_B_B_ x4
                a = _mm512_unpackhi_epi8(ba, zeros);   // a_a_a_a_ A_A_A_A_ x4

        // Premultiply!
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg  = _mm512_or_si512(r, _mm512_slli_epi16(g, 8));  // rgrgrgrg RGRGRGRG x4
        ba  = _mm512_or_si512(b, _mm512_slli_epi16(a, 8));  // babababa BABABABA x4
        *lo = _mm512_unpacklo_epi16(rg, ba);                // rgbargba rgbargba x4
        *hi = _mm512_unpackhi_epi16(rg, ba);                // RGBARGBA RGBARGBA x4
    };

    while (count >= 32) {
        __m512i lo = _mm512_loadu_si512((const __m512i*) (src + 0)),
                hi = _mm512_loadu_si512((const __m512i*) (src + 16));

        premul8(&lo, &hi);

        _mm512_storeu_si512((__m512i*) (dst + 0), lo);
        _mm512_storeu_si512((__m512i*) (dst + 16), hi);

        src += 32;
        dst += 32;
        count -= 32;
    }

    if (count >= 16) {
        __m512i lo = _mm512_loadu_si512((const __m512i*) src),
                hi = _mm512_setzero_si512();

        premul8(&lo, &hi);

        _mm512_storeu_si512((__m512i*) dst, lo);

        src += 16;
        dst += 16;
        count -= 16;
    }

    // Call portable code to finish up the tail of [0,16) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(false, dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(true, dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    const uint8_t mask[64] = { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
                               2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
                               2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
                               2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
    const __m512i swapRB = _mm512_loadu_si512(mask);

    while (count >= 16) {
        __m512i rgba = _mm512_loadu_si512((const __m512i*) src);
        __m512i bgra = _mm512_shuffle_epi8(rgba, swapRB);
        _mm512_storeu_si512((__m512i*) dst, bgra);

        src += 16;
        dst += 16;
        count -= 16;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

// Use the SSSE3 impl for RGB_to_RGB1 and RGB_to_BGR1: the AVX2 and AVX-512
// impls regress their performance.

// Use the AVX2 impl for gray_to_RGB1: the AVX-512 impl regresses its
// performance.

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 32) {
        __m512i ga = _mm512_loadu_si512((const __m512i*) src);

        __m512i gg = _mm512_or_si512(_mm512_and_si512(ga, _mm512_set1_epi16(0x00FF)),
                                     _mm512_slli_epi16(ga, 8));

        __m512i ggga_lo = _mm512_unpacklo_epi16(gg, ga);
        __m512i ggga_hi = _mm512_unpackhi_epi16(gg, ga);

        // 1st shuffle for pixel reorder.
        // Note: 'p' stands for a 'ggga' pixel.
        // Before the 1st shuffle:
        //   ggga_lo = p0  p1  p2  p3  | p8  p9  p10 p11 | p16 p17 p18 p19 | p24 p25 p26 p27
        //   ggga_hi = p4  p5  p6  p7  | p12 p13 p14 p15 | p20 p21 p22 p23 | p28 p29 p30 p31
        //
        // After the 1st shuffle:
        //   ggga_lo_shuffle_1 = p0  p1  p2  p3  | p8  p9  p10 p11 | p4  p5  p6  p7  | p12 p13 p14 p15
        //   ggga_hi_shuffle_1 = p16 p17 p18 p19 | p24 p25 p26 p27 | p20 p21 p22 p23 | p28 p29 p30 p31
        __m512i ggga_lo_shuffle_1 = _mm512_shuffle_i32x4(ggga_lo, ggga_hi, 0x44),
                ggga_hi_shuffle_1 = _mm512_shuffle_i32x4(ggga_lo, ggga_hi, 0xee);

        // 2nd shuffle for pixel reorder.
        // After the 2nd shuffle:
        //   ggga_lo_shuffle_2 = p0  p1  p2  p3  | p4  p5  p6  p7  | p8  p9  p10 p11 | p12 p13 p14 p15
        //   ggga_hi_shuffle_2 = p16 p17 p18 p19 | p20 p21 p22 p23 | p24 p25 p26 p27 | p28 p29 p30 p31
        __m512i ggga_lo_shuffle_2 = _mm512_shuffle_i32x4(ggga_lo_shuffle_1,
                                                         ggga_lo_shuffle_1, 0xd8),
                ggga_hi_shuffle_2 = _mm512_shuffle_i32x4(ggga_hi_shuffle_1,
                                                         ggga_hi_shuffle_1, 0xd8);

        _mm512_storeu_si512((__m512i*) (dst + 0), ggga_lo_shuffle_2);
        _mm512_storeu_si512((__m512i*) (dst + 16), ggga_hi_shuffle_2);

        src += 32*2;
        dst += 32;
        count -= 32;
    }

    grayA_to_RGBA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 32) {
        __m512i grayA = _mm512_loadu_si512((const __m512i*) src);

        __m512i g0 = _mm512_and_si512(grayA, _mm512_set1_epi16(0x00FF));
        __m512i a0 = _mm512_srli_epi16(grayA, 8);

        // Premultiply
        g0 = scale(g0, a0);

        __m512i gg = _mm512_or_si512(g0, _mm512_slli_epi16(g0, 8));
        __m512i ga = _mm512_or_si512(g0, _mm512_slli_epi16(a0, 8));

        __m512i ggga_lo = _mm512_unpacklo_epi16(gg, ga);
        __m512i ggga_hi = _mm512_unpackhi_epi16(gg, ga);

        // 1st shuffle for pixel reorder, same as grayA_to_RGBA.
        __m512i ggga_lo_shuffle_1 = _mm512_shuffle_i32x4(ggga_lo, ggga_hi, 0x44),
                ggga_hi_shuffle_1 = _mm512_shuffle_i32x4(ggga_lo, ggga_hi, 0xee);

        // 2nd shuffle for pixel reorder, same as grayA_to_RGBA.
        __m512i ggga_lo_shuffle_2 = _mm512_shuffle_i32x4(ggga_lo_shuffle_1,
                                                         ggga_lo_shuffle_1, 0xd8),
                ggga_hi_shuffle_2 = _mm512_shuffle_i32x4(ggga_hi_shuffle_1,
                                                         ggga_hi_shuffle_1, 0xd8);

        _mm512_storeu_si512((__m512i*) (dst + 0), ggga_lo_shuffle_2);
        _mm512_storeu_si512((__m512i*) (dst + 16), ggga_hi_shuffle_2);

        src += 32*2;
        dst += 32;
        count -= 32;
    }

    grayA_to_rgbA_portable(dst, src, count);
}

enum Format { kRGB1, kBGR1 };
static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
    auto convert8 = [=](__m512i* lo, __m512i* hi) {
        const __m512i zeros = _mm512_setzero_si512();
        skvx::Vec<64, uint8_t> mask;
        if (kBGR1 == format) {
            mask = { 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                     2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                     2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                     2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15 };
        } else {
            mask = { 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                     0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                     0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                     0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15 };
        }
        __m512i planar = skvx::bit_pun<__m512i>(mask);

        // Swizzle the pixels to 8-bit planar.  (As in premul8 above, each
        // commented pattern repeats once per 128-bit lane.)
        *lo = _mm512_shuffle_epi8(*lo, planar);        // ccccmmmm yyyykkkk x4
        *hi = _mm512_shuffle_epi8(*hi, planar);        // CCCCMMMM YYYYKKKK x4
        __m512i cm = _mm512_unpacklo_epi32(*lo, *hi),  // ccccCCCC mmmmMMMM x4
                yk = _mm512_unpackhi_epi32(*lo, *hi);  // yyyyYYYY kkkkKKKK x4

        // Unpack to 16-bit planar.
        __m512i c = _mm512_unpacklo_epi8(cm, zeros),   // c_c_c_c_ C_C_C_C_ x4
                m = _mm512_unpackhi_epi8(cm, zeros),   // m_m_m_m_ M_M_M_M_ x4
                y = _mm512_unpacklo_epi8(yk, zeros),   // y_y_y_y_ Y_Y_Y_Y_ x4
                k = _mm512_unpackhi_epi8(yk, zeros);   // k_k_k_k_ K_K_K_K_ x4

        // Scale to r, g, b.
        __m512i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interlaced pixels.
        __m512i rg = _mm512_or_si512(r, _mm512_slli_epi16(g, 8)),              // rgrgrgrg RGRGRGRG x4
                ba = _mm512_or_si512(b, _mm512_set1_epi16((uint16_t) 0xFF00)); // b1b1b1b1 B1B1B1B1 x4
        *lo = _mm512_unpacklo_epi16(rg, ba);                                   // rgb1rgb1 rgb1rgb1 x4
        *hi = _mm512_unpackhi_epi16(rg, ba);                                   // RGB1RGB1 RGB1RGB1 x4
    };

    while (count >= 32) {
        __m512i lo = _mm512_loadu_si512((const __m512i*) (src + 0)),
                hi = _mm512_loadu_si512((const __m512i*) (src + 16));

        convert8(&lo, &hi);

        _mm512_storeu_si512((__m512i*) (dst + 0), lo);
        _mm512_storeu_si512((__m512i*) (dst + 16), hi);

        src += 32;
        dst += 32;
        count -= 32;
    }

    if (count >= 16) {
        __m512i lo = _mm512_loadu_si512((const __m512i*) src),
                hi = _mm512_setzero_si512();

        convert8(&lo, &hi);

        _mm512_storeu_si512((__m512i*) dst, lo);

        src += 16;
        dst += 16;
        count -= 16;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kRGB1, dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kBGR1, dst, src, count);
}

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2

// Scale a byte by another.
// Inputs are stored in 16-bit lanes, but are no larger than 8 bits.
static __m256i scale(__m256i x, __m256i y) {
    const __m256i _128 = _mm256_set1_epi16(128);
    const __m256i _257 = _mm256_set1_epi16(257);

    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
    return _mm256_mulhi_epu16(_mm256_add_epi16(_mm256_mullo_epi16(x, y), _128), _257);
}

static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {

    auto premul8 = [=](__m256i* lo, __m256i* hi) {
        const __m256i zeros = _mm256_setzero_si256();
        __m256i planar;
        if (kSwapRB) {
            planar = _mm256_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                                      2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm256_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                                      0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm256_shuffle_epi8(*lo, planar);        // rrrrgggg bbbbaaaa rrrrgggg bbbbaaaa
        *hi = _mm256_shuffle_epi8(*hi, planar);        // RRRRGGGG BBBBAAAA RRRRGGGG BBBBAAAA
        __m256i rg = _mm256_unpacklo_epi32(*lo, *hi),  // rrrrRRRR ggggGGGG rrrrRRRR ggggGGGG
                ba = _mm256_unpackhi_epi32(*lo, *hi);  // bbbbBBBB aaaaAAAA bbbbBBBB aaaaAAAA

        // Unpack to 16-bit planar.
        __m256i r = _mm256_unpacklo_epi8(rg, zeros),   // r_r_r_r_ R_R_R_R_ r_r_r_r_ R_R_R_R_
                g = _mm256_unpackhi_epi8(rg, zeros),   // g_g_g_g_ G_G_G_G_ g_g_g_g_ G_G_G_G_
                b = _mm256_unpacklo_epi8(ba, zeros),   // b_b_b_b_ B_B_B_B_ b_b_b_b_ B_B_B_B_
                a = _mm256_unpackhi_epi8(ba, zeros);   // a_a_a_a_ A_A_A_A_ a_a_a_a_ A_A_A_A_

        // Premultiply!
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg  = _mm256_or_si256(r, _mm256_slli_epi16(g, 8));  // rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG
        ba  = _mm256_or_si256(b, _mm256_slli_epi16(a, 8));  // babababa BABABABA babababa BABABABA
        *lo = _mm256_unpacklo_epi16(rg, ba);                // rgbargba rgbargba rgbargba rgbargba
        *hi = _mm256_unpackhi_epi16(rg, ba);                // RGBARGBA RGBARGBA RGBARGBA RGBARGBA
    };

    while (count >= 16) {
        __m256i lo = _mm256_loadu_si256((const __m256i*) (src + 0)),
                hi = _mm256_loadu_si256((const __m256i*) (src + 8));

        premul8(&lo, &hi);

        _mm256_storeu_si256((__m256i*) (dst + 0), lo);
        _mm256_storeu_si256((__m256i*) (dst + 8), hi);

        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        __m256i lo = _mm256_loadu_si256((const __m256i*) src),
                hi = _mm256_setzero_si256();

        premul8(&lo, &hi);

        _mm256_storeu_si256((__m256i*) dst, lo);

        src += 8;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(false, dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(true, dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    const __m256i swapRB = _mm256_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
                                            2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);

    while (count >= 8) {
        __m256i rgba = _mm256_loadu_si256((const __m256i*) src);
        __m256i bgra = _mm256_shuffle_epi8(rgba, swapRB);
        _mm256_storeu_si256((__m256i*) dst, bgra);

        src += 8;
        dst += 8;
        count -= 8;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        __m256i ga = _mm256_loadu_si256((const __m256i*) src);

        __m256i gg = _mm256_or_si256(_mm256_and_si256(ga, _mm256_set1_epi16(0x00FF)),
                                     _mm256_slli_epi16(ga, 8));

        __m256i ggga_lo = _mm256_unpacklo_epi16(gg, ga);
        __m256i ggga_hi = _mm256_unpackhi_epi16(gg, ga);

        // Shuffle for pixel reorder.
        // Note: 'p' stands for a 'ggga' pixel.
        // Before the shuffle:
        //   ggga_lo = p0 p1 p2 p3 | p8  p9  p10 p11
        //   ggga_hi = p4 p5 p6 p7 | p12 p13 p14 p15
        //
        // After the shuffle:
        //   ggga_lo_shuffle = p0 p1 p2  p3  | p4  p5  p6  p7
        //   ggga_hi_shuffle = p8 p9 p10 p11 | p12 p13 p14 p15
        __m256i ggga_lo_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x20),
                ggga_hi_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x31);

        _mm256_storeu_si256((__m256i*) (dst + 0), ggga_lo_shuffle);
        _mm256_storeu_si256((__m256i*) (dst + 8), ggga_hi_shuffle);

        src += 16*2;
        dst += 16;
        count -= 16;
    }

    grayA_to_RGBA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        __m256i grayA = _mm256_loadu_si256((const __m256i*) src);

        __m256i g0 = _mm256_and_si256(grayA, _mm256_set1_epi16(0x00FF));
        __m256i a0 = _mm256_srli_epi16(grayA, 8);

        // Premultiply
        g0 = scale(g0, a0);

        __m256i gg = _mm256_or_si256(g0, _mm256_slli_epi16(g0, 8));
        __m256i ga = _mm256_or_si256(g0, _mm256_slli_epi16(a0, 8));

        __m256i ggga_lo = _mm256_unpacklo_epi16(gg, ga);
        __m256i ggga_hi = _mm256_unpackhi_epi16(gg, ga);

        // Shuffle for pixel reorder, as in grayA_to_RGBA.
        __m256i ggga_lo_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x20),
                ggga_hi_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x31);

        _mm256_storeu_si256((__m256i*) (dst + 0), ggga_lo_shuffle);
        _mm256_storeu_si256((__m256i*) (dst + 8), ggga_hi_shuffle);

        src += 16*2;
        dst += 16;
        count -= 16;
    }

    grayA_to_rgbA_portable(dst, src, count);
}

enum Format { kRGB1, kBGR1 };
static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
    auto convert8 = [=](__m256i* lo, __m256i* hi) {
        const __m256i zeros = _mm256_setzero_si256();
        __m256i planar;
        if (kBGR1 == format) {
            planar = _mm256_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                                      2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm256_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                                      0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm256_shuffle_epi8(*lo, planar);        // ccccmmmm yyyykkkk ccccmmmm yyyykkkk
        *hi = _mm256_shuffle_epi8(*hi, planar);        // CCCCMMMM YYYYKKKK CCCCMMMM YYYYKKKK
        __m256i cm = _mm256_unpacklo_epi32(*lo, *hi),  // ccccCCCC mmmmMMMM ccccCCCC mmmmMMMM
                yk = _mm256_unpackhi_epi32(*lo, *hi);  // yyyyYYYY kkkkKKKK yyyyYYYY kkkkKKKK

        // Unpack to 16-bit planar.
        __m256i c = _mm256_unpacklo_epi8(cm, zeros),   // c_c_c_c_ C_C_C_C_ c_c_c_c_ C_C_C_C_
                m = _mm256_unpackhi_epi8(cm, zeros),   // m_m_m_m_ M_M_M_M_ m_m_m_m_ M_M_M_M_
                y = _mm256_unpacklo_epi8(yk, zeros),   // y_y_y_y_ Y_Y_Y_Y_ y_y_y_y_ Y_Y_Y_Y_
                k = _mm256_unpackhi_epi8(yk, zeros);   // k_k_k_k_ K_K_K_K_ k_k_k_k_ K_K_K_K_

        // Scale to r, g, b.
        __m256i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interlaced pixels:
        //   rg = rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG
        //   ba = b1b1b1b1 B1B1B1B1 b1b1b1b1 B1B1B1B1
        __m256i rg = _mm256_or_si256(r, _mm256_slli_epi16(g, 8)),
                ba = _mm256_or_si256(b, _mm256_set1_epi16((uint16_t) 0xFF00));
        *lo = _mm256_unpacklo_epi16(rg, ba);           // rgb1rgb1 rgb1rgb1 rgb1rgb1 rgb1rgb1
        *hi = _mm256_unpackhi_epi16(rg, ba);           // RGB1RGB1 RGB1RGB1 RGB1RGB1 RGB1RGB1
    };

    while (count >= 16) {
        __m256i lo = _mm256_loadu_si256((const __m256i*) (src + 0)),
                hi = _mm256_loadu_si256((const __m256i*) (src + 8));

        convert8(&lo, &hi);

        _mm256_storeu_si256((__m256i*) (dst + 0), lo);
        _mm256_storeu_si256((__m256i*) (dst + 8), hi);

        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        __m256i lo = _mm256_loadu_si256((const __m256i*) src),
                hi = _mm256_setzero_si256();

        convert8(&lo, &hi);

        _mm256_storeu_si256((__m256i*) dst, lo);

        src += 8;
        dst += 8;
        count -= 8;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kRGB1, dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kBGR1, dst, src, count);
}

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

// Scale a byte by another.
// Inputs are stored in 16-bit lanes, but are no larger than 8 bits.
static __m128i scale(__m128i x, __m128i y) {
    const __m128i _128 = _mm_set1_epi16(128);
    const __m128i _257 = _mm_set1_epi16(257);

    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
    return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
}

static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {

    auto premul8 = [=](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kSwapRB) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);        // rrrrgggg bbbbaaaa
        *hi = _mm_shuffle_epi8(*hi, planar);        // RRRRGGGG BBBBAAAA
        __m128i rg = _mm_unpacklo_epi32(*lo, *hi),  // rrrrRRRR ggggGGGG
                ba = _mm_unpackhi_epi32(*lo, *hi);  // bbbbBBBB aaaaAAAA

        // Unpack to 16-bit planar.
        __m128i r = _mm_unpacklo_epi8(rg, zeros),   // r_r_r_r_ R_R_R_R_
                g = _mm_unpackhi_epi8(rg, zeros),   // g_g_g_g_ G_G_G_G_
                b = _mm_unpacklo_epi8(ba, zeros),   // b_b_b_b_ B_B_B_B_
                a = _mm_unpackhi_epi8(ba, zeros);   // a_a_a_a_ A_A_A_A_

        // Premultiply!
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg  = _mm_or_si128(r, _mm_slli_epi16(g, 8));  // rgrgrgrg RGRGRGRG
        ba  = _mm_or_si128(b, _mm_slli_epi16(a, 8));  // babababa BABABABA
        *lo = _mm_unpacklo_epi16(rg, ba);             // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);             // RGBARGBA RGBARGBA
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(false, dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(true, dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);

    while (count >= 4) {
        __m128i rgba = _mm_loadu_si128((const __m128i*) src);
        __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
        _mm_storeu_si128((__m128i*) dst, bgra);

        src += 4;
        dst += 4;
        count -= 4;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 8) {
        __m128i ga = _mm_loadu_si128((const __m128i*) src);

        __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
                                  _mm_slli_epi16(ga, 8));

        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_RGBA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 8) {
        __m128i grayA = _mm_loadu_si128((const __m128i*) src);

        __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
        __m128i a0 = _mm_srli_epi16(grayA, 8);

        // Premultiply
        g0 = scale(g0, a0);

        __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
        __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));

        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_rgbA_portable(dst, src, count);
}

enum Format { kRGB1, kBGR1 };
static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
    auto convert8 = [=](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kBGR1 == format) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);        // ccccmmmm yyyykkkk
        *hi = _mm_shuffle_epi8(*hi, planar);        // CCCCMMMM YYYYKKKK
        __m128i cm = _mm_unpacklo_epi32(*lo, *hi),  // ccccCCCC mmmmMMMM
                yk = _mm_unpackhi_epi32(*lo, *hi);  // yyyyYYYY kkkkKKKK

        // Unpack to 16-bit planar.
        __m128i c = _mm_unpacklo_epi8(cm, zeros),   // c_c_c_c_ C_C_C_C_
                m = _mm_unpackhi_epi8(cm, zeros),   // m_m_m_m_ M_M_M_M_
                y = _mm_unpacklo_epi8(yk, zeros),   // y_y_y_y_ Y_Y_Y_Y_
                k = _mm_unpackhi_epi8(yk, zeros);   // k_k_k_k_ K_K_K_K_

        // Scale to r, g, b.
        __m128i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interlaced pixels.
        __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),              // rgrgrgrg RGRGRGRG
                ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00)); // b1b1b1b1 B1B1B1B1
        *lo = _mm_unpacklo_epi16(rg, ba);                                // rgb1rgb1 rgb1rgb1
        *hi = _mm_unpackhi_epi16(rg, ba);                                // RGB1RGB1 RGB1RGB1
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kRGB1, dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kBGR1, dst, src, count);
}

#else

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_rgbA_portable(dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_bgrA_portable(dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_BGRA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    grayA_to_RGBA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    grayA_to_rgbA_portable(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_CMYK_to_RGB1_portable(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_CMYK_to_BGR1_portable(dst, src, count);
}

#endif

// Basically as above, but we found no benefit from AVX-512 for gray_to_RGB1.
static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        dst[i] = (uint32_t)0xFF   << 24
               | (uint32_t)src[i] << 16
               | (uint32_t)src[i] <<  8
               | (uint32_t)src[i] <<  0;
    }
}
#if defined(SK_ARM_HAS_NEON)
/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16_t gray = vld1q_u8(src);

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }
    if (count >= 8) {
        // Load 8 pixels.
        uint8x8_t gray = vld1_u8(src);

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }
    gray_to_RGB1_portable(dst, src, count);
}
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    const __m256i alphas = _mm256_set1_epi8((uint8_t) 0xFF);
    while (count >= 32) {
        __m256i grays = _mm256_loadu_si256((const __m256i*) src);

        __m256i gg_lo = _mm256_unpacklo_epi8(grays, grays);
        __m256i gg_hi = _mm256_unpackhi_epi8(grays, grays);
        __m256i ga_lo = _mm256_unpacklo_epi8(grays, alphas);
        __m256i ga_hi = _mm256_unpackhi_epi8(grays, alphas);

        __m256i ggga0 = _mm256_unpacklo_epi16(gg_lo, ga_lo);
        __m256i ggga1 = _mm256_unpackhi_epi16(gg_lo, ga_lo);
        __m256i ggga2 = _mm256_unpacklo_epi16(gg_hi, ga_hi);
        __m256i ggga3 = _mm256_unpackhi_epi16(gg_hi, ga_hi);

        // Shuffle for pixel reorder.
        // Note: 'p' stands for a 'ggga' pixel.
        // Before the shuffle:
        //   ggga0 = p0  p1  p2  p3  | p16 p17 p18 p19
        //   ggga1 = p4  p5  p6  p7  | p20 p21 p22 p23
        //   ggga2 = p8  p9  p10 p11 | p24 p25 p26 p27
        //   ggga3 = p12 p13 p14 p15 | p28 p29 p30 p31
        //
        // After the shuffle:
        //   ggga0_shuffle = p0  p1  p2  p3  | p4  p5  p6  p7
        //   ggga1_shuffle = p8  p9  p10 p11 | p12 p13 p14 p15
        //   ggga2_shuffle = p16 p17 p18 p19 | p20 p21 p22 p23
        //   ggga3_shuffle = p24 p25 p26 p27 | p28 p29 p30 p31
        __m256i ggga0_shuffle = _mm256_permute2x128_si256(ggga0, ggga1, 0x20),
                ggga1_shuffle = _mm256_permute2x128_si256(ggga2, ggga3, 0x20),
                ggga2_shuffle = _mm256_permute2x128_si256(ggga0, ggga1, 0x31),
                ggga3_shuffle = _mm256_permute2x128_si256(ggga2, ggga3, 0x31);

        _mm256_storeu_si256((__m256i*) (dst +  0), ggga0_shuffle);
        _mm256_storeu_si256((__m256i*) (dst +  8), ggga1_shuffle);
        _mm256_storeu_si256((__m256i*) (dst + 16), ggga2_shuffle);
        _mm256_storeu_si256((__m256i*) (dst + 24), ggga3_shuffle);

        src += 32;
        dst += 32;
        count -= 32;
    }
    gray_to_RGB1_portable(dst, src, count);
}
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3  // TODO: just check >= SSE2?
/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
    while (count >= 16) {
        __m128i grays = _mm_loadu_si128((const __m128i*) src);

        __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
        __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
        __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
        __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);

        __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
        __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
        __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
        __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);

        _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
        _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
        _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
        _mm_storeu_si128((__m128i*) (dst + 12), ggga3);

        src += 16;
        dst += 16;
        count -= 16;
    }
    gray_to_RGB1_portable(dst, src, count);
}
#else
/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    gray_to_RGB1_portable(dst, src, count);
}
#endif

// Again as above, this time not even finding benefit from AVX2 for RGB_to_{RGB,BGR}1.
static void RGB_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)b    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)r    <<  0;
    }
}
static void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)r    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)b    <<  0;
    }
}
#if defined(SK_ARM_HAS_NEON)
static void insert_alpha_should_swaprb(bool kSwapRB,
                                       uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x3_t rgb = vld3q_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x16x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*3;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x3_t rgb = vld3_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x8x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*3;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb(false, dst, src, count);
}
/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb(true, dst, src, count);
}
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
static void insert_alpha_should_swaprb(bool kSwapRB,
                                       uint32_t dst[], const uint8_t* src, int count) {
    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
    __m128i expand;
    const uint8_t X = 0xFF; // Used as a placeholder.  The value of X is irrelevant.
    if (kSwapRB) {
        expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
    } else {
        expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
    }

    // Requiring count >= 6 (not 4) guarantees that the 16-byte load below
    // never reads past the end of the 3*count-byte source buffer.
    while (count >= 6) {
        // Load a vector.  While this actually contains 5 pixels plus an
        // extra component, we will discard all but the first four pixels on
        // this iteration.
        __m128i rgb = _mm_loadu_si128((const __m128i*) src);

        // Expand the first four pixels to RGBX and then mask to RGB(FF).
        __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);

        // Store 4 pixels.
        _mm_storeu_si128((__m128i*) dst, rgba);

        src += 4*3;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,6) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb(false, dst, src, count);
}
/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb(true, dst, src, count);
}
#else
/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    RGB_to_RGB1_portable(dst, src, count);
}
/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    RGB_to_BGR1_portable(dst, src, count);
}
#endif

}  // namespace SK_OPTS_NS

#endif // SkSwizzler_opts_DEFINED